In [32]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Load the dataset
data = pd.read_csv('Social_Media_Advertising.csv',nrows=8000)

# Handle missing values
missing_values = data.isnull().sum()
print("Missing values:\n", missing_values)

# Convert 'Acquisition_Cost' column to float
data['Acquisition_Cost'] = data['Acquisition_Cost'].replace('[\\$,]', '', regex=True).astype(float)

# Drop irrelevant columns
data.drop(columns=['Date', 'Company'], inplace=True)

# Convert categorical variables into numerical format using one-hot encoding
data_encoded = pd.get_dummies(data, columns=['Target_Audience', 'Campaign_Goal', 'Duration', 'Location', 'Language', 'Customer_Segment'])

# Define features (X) and target variable (y)
X = data_encoded.drop(columns=['Channel_Used'])  # Features
y = data_encoded['Channel_Used']  # Target variable

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Ensure that both datasets have the same columns after preprocessing
X_test = X_test[X_train.columns]

# Display the shapes of the training and testing sets
print("Training set shape:", X_train.shape, y_train.shape)
print("Testing set shape:", X_test.shape, y_test.shape)


Missing values:
 Campaign_ID         0
Target_Audience     0
Campaign_Goal       0
Duration            0
Channel_Used        0
Conversion_Rate     0
Acquisition_Cost    0
ROI                 0
Location            0
Language            0
Clicks              0
Impressions         0
Engagement_Score    0
Customer_Segment    0
Date                0
Company             0
dtype: int64
Training set shape: (6400, 34) (6400,)
Testing set shape: (1600, 34) (1600,)


In [33]:
# Train a Random Forest classifier
clf = RandomForestClassifier()
clf.fit(X_train, y_train)

# Evaluate the classifier
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Model Accuracy:", accuracy)


Model Accuracy: 0.515625


In [34]:

# Predict the best platform for a new product
def predict_best_platform(product_attributes):
    # Preprocess the new product attributes
    product_attributes_encoded = pd.get_dummies(pd.DataFrame(product_attributes, index=[0]))
    # Get the relevant features used during training
    product_attributes_relevant = product_attributes_encoded[X_train.columns]
    # Predict the best platform using the trained classifier
    predicted_platform = clf.predict(product_attributes_relevant)
    return predicted_platform[0]


In [29]:
# Example usage:
new_product_attributes = {
    'Target_Audience_Men 35-44': 1,
    'Campaign_Goal_Product Launch': 1,
    'Duration_15 Days': 1,
    'Location_Las Vegas': 1,
    'Language_Spanish': 1,
    'Customer_Segment_Health': 1,
    # Include only relevant features here
}


In [35]:
recommended_platform = predict_best_platform(new_product_attributes)
print("Recommended Platform:", recommended_platform)

KeyError: "['Campaign_ID', 'Conversion_Rate', 'Acquisition_Cost', 'ROI', 'Clicks', 'Impressions', 'Engagement_Score', 'Target_Audience_All Ages', 'Target_Audience_Men 18-24', 'Target_Audience_Men 25-34', 'Target_Audience_Men 45-60', 'Target_Audience_Women 18-24', 'Target_Audience_Women 25-34', 'Target_Audience_Women 35-44', 'Target_Audience_Women 45-60', 'Campaign_Goal_Brand Awareness', 'Campaign_Goal_Increase Sales', 'Campaign_Goal_Market Expansion', 'Location_Austin', 'Location_Los Angeles', 'Location_Miami', 'Location_New York', 'Language_English', 'Language_French', 'Customer_Segment_Fashion', 'Customer_Segment_Food', 'Customer_Segment_Home', 'Customer_Segment_Technology'] not in index"