In [2]:
import numpy as np
import pandas as pd 
import os
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split

df = pd.read_csv("Fbdata.csv")
df

Unnamed: 0,ad_id,reporting_start,reporting_end,campaign_id,fb_campaign_id,age,gender,interest1,interest2,interest3,impressions,clicks,spent,total_conversion,approved_conversion
0,708746,17/08/2017,17/08/2017,916,103916,30-34,M,15,17,17,7350.000000,1,1.43,2.0,1.0
1,708749,17/08/2017,17/08/2017,916,103917,30-34,M,16,19,21,17861.000000,2,1.82,2.0,0.0
2,708771,17/08/2017,17/08/2017,916,103920,30-34,M,20,25,22,693.000000,0,0.00,1.0,0.0
3,708815,30/08/2017,30/08/2017,916,103928,30-34,M,28,32,32,4259.000000,1,1.25,1.0,0.0
4,708818,17/08/2017,17/08/2017,916,103928,30-34,M,28,33,32,4133.000000,1,1.29,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1138,1314410,19/08/2017,19/08/2017,45-49,F,109,111,114,1129773,252,358.189997,13,2.00,,
1139,1314411,19/08/2017,19/08/2017,45-49,F,110,111,116,637549,120,173.880003,3,0.00,,
1140,1314412,19/08/2017,19/08/2017,45-49,F,111,113,117,151531,28,40.289999,2,0.00,,
1141,1314414,17/08/2017,17/08/2017,45-49,F,113,114,117,790253,135,198.710001,8,2.00,,


In [4]:
df.dtypes

ad_id                    int64
reporting_start         object
reporting_end           object
campaign_id             object
fb_campaign_id          object
age                     object
gender                  object
interest1                int64
interest2                int64
interest3                int64
impressions            float64
clicks                   int64
spent                  float64
total_conversion       float64
approved_conversion    float64
dtype: object

In [6]:
missing_values = df.isnull().sum()
missing_values

ad_id                    0
reporting_start          0
reporting_end            0
campaign_id              0
fb_campaign_id           0
age                      0
gender                   0
interest1                0
interest2                0
interest3                0
impressions              0
clicks                   0
spent                    0
total_conversion       382
approved_conversion    382
dtype: int64

In [8]:
df['total_conversion'] = df['total_conversion'].fillna(df['total_conversion'].mean())
df['approved_conversion'] = df['approved_conversion'].fillna(df['approved_conversion'].mean())

In [10]:
# Convert date columns to datetime 
df['reporting_start'] = pd.to_datetime(df['reporting_start'], format='%d/%m/%Y')
df['reporting_end'] = pd.to_datetime(df['reporting_end'], format='%d/%m/%Y')

In [12]:
# Calculate Campaign Duration
df['campaign_duration'] = (df['reporting_end'] - df['reporting_start']).dt.days
df['campaign_duration'].head()

0    0
1    0
2    0
3    0
4    0
Name: campaign_duration, dtype: int64

In [14]:
# Calculate Engagement Rate, avoiding division by zero
df['engagement_rate'] = df['clicks'] / df['impressions'].replace(0, 1e-10)
df['engagement_rate'].head()

0    0.000136
1    0.000112
2    0.000000
3    0.000235
4    0.000242
Name: engagement_rate, dtype: float64

In [16]:
duplicates = df.duplicated().sum()
duplicates

0

In [18]:
# Remove duplicate rows
data_unique = df.drop_duplicates()
data_unique.head()

Unnamed: 0,ad_id,reporting_start,reporting_end,campaign_id,fb_campaign_id,age,gender,interest1,interest2,interest3,impressions,clicks,spent,total_conversion,approved_conversion,campaign_duration,engagement_rate
0,708746,2017-08-17,2017-08-17,916,103916,30-34,M,15,17,17,7350.0,1,1.43,2.0,1.0,0,0.000136
1,708749,2017-08-17,2017-08-17,916,103917,30-34,M,16,19,21,17861.0,2,1.82,2.0,0.0,0,0.000112
2,708771,2017-08-17,2017-08-17,916,103920,30-34,M,20,25,22,693.0,0,0.0,1.0,0.0,0,0.0
3,708815,2017-08-30,2017-08-30,916,103928,30-34,M,28,32,32,4259.0,1,1.25,1.0,0.0,0,0.000235
4,708818,2017-08-17,2017-08-17,916,103928,30-34,M,28,33,32,4133.0,1,1.29,1.0,1.0,0,0.000242


In [20]:
# Check for NaN or infinite values
print("Checking for NaN values:")
print(df.isna().sum())

print("Checking for infinite values:")
print((df == float('inf')).sum())

Checking for NaN values:
ad_id                  0
reporting_start        0
reporting_end          0
campaign_id            0
fb_campaign_id         0
age                    0
gender                 0
interest1              0
interest2              0
interest3              0
impressions            0
clicks                 0
spent                  0
total_conversion       0
approved_conversion    0
campaign_duration      0
engagement_rate        0
dtype: int64
Checking for infinite values:
ad_id                  0
reporting_start        0
reporting_end          0
campaign_id            0
fb_campaign_id         0
age                    0
gender                 0
interest1              0
interest2              0
interest3              0
impressions            0
clicks                 0
spent                  0
total_conversion       0
approved_conversion    0
campaign_duration      0
engagement_rate        0
dtype: int64


In [22]:
df.drop(columns=['campaign_id', 'fb_campaign_id','reporting_start','reporting_end'], inplace=True)


In [24]:
# Encode categorical variables
label_encoder = LabelEncoder()
df['gender'] = label_encoder.fit_transform(df['gender'])
df['age'] = label_encoder.fit_transform(df['age'])

In [26]:
df.dtypes

ad_id                    int64
age                      int32
gender                   int32
interest1                int64
interest2                int64
interest3                int64
impressions            float64
clicks                   int64
spent                  float64
total_conversion       float64
approved_conversion    float64
campaign_duration        int64
engagement_rate        float64
dtype: object