In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the raw ad-conversion records from the Kaggle input directory.
csv_path = '/kaggle/input/clicks-conversion-tracking/KAG_conversion_data.csv'
df = pd.read_csv(csv_path)

In [None]:
# Preview the first rows to check that the CSV parsed as expected.
df.head()

In [None]:
# Column dtypes and non-null counts (original note: no null values present in the dataset).
df.info()

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
df.describe()

In [None]:
# Pairwise Pearson correlations between the numeric columns.
# numeric_only=True is needed on pandas >= 2.0, where the default became False
# and the string columns ('age', 'gender') would otherwise raise a ValueError.
df.corr(numeric_only=True)

In [None]:
# Number of rows for each gender code.
df['gender'].value_counts()

In [None]:
plt.figure(figsize = (12,8))

# Pin the bar order explicitly and derive the tick labels from that same order,
# so a label can never end up under the wrong bar if the row order changes.
# 'M' is the code tested elsewhere in this notebook; 'F' is assumed to be the
# other code - any unknown code falls back to its raw value.
gender_names = {'M': 'Male', 'F': 'Female'}
gender_order = list(df['gender'].dropna().unique())

gender_count = sns.countplot(data = df, x = 'gender', order = gender_order)
gender_count.set_title("Gender wise count", fontsize = 25, weight = 1000)
gender_count.set_xlabel("Gender")
gender_count.set_ylabel("Counts")
gender_count.set_xticklabels([gender_names.get(code, code) for code in gender_order]);


In [None]:
# Number of rows for each interest code.
df['interest'].value_counts()

In [None]:
# Bar chart of how many rows fall under each interest code.
plt.figure(figsize = (16,6))
interest_count = sns.countplot(data = df, x = 'interest', palette='cividis')
interest_count.set_title("Interest counts", fontsize = 25, weight = 1000)
interest_count.set_ylabel('Counts')
# Trailing semicolon keeps the Text(...) repr out of the cell output,
# matching the other plotting cells in this notebook.
interest_count.set_xlabel("Interest", fontsize = 15, weight = 1000);


In [None]:
# Per-age-group totals of the numeric metrics.
# numeric_only=True stops pandas >= 2.0 from "summing" (concatenating) the
# string columns such as 'gender' into a meaningless column.
group_by_age = (
    df.groupby('age')
    .sum(numeric_only=True)
    # Sums of identifier / category codes are meaningless, so drop them.
    .drop(['ad_id', 'xyz_campaign_id', 'fb_campaign_id', 'interest'], axis=1)
)

In [None]:
# Turn the 'age' index into a regular column for plotting.
# The guard makes the cell safe to re-run: a second in-place reset_index would
# add a spurious 'index' column.
if group_by_age.index.name == 'age':
    group_by_age = group_by_age.reset_index()
group_by_age

In [None]:
# Bar chart: total clicks per age group.
plt.figure(figsize=(10, 4), dpi=300)
axis_label_font = {'fontsize': 10, 'weight': 1000}
age_clicks = sns.barplot(x='age', y='Clicks', data=group_by_age, palette='magma')
age_clicks.set_title("Age wise Clicks Distribution", fontsize=15, weight=1000)
age_clicks.set_xlabel("Age Groups", **axis_label_font)
age_clicks.set_ylabel("No. of Clicks", **axis_label_font);

In [None]:
# Bar chart: summed Total_Conversion per age group.
plt.figure(figsize=(10, 4), dpi=300)
axis_label_font = {'fontsize': 10, 'weight': 1000}
age_clicks = sns.barplot(x='age', y='Total_Conversion', data=group_by_age, palette='Spectral_r')
age_clicks.set_title("Age wise Total Conversion", fontsize=15, weight=1000)
age_clicks.set_xlabel("Age Groups", **axis_label_font)
age_clicks.set_ylabel("Total Conversion", **axis_label_font);

In [None]:
# Bar chart: summed Approved_Conversion per age group.
plt.figure(figsize=(10, 4), dpi=300)
axis_label_font = {'fontsize': 10, 'weight': 1000}
age_clicks = sns.barplot(x='age', y='Approved_Conversion', data=group_by_age, palette='Spectral')
age_clicks.set_title("Age wise Approved Conversion", fontsize=15, weight=1000)
age_clicks.set_xlabel("Age Groups", **axis_label_font)
age_clicks.set_ylabel("Approved Conversion", **axis_label_font);

In [None]:
# Bar chart: summed Spent per age group.
plt.figure(figsize=(10, 4), dpi=300)
axis_label_font = {'fontsize': 10, 'weight': 1000}
age_clicks = sns.barplot(x='age', y='Spent', data=group_by_age, palette='ocean')
age_clicks.set_title("Age wise Amount Spent", fontsize=15, weight=1000)
age_clicks.set_xlabel("Age Groups", **axis_label_font)
age_clicks.set_ylabel("Amount Spent", **axis_label_font);

In [None]:
# Display the full frame again before the per-campaign aggregation.
df

In [None]:
# Per-campaign totals of the numeric metrics.
# numeric_only=True stops pandas >= 2.0 from concatenating the string columns
# ('age', 'gender') into meaningless "sums".
group_by_xyz_campaign_id = (
    df.groupby('xyz_campaign_id')
    .sum(numeric_only=True)
    # Sums of identifier / category codes are meaningless, so drop them.
    .drop(['ad_id', 'fb_campaign_id', 'interest'], axis=1)
)

In [None]:
# Turn the campaign id index into a column; the guard keeps the cell idempotent
# (a repeated in-place reset_index would add a spurious 'index' column).
if group_by_xyz_campaign_id.index.name == 'xyz_campaign_id':
    group_by_xyz_campaign_id = group_by_xyz_campaign_id.reset_index()

# Build the readable alias from the id itself rather than from row position,
# so a missing or extra campaign cannot shift labels onto the wrong row or
# raise a length-mismatch error. Unknown ids get '?' as their letter.
campaign_letters = {916: 'A', 936: 'B', 1178: 'C'}
group_by_xyz_campaign_id['Campaign_alias'] = [
    'Campaign {} ({})'.format(campaign_letters.get(campaign_id, '?'), campaign_id)
    for campaign_id in group_by_xyz_campaign_id['xyz_campaign_id']
]
group_by_xyz_campaign_id

In [None]:
# Bars: clicks per campaign (left y-axis). Dashed line: amount spent (right y-axis).
fig, ax1 = plt.subplots(figsize=(12, 6))
sns.barplot(x='Campaign_alias', y='Clicks', data=group_by_xyz_campaign_id, ax=ax1)

# The line gets its own y-scale on a twin axis sharing the same x-axis.
ax2 = ax1.twinx()
sns.lineplot(x='Campaign_alias', y='Spent', data=group_by_xyz_campaign_id,
             color='black', lw=5, ls='--', ax=ax2);

In [None]:
# Bars: approved conversions per campaign (left y-axis).
# Dashed line: total conversions (right y-axis).
fig, ax1 = plt.subplots(figsize=(12, 6))
sns.barplot(x='Campaign_alias', y='Approved_Conversion', data=group_by_xyz_campaign_id, ax=ax1)

# The line gets its own y-scale on a twin axis sharing the same x-axis.
ax2 = ax1.twinx()
sns.lineplot(x='Campaign_alias', y='Total_Conversion', data=group_by_xyz_campaign_id,
             color='black', lw=5, ls='--', ax=ax2);

In [None]:
# Bars: amount spent per campaign (left y-axis).
# Dashed line: approved conversions (right y-axis).
fig, ax1 = plt.subplots(figsize=(12, 6))
sns.barplot(x='Campaign_alias', y='Spent', data=group_by_xyz_campaign_id, ax=ax1)

# The line gets its own y-scale on a twin axis sharing the same x-axis.
ax2 = ax1.twinx()
sns.lineplot(x='Campaign_alias', y='Approved_Conversion', data=group_by_xyz_campaign_id,
             color='black', lw=5, ls='--', ax=ax2);

In [None]:
# Check the dtypes and non-null counts of the per-campaign aggregate.
group_by_xyz_campaign_id.info()

In [None]:
# 2x2 grid of line charts, one per aggregated metric, across the campaigns.
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(16, 8))

# (target axes, metric column, line colour, panel title, y-axis label)
trend_panels = [
    (axes[0][0], 'Clicks', 'Red', 'Campaign wise Clicks trend', 'No. of Clicks'),
    (axes[0][1], 'Spent', 'Green', 'Campaign wise Spent trend', 'Amount Spent'),
    (axes[1][0], 'Total_Conversion', 'Black', 'Campaign wise Total Conversion trend', "Total Conversions"),
    (axes[1][1], 'Approved_Conversion', 'Purple', "Campaign wise Approved Conversions trend", 'Approved Conversions'),
]

for panel, metric, line_color, panel_title, y_label in trend_panels:
    panel.plot(group_by_xyz_campaign_id['Campaign_alias'], group_by_xyz_campaign_id[metric],
               color=line_color, lw=5)
    panel.set_title(panel_title, fontweight=1000)
    panel.set_ylabel(y_label)

# Extra spacing so titles and y-labels of neighbouring panels do not collide.
fig.subplots_adjust(wspace=0.25, hspace=0.35)


In [None]:
# Modeling
# Number of rows recorded for each campaign id.
df['xyz_campaign_id'].value_counts()

In [None]:
# Look at the columns once more before selecting model features.
df.head()

In [None]:
# Remove the ad / campaign identifier columns before building the feature matrix.
id_columns = ['ad_id', 'fb_campaign_id', 'xyz_campaign_id']
df = df.drop(columns=id_columns)

df.head()

In [None]:
# List the distinct age-group labels that need to be encoded.
df['age'].unique()

In [None]:
# Encode the age-group labels as ordinal integers.
# Any label not listed in the table (i.e. the remaining group) is coded as 4.
age_code_lookup = {'30-34': 1, '35-39': 2, '40-44': 3}

age_code_list = [age_code_lookup.get(age_label, 4) for age_label in df['age']]

df['age_code'] = age_code_list
        
        

In [None]:
# Inspect the frame with the new 'age_code' column.
df

In [None]:
# Encode gender as an integer: 'M' -> 1, anything else -> 2.
gender_code_list = [1 if gender_label == 'M' else 2 for gender_label in df['gender']]

df['gender_code'] = gender_code_list
    

In [None]:
# The text columns are now represented by 'age_code' / 'gender_code'; drop the originals.
encoded_source_columns = ['age', 'gender']
df = df.drop(columns=encoded_source_columns)

In [None]:
# Confirm that only the remaining modeling columns are left.
df.head()

In [None]:
# Target: Approved_Conversion. Features: every other remaining column.
target_column = 'Approved_Conversion'
y = df[target_column]
X = df.drop(columns=[target_column])

In [None]:
# Preview the feature matrix.
X.head()

In [None]:
# Preview the target series.
y.head()

In [None]:
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression

In [None]:
# Hold out 30% of the rows for testing; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=101,
)

In [None]:
# Baseline model: ordinary least squares with default settings.
linear_model = LinearRegression()

In [None]:
# Fit the baseline on the training split only.
linear_model.fit(X_train, y_train)

In [None]:
# Predict the target for the held-out rows.
test_predictions = linear_model.predict(X_test)

In [None]:
# Residual = actual - predicted; a positive value means the model under-predicted.
residuals = y_test - test_predictions

In [None]:
# Residuals plotted against the actual target values; the red line marks zero error.
sns.scatterplot(x = y_test, y = residuals)
plt.axhline(y=0, color = 'red')
# Original author's reading of the plot: no visible pattern in the residuals.

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [None]:
# Mean absolute error of the baseline on the held-out split.
MAE = mean_absolute_error(y_test, test_predictions)

In [None]:
# Root mean squared error: square root of the MSE, so it is in the target's units.
RMSE = np.sqrt(mean_squared_error(y_test, test_predictions))

In [None]:
# Coefficient of determination (R^2) of the baseline on the held-out split.
r2_score(y_test, test_predictions)

In [None]:
# Display the baseline MAE.
MAE

In [None]:
# Display the baseline RMSE.
RMSE

In [None]:
# LinearRegression.score returns R^2, computed here on the same held-out split.
linear_model.score(X_test, y_test)

In [None]:
#Polynomial Regression

In [None]:
from sklearn.preprocessing import PolynomialFeatures

In [None]:
# Compare train and test RMSE across polynomial degrees to find the best degree to use

In [None]:
# Fit one linear model per polynomial degree and record train / test RMSE.
# Index i of each list holds the error for degree i + 1.
train_rmse_errors = []
test_rmse_errors = []

for d in range(1, 10):
    # Expand the features to all polynomial terms up to degree d (no constant column).
    poly_converter = PolynomialFeatures(degree=d, include_bias=False)
    poly_features = poly_converter.fit_transform(X)

    # Loop-specific names are used on purpose: reusing X_train / X_test /
    # linear_model / test_predictions here would overwrite the baseline split
    # and fitted model, so re-running the earlier metric cells would silently
    # evaluate the degree-9 model instead.
    X_train_poly, X_test_poly, y_train_poly, y_test_poly = train_test_split(
        poly_features, y, test_size=0.3, random_state=101
    )

    poly_model = LinearRegression()
    poly_model.fit(X_train_poly, y_train_poly)

    poly_train_predictions = poly_model.predict(X_train_poly)
    poly_test_predictions = poly_model.predict(X_test_poly)

    train_rmse = np.sqrt(mean_squared_error(y_train_poly, poly_train_predictions))
    test_rmse = np.sqrt(mean_squared_error(y_test_poly, poly_test_predictions))

    train_rmse_errors.append(train_rmse)
    test_rmse_errors.append(test_rmse)
    
    

In [None]:
# Plot train vs. test RMSE for the first three polynomial degrees only.
plotted_degrees = range(1, 4)
rmse_curves = (
    ('Train RMSE', train_rmse_errors),
    ('Test RMSE', test_rmse_errors),
)
for curve_label, rmse_values in rmse_curves:
    plt.plot(plotted_degrees, rmse_values[:3], label=curve_label)

plt.xlabel("Degree")
plt.ylabel("RMSE")
plt.legend();

# Decision: keep degree = 1, which is plain linear regression.

In [None]:
#ELASTIC NET regularization

In [None]:
from sklearn.linear_model import ElasticNetCV

In [None]:
# Recreate the 70/30 split from the un-expanded features, using the same seed as the baseline.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=101,
)

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Standardise the features before fitting the regularised model.
scaler = StandardScaler()

In [None]:
# Learn the scaling statistics from the training split only.
scaler.fit(X_train)

In [None]:
# NOTE(review): this rebinds X_train to its scaled version, so re-running the
# cell without re-running the split would scale the data a second time.
X_train = scaler.transform(X_train)

In [None]:
# Apply the training-set scaling to the held-out rows.
# NOTE(review): this rebinds X_test, so re-running the cell would scale it twice.
X_test = scaler.transform(X_test)

In [None]:
# Elastic net whose L1/L2 mix and penalty strength are chosen by cross-validation.
candidate_l1_ratios = [0.1, 0.5, 0.7, 0.9, 0.95, 0.99, 1]
elastic_model = ElasticNetCV(
    l1_ratio=candidate_l1_ratios,
    eps=0.001,
    n_alphas=100,
    max_iter=1000000,
)

In [None]:
# Fit on the scaled training split; cross-validation happens inside fit.
elastic_model.fit(X_train, y_train)

In [None]:
# The l1_ratio chosen by cross-validation from the candidate list given to ElasticNetCV.
elastic_model.l1_ratio_

In [None]:
# Predict the target for the scaled held-out rows with the elastic-net model.
test_predictions = elastic_model.predict(X_test)

In [None]:
# Elastic-net error metrics on the held-out split.
# These rebind the MAE / RMSE names used for the linear baseline.
MAE = mean_absolute_error(y_test, test_predictions)
RMSE = np.sqrt(mean_squared_error(y_test, test_predictions))

In [None]:
# Display the elastic-net MAE.
MAE

In [None]:
# Display the elastic-net RMSE.
RMSE

In [None]:
# R^2 of the elastic-net model on the held-out split.
r2_score(y_test, test_predictions)

In [None]:
# Closing note, written as a bare string so the notebook shows it as the cell's output.
'''
Linear Regression gives a slightly better RMSE value than ElasticNetCV model
'''