# Introduction
This challenge is the capstone project of the Summer Analytics, a primer course on Data Science, conducted by Consulting and Analytics Club of IIT Guwahati in the summers.

The dataset is provided by DeltaX is the pioneering cross-channel digital advertising platform. The cloud-based platform leverages big data, user behavior, and machine learning algorithms to improve performance across the business funnel of advertisers.

# Data Description

date: the date on which the ad was made live

campaign: campaign number

adgroup: adgroup number

ad: ad number

impressions - Number of time the ad was shown

clicks - Number of time the ad clicked shown

cost - Amount spent to show ad

conversions - Number of transactions received

revenue: revenue generated from the ad

# importing modules

In [1]:
import pandas as pd
import matplotlib
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline
matplotlib.rcParams['figure.figsize'] = (12,8)

# Reading and exploring dataset

In [3]:
df = pd.read_csv("Train_Data.csv")
col_to_be_used = ['impressions','clicks','cost','conversions']
df.sample()

Unnamed: 0,date,campaign,adgroup,ad,impressions,clicks,cost,conversions,revenue
2751,16-12-2020,campaign 1,adgroup 1,ad 32,22,19,0.23,0,0.0


In [4]:
x= df.iloc[:,1:-1 ].values
y=df.iloc[:,8].values
print(x)

[['campaign 1' 'adgroup 1' 'ad 1' ... 6 0.08 0]
 ['campaign 1' 'adgroup 2' 'ad 1' ... 0 0.0 0]
 ['campaign 1' 'adgroup 3' 'ad 1' ... 4 0.04 0]
 ...
 ['campaign 1' 'adgroup 3' 'ad 56' ... 8 0.12 1]
 ['campaign 1' 'adgroup 1' 'ad 56' ... 13 0.23 0]
 ['campaign 1' 'adgroup 1' 'ad 55' ... 10 0.14 0]]


# Feature Transformation and Scaling

In [5]:
from sklearn.preprocessing import RobustScaler
scaler = RobustScaler()
df[col_to_be_used] = pd.DataFrame(scaler.fit_transform(df[col_to_be_used]))


In [6]:
df.head(10)

Unnamed: 0,date,campaign,adgroup,ad,impressions,clicks,cost,conversions,revenue
0,01-08-2020,campaign 1,adgroup 1,ad 1,0.102564,-0.027778,-0.099237,0.0,0.0
1,01-08-2020,campaign 1,adgroup 2,ad 1,-0.192308,-0.194444,-0.160305,0.0,0.0
2,01-08-2020,campaign 1,adgroup 3,ad 1,-0.038462,-0.083333,-0.129771,0.0,0.0
3,01-08-2020,campaign 1,adgroup 4,ad 1,-0.141026,-0.083333,-0.099237,0.0,0.0
4,01-08-2020,campaign 1,adgroup 1,ad 2,2.961538,3.305556,0.824427,4.0,925.71
5,01-08-2020,campaign 1,adgroup 2,ad 2,-0.115385,-0.138889,-0.129771,0.0,0.0
6,01-08-2020,campaign 1,adgroup 4,ad 2,-0.128205,-0.111111,-0.122137,0.0,0.0
7,01-08-2020,campaign 1,adgroup 1,ad 3,1.448718,1.25,0.282443,0.0,0.0
8,01-08-2020,campaign 1,adgroup 3,ad 3,1.128205,1.361111,0.267176,0.0,0.0
9,01-08-2020,campaign 1,adgroup 4,ad 3,-0.128205,-0.055556,-0.122137,0.0,0.0


In [None]:
# encoding data
from sklearn.preprocessing import LabelEncoder
# categorical_col = ['campaign','adgroup','ad']
lb_make = LabelEncoder()
df['campaign'] = lb_make.fit_transform(df['campaign'])

In [8]:
df['adgroup'] = lb_make.fit_transform(df['adgroup'])
# df['ad'] = lb_make.fit_transform(df['ad'])


In [41]:
y= df.iloc[:,-1]
df= df.iloc[:,1:-1]


In [10]:
type(df)

pandas.core.frame.DataFrame

In [11]:
categorical_col = ['campaign','adgroup']

new_data= pd.get_dummies(df,columns=categorical_col)

In [12]:
new_data= new_data.iloc[:,1:-1]
new_data

Unnamed: 0,impressions,clicks,cost,conversions,campaign_0,adgroup_0,adgroup_1,adgroup_2
0,0.102564,-0.027778,-0.099237,0.0,1,1,0,0
1,-0.192308,-0.194444,-0.160305,0.0,1,0,1,0
2,-0.038462,-0.083333,-0.129771,0.0,1,0,0,1
3,-0.141026,-0.083333,-0.099237,0.0,1,0,0,0
4,2.961538,3.305556,0.824427,4.0,1,1,0,0
...,...,...,...,...,...,...,...,...
4566,0.038462,-0.027778,-0.106870,0.0,1,0,0,1
4567,-0.192308,-0.194444,-0.160305,0.0,1,0,0,0
4568,0.000000,0.027778,-0.068702,1.0,1,0,0,1
4569,0.269231,0.166667,0.015267,0.0,1,1,0,0


In [13]:
X = new_data.iloc[:, :].values
y = y.values
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
X_train.shape

(3656, 8)

In [14]:
from sklearn.linear_model import LinearRegression
regressor_lr = LinearRegression()
regressor_lr.fit(X_train, y_train)
print(X_train.shape)

(3656, 8)


In [15]:
y_pred = regressor_lr.predict(X_test)


In [16]:
from sklearn import metrics

print('MAE:', metrics.mean_absolute_error(y_test, y_pred))
print('MSE:', metrics.mean_squared_error(y_test, y_pred))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

MAE: 97.152679639334
MSE: 74731.66870728649
RMSE: 273.370936105663


In [17]:
from sklearn.tree import DecisionTreeRegressor
regressor_dtree = DecisionTreeRegressor(random_state = 0)
regressor_dtree.fit(X_train, y_train)
print(X_train.shape)


(3656, 8)


In [18]:
dtree_pred = regressor_dtree.predict(X_test)

In [19]:

print('MAE:', metrics.mean_absolute_error(y_test, dtree_pred))
print('MSE:', metrics.mean_squared_error(y_test, dtree_pred))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, dtree_pred)))

MAE: 90.90495081967212
MSE: 167692.5325863388
RMSE: 409.50278703122257


In [23]:
from sklearn.ensemble import GradientBoostingRegressor

regressor_gboost = GradientBoostingRegressor(n_estimators=500)
regressor_gboost.fit(X_train,y_train)
print(X_train.shape)


(3656, 8)


In [24]:
ypred_gboostreg = regressor_gboost.predict(X_test)

In [25]:
print('MAE:', metrics.mean_absolute_error(y_test, ypred_gboostreg))
print('MSE:', metrics.mean_squared_error(y_test, ypred_gboostreg))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, ypred_gboostreg)))

MAE: 70.52854052110197
MSE: 92066.87439664488
RMSE: 303.4252369145403


In [26]:
df_new = pd.read_csv('Test_Data.csv')
df_new.head()

Unnamed: 0,date,campaign,adgroup,ad,cost,impressions,clicks,conversions
0,01-03-2021,campaign 1,adgroup 1,ad 1,0.58,121,49,1
1,01-03-2021,campaign 1,adgroup 3,ad 1,0.17,22,12,0
2,01-03-2021,campaign 1,adgroup 4,ad 1,0.05,5,3,0
3,01-03-2021,campaign 1,adgroup 2,ad 1,0.01,2,1,0
4,01-03-2021,campaign 1,adgroup 2,ad 2,0.01,3,1,0


In [27]:
new_data1 = df_new.iloc[:,1:]

new_col_to_be_used = ['cost','impressions','clicks','conversions']

from sklearn.preprocessing import RobustScaler
scaler2 = RobustScaler()
new_data1[new_col_to_be_used] = pd.DataFrame(scaler2.fit_transform(new_data1[new_col_to_be_used]))


In [28]:
from sklearn.preprocessing import LabelEncoder
# categorical_col = ['campaign','adgroup','ad']
lb_make2 = LabelEncoder()
new_data1['campaign'] = lb_make2.fit_transform(new_data1['campaign'])
new_data1['adgroup'] = lb_make2.fit_transform(new_data1['adgroup'])
new_data1['ad'] = lb_make2.fit_transform(new_data1['ad'])



In [29]:
new_data1.head()

Unnamed: 0,campaign,adgroup,ad,cost,impressions,clicks,conversions
0,0,0,0,1.010638,1.020202,1.032258,1.0
1,0,2,0,0.138298,0.020202,0.077419,0.0
2,0,3,0,-0.117021,-0.151515,-0.154839,0.0
3,0,1,0,-0.202128,-0.181818,-0.206452,0.0
4,0,1,2,-0.202128,-0.171717,-0.206452,0.0


In [30]:
categorical_col = ['campaign','adgroup']

new_data2= pd.get_dummies(new_data1,columns=categorical_col)
new_data2= new_data2.iloc[:,:-1]



In [31]:
new_data2=new_data2.iloc[:,1:]

In [32]:
new_data2

Unnamed: 0,cost,impressions,clicks,conversions,campaign_0,adgroup_0,adgroup_1,adgroup_2
0,1.010638,1.020202,1.032258,1.0,1,1,0,0
1,0.138298,0.020202,0.077419,0.0,1,0,0,1
2,-0.117021,-0.151515,-0.154839,0.0,1,0,0,0
3,-0.202128,-0.181818,-0.206452,0.0,1,0,1,0
4,-0.202128,-0.171717,-0.206452,0.0,1,0,1,0
...,...,...,...,...,...,...,...,...
313,-0.202128,-0.191919,-0.206452,0.0,1,0,0,0
314,-0.202128,-0.151515,-0.206452,0.0,1,0,0,1
315,0.074468,0.050505,0.051613,0.0,1,1,0,0
316,-0.095745,-0.111111,-0.129032,0.0,1,0,0,1


In [33]:
X_new1 = new_data2.iloc[:, :].values
final_result = regressor_gboost.predict(X_new1)

In [34]:
import csv

In [35]:
f = open('final.csv','w')

In [36]:
writer = csv.writer(f)

In [37]:
type(final_result)

numpy.ndarray

In [38]:
arr = pd.DataFrame(final_result)

In [39]:
arr.to_csv("final_result2.csv")