In [None]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns
from scipy.stats import probplot
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from xgboost import XGBRFRegressor
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRFRegressor, XGBRegressor
import catboost as ctb
from sklearn.preprocessing import LabelBinarizer, LabelEncoder
from sklearn.preprocessing import StandardScaler

In [None]:
# Training and test sets are read straight from the project's GitHub repository.
train_url = "https://raw.githubusercontent.com/akhil14shukla/Summer-of-Analytics-IITG-Project/master/Train_Data.csv"
test_url = "https://raw.githubusercontent.com/akhil14shukla/Summer-of-Analytics-IITG-Project/master/Test_Data.csv"

df = pd.read_csv(train_url)
test = pd.read_csv(test_url)

In [None]:
# (rows, columns) of the training frame.
df.shape

## Understanding the Data

In [None]:
# One histogram per column on a 4x4 grid; zip() stops at whichever of
# "columns" or "axes" runs out first.
fig, axes = plt.subplots(4, 4, figsize=(25, 20))
for column, ax in zip(df.columns, axes.ravel()):
    sns.histplot(df[column], ax=ax)
    ax.set_title(column)
fig.subplots_adjust(wspace=0.2, hspace=0.5)
plt.show()

In [None]:
# Revenue per adgroup. x and y are passed by keyword: seaborn >= 0.12 no longer
# accepts the data vectors positionally and raises a TypeError if they are.
sns.scatterplot(x=df['adgroup'], y=df['revenue'])

## Data Cleaning and Transformation

Label-encoded _adgroup_, dropped the columns that are not used as features (_campaign_ has the same value in every row), and log-transformed the skewed numeric columns. New features that might be useful for the model are created further below.

In [None]:
# Integer-encode adgroup. The encoder is fitted on the training frame only and
# re-used for the test frame so both share one label -> code mapping.
le = LabelEncoder()
df['adgroup'] = le.fit_transform(df['adgroup'])
test['adgroup'] = le.transform(test['adgroup'])

# Index both frames by date; the date column itself is dropped below.
df = df.set_index(df['date'])
test = test.set_index(test['date'])

# Natural log of the volume/cost columns (np.log maps 0 to -inf), then remove
# the columns that are not used as features.
log_columns = ["impressions", "clicks", "cost"]
for frame in (df, test):
    frame[log_columns] = np.log(frame[log_columns])
    frame.drop(['date', 'campaign'], axis=1, inplace=True)


Removed the "ad " prefix from the _ad_ column and converted the remainder to a numeric type.

In [None]:
# Strip the 3-character "ad " prefix and parse what is left as a number.
# This uses the vectorised .str accessor instead of a row loop: both frames are
# indexed by date, so integer lookups such as df['ad'][i] depend on deprecated
# positional fallback, and chained assignment (df['ad'][i] = ...) is not
# guaranteed to write back into the frame.
df['ad'] = pd.to_numeric(df['ad'].str[3:])
test['ad'] = pd.to_numeric(test['ad'].str[3:])

# Only the training frame drops the column; the test frame keeps its numeric copy.
df.drop(['ad'], axis=1, inplace=True)

In [None]:
# sns.pairplot(df,hue="revenue")

While creating the new features, infinite and NaN values were introduced unintentionally. Models do not accept these values, so they are replaced. <br>
-infinity is replaced with 0, and infinity and NaN are replaced with a number greater than the current maximum (65 was chosen arbitrarily; it could just as well be 100, 10000, ...). The only requirement is to preserve the essence of _-infinity_ and _infinity_.

In [None]:
# Placeholder written over +inf and NaN. Chosen by the author as a value above
# the observed maximum; the exact number is arbitrary.
INF_FILL_VALUE = 65

for frame in (df, test):
    # Ratio features; clicks, impressions and cost are already log-transformed.
    frame['CTR'] = frame['clicks'] / frame['impressions']
    frame['CPC'] = frame['cost'] / frame['clicks']
    frame['CPA'] = frame['cost'] / frame['conversions']

    # Fill NaN gaps by interpolation. The result is assigned back explicitly:
    # frame['CPC'].fillna(..., inplace=True) operates on a temporary Series and
    # does not update the frame under pandas copy-on-write.
    frame['CPC'] = frame['CPC'].interpolate()
    frame['CPA'] = frame['CPA'].interpolate()

    # Models reject non-finite input: -inf becomes 0, +inf and any remaining
    # NaN become the placeholder.
    frame.replace([-np.inf], 0, inplace=True)
    frame.replace([np.inf, np.nan], INF_FILL_VALUE, inplace=True)

In [None]:
# Pairwise correlation of the training frame's columns, drawn as a heatmap.
sns.heatmap(df.corr(),cmap="YlGnBu")

In [None]:
# Scatter-plot matrix over the columns of the training frame.
sns.pairplot(df)

**Created a custom function for finding Predictive Power Score. Check out the [repository here](https://github.com/akhil14shukla/PyCustom).**

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import *
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn import metrics               
from sklearn import preprocessing
from sklearn import utils
from sklearn.metrics import mean_absolute_error


def _pps_is_categorical(series):
    """Return True when the column holds non-numeric (object / category) values."""
    return series.dtype == object or series.dtype.name == "category"


def _pps_feature_matrix(series):
    """Return the column as an (n, 1) array, label-encoding non-numeric values."""
    if _pps_is_categorical(series):
        values = preprocessing.LabelEncoder().fit_transform(series)
    else:
        values = series.to_numpy()
    return np.asarray(values).reshape(-1, 1)


def pps(df, categorical_features=None, numerical_features=None, random_state=None):
    """Compute a Predictive Power Score (PPS) for every ordered pair of columns.

    For each (feature, target) pair, a decision tree using that single feature
    is fitted on a train split and compared with a naive baseline on the
    held-out split:

    * classification target: weighted F1 against "always predict the most
      frequent class"; ``PPS = (f1 - f1_naive) / (1 - f1_naive)``
    * regression target: MAE against "always predict the median";
      ``PPS = 1 - mae / mae_naive``

    Scores are floored at 0. When the baseline is already perfect (the
    denominator would be 0) the score is reported as 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are scored against each other (a column is also
        paired with itself).
    categorical_features : container of column names, optional
        Columns to treat as classification targets. Columns with object or
        category dtype are always treated as classification targets.
    numerical_features : optional
        Accepted for interface compatibility; not used.
    random_state : int, optional
        Forwarded to the train/test split and to the trees. ``None`` (the
        default) keeps the runs non-deterministic.

    Returns
    -------
    pandas.DataFrame
        One row per (feature, target) pair with the columns "Feature",
        "Target", "PPS", "Type of Prediction", "Cross-Val Score" (hold-out
        accuracy for classification, hold-out MAE for regression),
        "Training Score", "Naive-Baseline Score" and "Model".
    """
    columns = ["Feature", "Target", "PPS", "Type of Prediction", "Cross-Val Score",
               "Training Score", "Naive-Baseline Score", "Model"]
    # Rows are collected in a list and turned into a frame once at the end;
    # DataFrame.append was removed in pandas 2.0 and was quadratic anyway.
    records = []
    for i in df.columns:
        x = _pps_feature_matrix(df[i])
        for j in df.columns:
            listed_as_categorical = categorical_features is not None and j in categorical_features
            if listed_as_categorical or _pps_is_categorical(df[j]):
                model = DecisionTreeClassifier(random_state=random_state)
                y = preprocessing.LabelEncoder().fit_transform(df[j])
                x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=random_state)
                model.fit(x_train, y_train)
                y_pred = model.predict(x_test)
                # Weighted averaging so that multiclass targets are supported.
                f1 = metrics.f1_score(y_test, y_pred, average="weighted", zero_division=0)
                # The baseline is the most frequent *encoded* label, so it is
                # comparable with y_test.
                majority = np.bincount(y).argmax()
                naive_pred = np.full(len(y_test), majority)
                f1_naive = metrics.f1_score(y_test, naive_pred, average="weighted", zero_division=0)
                pps_score = max(0, (f1 - f1_naive) / (1 - f1_naive)) if f1_naive < 1 else 0
                cv_score = model.score(x_test, y_test)
                train_score = model.score(x_train, y_train)
                records.append({"Feature": i, "Target": j, "PPS": pps_score,
                                "Type of Prediction": "Classification",
                                "Cross-Val Score": cv_score, "Training Score": train_score,
                                "Naive-Baseline Score": f1_naive,
                                "Model": "DecisionTreeClassifier()"})
            else:
                model = DecisionTreeRegressor(random_state=random_state)
                y = df[j].to_numpy()
                x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=random_state)
                model.fit(x_train, y_train)
                naive_pred = np.full(len(y_test), df[j].median())
                naive_mae = metrics.mean_absolute_error(y_test, naive_pred)
                mae = metrics.mean_absolute_error(y_test, model.predict(x_test))
                train_score = mean_absolute_error(y_train, model.predict(x_train))
                pps_score = max(0, 1 - mae / naive_mae) if naive_mae > 0 else 0
                records.append({"Feature": i, "Target": j, "PPS": pps_score,
                                "Type of Prediction": "Regression",
                                "Cross-Val Score": mae, "Training Score": train_score,
                                "Naive-Baseline Score": naive_mae,
                                "Model": "DecisionTreeRegressor()"})

    return pd.DataFrame(records, columns=columns)

In [None]:
# Score every (feature, target) column pair of the training frame.
pf = pps(df)

In [None]:
# Keep only the rows that score a feature against the revenue target.
pf.loc[pf["Target"].eq("revenue")]

The library imported below makes it easier to work with StandardScaler on DataFrames.

In [None]:
# df = np.array(df)
from sklearn_pandas import DataFrameMapper

## Building the model
I tried various models CatBoost, XGBoost, RandomForest, XGB turned out to be the best.<br><br>
Thinking behind building the model:<br>
Created one model per _adgroup_ (4 in total), since each _adgroup_ might have a different distribution or a different advertising approach, and might therefore generate different revenue. To do that, the data is first grouped by _adgroup_, and then each of the 4 models is trained on its own group, one by one, in a _for_ loop.<br> 
But before doing that we need to scale the data, for better performance. Used _StandardScaler()_, so any remaining outliers will be taken care of. But as we are training different models, I decided to use different scaling for each _adgroup_. It seems a lot of work but it was finally worth it.

In [None]:
def scale_adgroup(grouped, adgroup):
    """Split one adgroup into scaled features and target.

    Returns ``(features, target, mapper)`` where ``features`` is the group's
    frame without ``revenue`` after standard scaling, ``target`` is the
    ``revenue`` column, and ``mapper`` is the fitted DataFrameMapper that is
    re-used later to scale the test rows of the same adgroup.

    NOTE(review): the scaler is fitted on the whole group before the
    train/validation split, so the validation RMSE sees scaling statistics
    computed on its own rows.
    """
    group = grouped.get_group(adgroup)
    target = group['revenue']
    features = group.drop(['revenue'], axis=1)
    mapper = DataFrameMapper([(features.columns, StandardScaler())])
    scaled = mapper.fit_transform(features.copy())
    return pd.DataFrame(scaled, columns=features.columns), target, mapper


def fit_and_score(model, features, target):
    """Fit ``model`` on a train split and return the RMSE on the held-out split.

    Negative predictions are clipped to 0 before scoring. The model is fitted
    on the train split only and stays fitted that way for later use.
    """
    d_train, d_test, y_train, y_test = train_test_split(features, target, random_state=0)
    model.fit(d_train, y_train)
    predictions = np.clip(model.predict(d_test), 0, None)
    # np.sqrt(MSE) rather than mean_squared_error(..., squared=False): the
    # `squared` argument has been removed from recent scikit-learn releases.
    return np.sqrt(mean_squared_error(y_test, predictions))


g = df.groupby('adgroup')

# One scaler and one feature frame per label-encoded adgroup (0..3).
df0, y0, mapper0 = scale_adgroup(g, 0)
df1, y1, mapper1 = scale_adgroup(g, 1)
df2, y2, mapper2 = scale_adgroup(g, 2)
df3, y3, mapper3 = scale_adgroup(g, 3)

# One model per adgroup; the names are kept because the prediction cell uses them.
model0 = XGBRFRegressor(n_estimators=4000)
model1 = XGBRFRegressor(n_estimators=4000)
model2 = XGBRFRegressor(n_estimators=4000)
model3 = XGBRFRegressor(n_estimators=4000)

# Train each model on its own adgroup and print the validation RMSE.
for model, features, target in (
    (model0, df0, y0),
    (model1, df1, y1),
    (model2, df2, y2),
    (model3, df3, y3),
):
    rms = fit_and_score(model, features, target)
    print(rms)


## Predicting
Predicting the values for Test Dataset. <br><br>
For each row of the test data set, the scaling fitted for its _adgroup_ in the previous cell is applied, and the value is predicted with the model trained for that _adgroup_.

In [None]:
x_test = test

# Scaler and model fitted for each label-encoded adgroup.
pipelines = {
    0: (mapper0, model0),
    1: (mapper1, model1),
    2: (mapper2, model2),
    3: (mapper3, model3),
}

# Predict one adgroup at a time with a boolean row mask. The test frame is
# indexed by date, so integer lookups like x_test['adgroup'][i] would depend on
# deprecated positional fallback, and np.append per row is quadratic.
adgroup_codes = x_test['adgroup'].to_numpy()
y_pred = np.zeros(len(x_test))
predicted = np.zeros(len(x_test), dtype=bool)
for code, (mapper, model) in pipelines.items():
    rows = adgroup_codes == code
    if not rows.any():
        continue
    y_pred[rows] = model.predict(mapper.transform(x_test[rows]))
    predicted |= rows

# A row with an unexpected adgroup would otherwise be left out silently and
# shift every later prediction in the submission.
if not predicted.all():
    unknown = sorted(set(adgroup_codes[~predicted].tolist()))
    raise ValueError(f"No model available for adgroup value(s): {unknown}")


The model might predict negative revenue in some cases. To take care of that, negative values are replaced with 0.

In [None]:
# Revenue cannot be negative: overwrite every negative prediction with 0, in place.
negative = y_pred < 0
y_pred[negative] = 0

In [None]:
y_pred.shape  # checking the shape of the final array

## Exporting to CSV

In [None]:
# Write the predictions to the submission file (the default integer index is included).
submission = pd.DataFrame({'revenue': y_pred})
submission.to_csv("submission_4.csv")