In [None]:
# import packages

In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

In [None]:
# Change directory to current 

In [None]:
# Directory that holds advertising_train.csv / advertising_test.csv.
# Set the ADVERTISING_DATA_DIR environment variable to run the notebook on another
# machine; the original local path stays as the fallback so existing setups keep working.
DATA_DIR = os.environ.get('ADVERTISING_DATA_DIR', 'C:/Users/saurabh/Desktop/Project for github')
os.chdir(DATA_DIR)

In [None]:
#Read the file using pandas

In [None]:
# Load the training data; the relative path resolves against the working directory set above.
df = pd.read_csv("advertising_train.csv")

In [None]:
# explore the dataframe in following 5 cells

In [None]:
# Column names available in the training frame.
df.columns

In [None]:
# Preview the first rows of the training frame.
df.head()

In [None]:
# Number of missing values per column.
df.isnull().sum()

In [None]:
# Summary statistics of the columns.
df.describe()

In [None]:
# Data type pandas inferred for each column.
df.dtypes

In [None]:
# investigate the datatypes

In [None]:
# Treat identifier-like / discrete columns as categorical rather than numeric.
for categorical_column in ("deviceType", "day", "case_id", "countryId", "companyId"):
    df[categorical_column] = df[categorical_column].astype('category')

In [None]:
# visualize the column "deviceType" to check its distribution

In [None]:
# Scatter the target `y` against `deviceType` on a wide canvas.
fig, ax = plt.subplots(figsize=(16, 8))
ax.scatter(x=df['deviceType'], y=df['y'])
ax.set(xlabel='deviceType', ylabel='y')
plt.show()

In [None]:
# Rank the numeric columns by their correlation with the target `y`.
print("Find most important features relative to target")
# Select numeric/bool columns explicitly: the columns cast to `category` above are not
# meant to be correlated, and pandas >= 2.0 no longer drops non-numeric columns automatically.
corr = df.select_dtypes(include=['number', 'bool']).corr()
# Reassign instead of sorting in place so the cell is safe to re-run.
corr = corr.sort_values(["y"], ascending=False)
print(corr.y)

In [None]:
# As the columns impression and requests have low correlation with the target variable,
# we may check the correlation again after combining both variables into a single ratio

In [None]:
# Impressions per request.
# NOTE(review): rows with requests == 0 would produce inf/NaN here — confirm the data has none.
df['IR_Ratio'] = df['impression'].div(df['requests'])

In [None]:
# drop the undesired columns

In [None]:
# Remove the columns that are not used as model inputs
# (impression and requests are superseded by IR_Ratio).
columns_to_drop = [
    'case_id', 'companyId', 'countryId', 'day', 'dow',
    'impression', 'requests', 'ad_area', 'ad_ratio', 'ratio5',
]
df = df.drop(columns=columns_to_drop)

In [None]:
# replace outliers (values more than 1.5 standard deviations from the mean) with the column median

In [None]:
def reject_outliers(data, n_std=1.5):
    """Replace values far from the mean with the median of the data.

    A value is kept when it lies within ``mean +/- n_std * std`` (population
    standard deviation, ``ddof=0``; missing values are ignored when computing
    the statistics). Every other entry is replaced by the median. Missing
    values never satisfy the range check, so they are replaced as well.

    Parameters
    ----------
    data : pandas.Series or 1-D array-like
        Numeric values to clean. Non-Series input is wrapped in a Series.
    n_std : float, default 1.5
        Half-width of the accepted band, in standard deviations.

    Returns
    -------
    numpy.ndarray
        Array of the same length as ``data`` with outliers replaced.
    """
    # Accept lists / ndarrays as well; `.between` and `.median` need a Series.
    series = data if isinstance(data, pd.Series) else pd.Series(data)
    center = series.mean()
    spread = series.std(ddof=0)  # population std, matching np.std's default
    lower = center - n_std * spread
    upper = center + n_std * spread
    filtered = np.where(series.between(lower, upper), series, series.median())
    return filtered

# Apply the outlier replacement column by column; each column is cleaned
# using its own mean, standard deviation and median.
train_columns_to_clean = (
    'ctr', 'price1', 'price2', 'price3', 'viewability',
    'ratio1', 'ratio2', 'ratio3', 'ratio4',
    'y', 'cpc', 'IR_Ratio',
)
for column_name in train_columns_to_clean:
    df[column_name] = reject_outliers(df[column_name])

In [None]:
# divide variables into features and label

In [None]:
# Model inputs (twelve columns, order matters for the deployment data later on)
# and the target `y` as a single-column 2-D array.
feature_columns = [
    'deviceType', 'price1', 'price2', 'price3', 'cpc', 'ctr',
    'viewability', 'ratio1', 'ratio2', 'ratio3', 'ratio4', 'IR_Ratio',
]
features = df[feature_columns].values
label = df[['y']].values

In [None]:
# scale the variables

In [None]:
# Standardise features and label with *separate* scalers. Refitting one shared scaler
# on the label would overwrite the feature statistics, making it impossible to apply
# the training-time feature scaling to new data later.
feature_scaler = preprocessing.StandardScaler()
features_scaled = feature_scaler.fit_transform(features)

label_scaler = preprocessing.StandardScaler()
label_scaled = label_scaler.fit_transform(label)

# Backward-compatible alias: `scaler` previously ended up fitted on the label.
scaler = label_scaler

In [None]:
# Hold out 5% of the scaled data as a test set and keep the remaining 95% for training.
# Validation data is taken from the training portion by the cross-validated grid search below.
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(
    features_scaled,
    label_scaled,
    test_size=0.05,
    random_state=42,
)

In [None]:
# Custom class for random forest importance, which shows lowest RMSE

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.exceptions import NotFittedError


class RFIFeatureSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that keeps the features a random forest ranks as most important.

    ``fit`` trains a ``RandomForestRegressor`` and stores the column indices of the
    ``n_features_`` highest ``feature_importances_`` (most important first);
    ``transform`` returns only those columns.

    Parameters
    ----------
    n_features_ : int, default 100
        Maximum number of columns to keep. If the input has fewer columns, all are kept.
        The trailing underscore is retained for backward compatibility with existing
        parameter grids (``'rfi_fs__n_features_'``), although scikit-learn normally
        reserves trailing underscores for attributes learned during ``fit``.
    n_estimators : int, default 10
        Number of trees in the ranking forest.
    random_state : int, RandomState instance or None, default None
        Seed for the ranking forest; set it to make the selection reproducible.

    Attributes
    ----------
    fs_indices_ : ndarray of int or None
        Selected column indices; ``None`` until ``fit`` has been called.
    """

    def __init__(self, n_features_=100, n_estimators=10, random_state=None):
        self.n_features_ = n_features_
        self.n_estimators = n_estimators
        self.random_state = random_state
        # Kept for backward compatibility: callers may inspect this before fitting.
        self.fs_indices_ = None

    def fit(self, X, y):
        """Rank the columns of ``X`` by random-forest importance for predicting ``y``."""
        y = np.asarray(y)
        # A single-column 2-D target is flattened so the forest does not have to
        # convert it itself; genuine multi-output targets are left untouched.
        if y.ndim == 2 and y.shape[1] == 1:
            y = y.ravel()
        model_rfi = RandomForestRegressor(n_estimators=self.n_estimators,
                                          random_state=self.random_state)
        model_rfi.fit(X, y)
        ranked_indices = np.argsort(model_rfi.feature_importances_)[::-1]
        self.fs_indices_ = ranked_indices[:self.n_features_]
        return self

    def transform(self, X, y=None):
        """Return the columns of ``X`` selected during ``fit``.

        Raises
        ------
        NotFittedError
            If called before ``fit``. Without this guard ``X[:, None]`` would
            silently return an array with an extra axis instead of failing.
        """
        if self.fs_indices_ is None:
            raise NotFittedError(
                "RFIFeatureSelector is not fitted yet; call fit before transform."
            )
        return X[:, self.fs_indices_]

In [31]:
# Tune a feature-selection + random-forest pipeline with an exhaustive grid search.
# Candidates are compared by negative mean squared error under 5-fold cross-validation.
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

rf_regressor = RandomForestRegressor(random_state=999)

pipe_RF = Pipeline(steps=[
    ('rfi_fs', RFIFeatureSelector()),
    ('rf', rf_regressor),
])

# Tree depths 1..9 are explored.
depths = np.arange(1, 10)

params_pipe_RF = {
    'rfi_fs__n_features_': [15],
    'rf__max_depth': depths,
    'rf__min_samples_split': list(range(2, 7)),
}

gs_pipe_RF = GridSearchCV(
    estimator=pipe_RF,
    param_grid=params_pipe_RF,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,  # use every available core
    verbose=1,
)

gs_pipe_RF.fit(x_train, y_train);

KeyboardInterrupt: 

In [None]:
# check the best parameters of the model and the score

In [None]:
# Hyper-parameter combination with the best mean cross-validated score.
gs_pipe_RF.best_params_

In [None]:
# Best mean cross-validated score; with scoring='neg_mean_squared_error' this is a
# negated MSE, so values closer to 0 are better.
gs_pipe_RF.best_score_

In [None]:
# cross validation

In [None]:
from sklearn.model_selection import cross_val_score
# Cross-validated RMSE on the full scaled data set. The scores are negated MSE values,
# hence the sign flip before the square root.
# Fixes three undefined names: `feature_scaled` -> `features_scaled`,
# `target` -> `label_scaled`, and the result is stored under the name that is printed.
# NOTE(review): passing the GridSearchCV object re-runs the whole grid search inside every
# outer fold (nested CV), which is expensive — confirm this is intended.
cv_results_rf = np.sqrt(-cross_val_score(gs_pipe_RF, features_scaled, label_scaled.ravel(),
                                         scoring='neg_mean_squared_error'))
print("RMSE: %0.2f (+/- %0.2f)" % (cv_results_rf.mean(), cv_results_rf.std() * 2))

In [None]:
# since the dataset was divided in two parts i.e. train and test set, the following section 
# uses the same model on test set

In [None]:
# Load the data to predict on; literal '?' entries are parsed as missing values.
df_deploy = pd.read_csv("advertising_test.csv", na_values=['?'])

In [None]:
# check the data type

In [None]:
# Mirror the categorical casts applied to the training frame.
for categorical_column in ("deviceType", "day", "countryId", "companyId"):
    df_deploy[categorical_column] = df_deploy[categorical_column].astype('category')

In [None]:
# create a new feature 

In [None]:
# Impressions per request, computed the same way as for the training frame.
df_deploy['IR_Ratio'] = df_deploy['impression'].div(df_deploy['requests'])

In [None]:
# remove outliers

In [None]:
# Apply the same outlier replacement to every numeric input column of the deployment frame.
deploy_columns_to_clean = (
    'ctr', 'price1', 'price2', 'price3', 'viewability',
    'ratio1', 'ratio2', 'ratio3', 'ratio4',
    'cpc', 'IR_Ratio',
)
for column_name in deploy_columns_to_clean:
    df_deploy[column_name] = reject_outliers(df_deploy[column_name])

In [None]:
# create dataframe for all the required columns

In [None]:
# Same twelve feature columns, in the same order, as used for training.
deploy_feature_columns = [
    'deviceType', 'price1', 'price2', 'price3', 'cpc', 'ctr',
    'viewability', 'ratio1', 'ratio2', 'ratio3', 'ratio4', 'IR_Ratio',
]
features_df_deploy = df_deploy[deploy_feature_columns].values

In [None]:
# check the shape of the dataframe

In [None]:
# (n_rows, n_columns) of the deployment feature matrix; twelve columns were selected above.
features_df_deploy.shape

In [None]:
# scale the features

In [None]:
# Scale the deployment features with the statistics of the TRAINING features.
# Fitting a fresh scaler on the deployment data would put the inputs on a different
# scale than the one the model was trained on.
scaler = preprocessing.StandardScaler().fit(features)
features_df_deploy_scaled = scaler.transform(features_df_deploy)

In [None]:
# To inspect the predicted values for y, collect them in the dataframe prediction_df.
prediction_df = pd.DataFrame()
# `rf_predictor` was never defined; the fitted grid search predicts with its best pipeline.
prediction_scaled = gs_pipe_RF.predict(features_df_deploy_scaled)
# The model was trained on the standardised label, so map the predictions back to the
# original units of y using the training label's mean and population standard deviation
# (the statistics StandardScaler used for label_scaled).
prediction = prediction_scaled * np.nanstd(label) + np.nanmean(label)

In [None]:
# Store the predictions, add a 1-based running case_id as the leading column, and preview.
prediction_df['y'] = prediction
case_id = range(1, len(prediction_df) + 1)
prediction_df.insert(loc=0, column="case_id", value=case_id, allow_duplicates=True)
prediction_df.head(10)

In [None]:
# Write the predictions to a CSV file in the current working directory.
# NOTE(review): the DataFrame index is written as an extra unnamed column next to
# case_id — pass index=False if that column is not wanted.
prediction_df.to_csv("prediction_google_ad.csv")