In [35]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
import seaborn as sns
%matplotlib inline

In [2]:
ndata = pd.read_csv("NextData.csv")

In [3]:
ndata = ndata.drop(['Advertiser', 'Advertiser ID', 'Advertiser Status',
       'Advertiser Integration Code',
       'Advertiser Currency'], axis=1)

In [4]:
ndata.columns = ['TOD', 'Country', 'Device', 'Impressions', 'mImpressions', 'vImpressions', 'Clicks','Spend']

In [5]:
ndata["CTR"] = ndata.Clicks / ndata.Impressions
ndata["Viewability"] = ndata.vImpressions / ndata.mImpressions

In [6]:
ndata = ndata[ndata.Device.isnull() == False]

In [7]:
ndata.Country = ndata.Country.astype("category")
ndata.Device = ndata.Device.astype("category")

---



In [8]:
#craete dependent variable matrix X and independent variable y
#here importantly we need to transform the dataframe into array of values before it can be used
X = ndata.loc[:, ["TOD","Viewability", "Device",]].values
y = ndata.loc[:, "CTR"].values

In [9]:
#replacing missing data with mean
from sklearn.preprocessing import Imputer

In [10]:
#firstly we define the imputer object with Imputer and the criteria
#firstly fit the imputer to X dataset which will save imputer object with X mean etc. value in it
imputer = Imputer(missing_values="NaN", strategy="mean", axis = 0)
imputer = imputer.fit(X[:, 0:2])

In [11]:
#then we set X equals to X being transformed by using the data saved imputer object to X
X[:, 0:2] = imputer.transform(X[:, 0:2])

In [12]:
#now we fix categorical data and transform that to dummy arrays
#import LabelEncoder to transform categorical data to numerical
#import OneHotEncoder to then take the numerical and make that to dummy with columns
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [13]:
#firstly use LabelEncoder to fit and transform non-numerical data to numerical
labelencoder_x = LabelEncoder()
X[:, 2] = labelencoder_x.fit_transform(X[:, 2])

In [14]:
#then use OneHotEncoder to fit and transform the numerical data to dummy 
#OneHotEncoder syntax the parameter categorical_features=[g], g is the column we like to create dummy against
dummy = OneHotEncoder(categorical_features=[2])
X = dummy.fit_transform(X).toarray()

In [15]:
from sklearn.cross_validation import train_test_split

In [16]:
#below syntax with above cross_validation is to use to split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 0)

In [17]:
#import StandardScaler this is to standardise the dataset 
from sklearn.preprocessing import StandardScaler

In [18]:
sc = StandardScaler()

In [19]:
#firstly fir the standard scaler to the training dataset
X_train = sc.fit_transform(X_train)
#as sc is now fit with X training set, there is no need to fit it again, just use it to transform test set
X_test = sc.transform(X_test)

--- 



In [21]:
from sklearn.linear_model import LinearRegression

In [22]:
regressor = LinearRegression()

In [23]:
regressor_ = regressor.fit(X_train, y_train)

In [24]:
y_pred = regressor_.predict(X_test)

---

In [None]:
#------------Backward elimination-------------

In [27]:
#here we go through the process of backward elimination to build an optimal model which takes in only the optimsal independent variables
#import the statsmodel here to use its OLS to check 
import statsmodels.formula.api as sm

In [43]:
#one thing to bear in mind is that OLS doesn't take into account constant 
#we need to therefore build a column of 1s for it factor in constant
#trick: here we reverse the arguments in append
#by append X to the value instead of value to X we get the 1s to the first column
#np.ones() output is an array, so use astype(int) to convert that into a list of values so X can append to it
X = np.append(arr=np.ones((len(X[:,0]), 1)).astype(int), values = X, axis=1)

In [59]:
#now we can through the process of fitting OLS to our independent variables
#firstly we create an object to represent the dataset X instead of using X directly
#this is important in the sense that so when we take out the each independent variables we can keep track of them
X_opt = X[:, [1, 2, 3, 4, 5, 6]]
#then we fit the OLS model to the dataset
regressor_summary = sm.OLS(endog = y, exog = X_opt).fit()

In [60]:
#then we examine the summary of it
regressor_summary.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.132
Model:,OLS,Adj. R-squared:,0.121
Method:,Least Squares,F-statistic:,12.57
Date:,"Sun, 16 Apr 2017",Prob (F-statistic):,1.56e-09
Time:,17:03:00,Log-Likelihood:,1926.1
No. Observations:,336,AIC:,-3842.0
Df Residuals:,331,BIC:,-3823.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
const,0.0008,0.000,3.622,0.000,0.000 0.001
x1,7.479e-07,9.58e-05,0.008,0.994,-0.000 0.000
x2,0.0011,0.000,8.355,0.000,0.001 0.001
x3,-0.0003,0.000,-1.932,0.054,-0.001 5.83e-06
x4,-1.623e-07,6.3e-06,-0.026,0.979,-1.25e-05 1.22e-05
x5,0.0007,0.001,1.150,0.251,-0.001 0.002

0,1,2,3
Omnibus:,336.547,Durbin-Watson:,2.14
Prob(Omnibus):,0.0,Jarque-Bera (JB):,13081.91
Skew:,4.19,Prob(JB):,0.0
Kurtosis:,32.397,Cond. No.,5.55e+16


In [64]:
X_opt = X[:, [1, 3]]
regressor_summary = sm.OLS(endog = y, exog = X_opt).fit()
regressor_summary.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.121
Model:,OLS,Adj. R-squared:,0.119
Method:,Least Squares,F-statistic:,46.06
Date:,"Sun, 16 Apr 2017",Prob (F-statistic):,5.25e-11
Time:,17:04:15,Log-Likelihood:,1924.1
No. Observations:,336,AIC:,-3844.0
Df Residuals:,334,BIC:,-3836.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
const,0.0011,4.48e-05,25.348,0.000,0.001 0.001
x1,0.0011,0.000,6.786,0.000,0.001 0.001

0,1,2,3
Omnibus:,331.807,Durbin-Watson:,2.16
Prob(Omnibus):,0.0,Jarque-Bera (JB):,12295.611
Skew:,4.114,Prob(JB):,0.0
Kurtosis:,31.47,Cond. No.,3.9


In [65]:
X

array([[ 1.        ,  1.        ,  1.        , ...,  0.        ,
         0.        ,  0.26607072],
       [ 1.        ,  1.        ,  1.        , ...,  0.        ,
         0.        ,  0.39137441],
       [ 1.        ,  1.        ,  1.        , ...,  0.        ,
         1.        ,  0.25305692],
       ..., 
       [ 1.        ,  1.        ,  0.        , ...,  0.        ,
         9.        ,  0.41634366],
       [ 1.        ,  1.        ,  1.        , ...,  0.        ,
         9.        ,  0.46174584],
       [ 1.        ,  1.        ,  1.        , ...,  0.        ,
         9.        ,  0.4109987 ]])