In [5]:
# import libraries
import pandas as pd 
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from regressors import stats

In [6]:
# Load the training data and make every column numeric

# read the raw training file
data = pd.read_csv('train.csv')

# parse Date as datetime, then store it as a float
data['Date'] = pd.to_datetime(data['Date']).values.astype(float)

# label-encode each text column into a new integer column (original name -> encoded name),
# so that the model can consume it
enc = LabelEncoder()
encoded_names = {'OS': 'OSn', 'Country': 'CountryN', 'Type_of_Visitor': 'Type_of_VisitorN'}
for text_col, numeric_col in encoded_names.items():
    data[numeric_col] = enc.fit(data[text_col]).transform(data[text_col])

# remove the original text columns and use Access_ID as the index
data = data.drop(columns=list(encoded_names)).set_index('Access_ID')

data.head()

Unnamed: 0_level_0,Date,AccountMng_Pages,AccountMng_Duration,FAQ_Pages,FAQ_Duration,Product_Pages,Product_Duration,GoogleAnalytics_BounceRate,GoogleAnalytics_ExitRate,GoogleAnalytics_PageValue,Browser,Type_of_Traffic,Buy,OSn,CountryN,Type_of_VisitorN
Access_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
102863333,1.584749e+18,0,0.0,0,0.0,3,64.0,0.0,0.0667,0.0,2,1,0,3,5,2
103117814,1.589933e+18,0,0.0,0,0.0,23,684.5,0.0217,0.0449,0.0,2,6,0,6,1,2
103201891,1.60704e+18,0,0.0,0,0.0,8,95.0,0.025,0.0583,0.0,4,1,0,6,3,2
103226087,1.608595e+18,0,0.0,0,0.0,9,608.75,0.0,0.025,42.4225,2,2,1,6,8,2
103234445,1.606176e+18,0,0.0,2,386.0,36,1609.9397,0.0,0.0093,12.5033,2,3,1,6,6,2


In [7]:
# Split features / target and hold out 20% of the rows for validation
y = data['Buy']
X = data.drop('Buy', axis=1)
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=15, test_size=0.2)

# Fit the linear regression on the training split and predict the validation split
lin_model = LinearRegression().fit(X_train, y_train)
predictions = lin_model.predict(X_val)

# Print the regression summary (coefficients, std. errors, p-values) for the training fit
xlabels = X_train.columns
stats.summary(clf=lin_model, X=X_train, y=y_train, xlabels=xlabels)

Residuals:
   Min     1Q  Median     3Q    Max
0.0074 0.0615  0.1061 0.1643 3.4218


Coefficients:
                            Estimate          Std. Error          t value   p value
_intercept                 -5.227565  0.639344+0.000000j  -8.1765+0.0000j  0.000000
Date                        0.000000  0.000000+0.000000j   0.0000-0.0000j  1.000000
AccountMng_Pages            0.003397  0.001418+0.000000j   2.3961-0.0000j  0.016592
AccountMng_Duration         0.000017  0.000021+0.000000j   0.8051-0.0000j  0.420778
FAQ_Pages                   0.000172  0.003736+0.000000j   0.0461-0.0000j  0.963241
FAQ_Duration                0.000004  0.000028+0.000000j   0.1385-0.0000j  0.889852
Product_Pages               0.000325  0.000165+0.000000j   1.9648-0.0000j  0.049469
Product_Duration            0.000012  0.000002+0.000000j   5.9768-0.0000j  0.000000
GoogleAnalytics_BounceRate -0.000013  0.152242+0.000000j  -0.0001+0.0000j  0.999932
GoogleAnalytics_ExitRate   -0.000017  0.163129+0.000000j  -0.

In [10]:
# Load the test set and apply the same cleanup as the training set.
test_data = pd.read_csv('test.csv')

# transforming Date column into datetime type and then as float
test_data['Date'] = pd.to_datetime(test_data['Date']).values.astype(float)

# Encode the non-numerical columns with the label -> integer mapping learned from
# train.csv. LabelEncoder numbers the sorted labels it sees during fit, so fitting a
# fresh encoder on the test set could give the same label a different integer than
# the one the model was trained on.
categorical_columns = {'OS': 'OSn', 'Country': 'CountryN', 'Type_of_Visitor': 'Type_of_VisitorN'}
train_categories = pd.read_csv('train.csv', usecols=list(categorical_columns))

enc = LabelEncoder()
for source_col, encoded_col in categorical_columns.items():
    enc.fit(train_categories[source_col])
    code_by_label = {label: code for code, label in enumerate(enc.classes_)}
    # Labels that never occur in the training data have no learned code; mark them
    # with -1 instead of letting enc.transform raise ValueError.
    test_data[encoded_col] = (
        test_data[source_col].map(code_by_label).fillna(-1).astype('int64')
    )

# dropping non-numerical columns
test_data = test_data.drop(columns=list(categorical_columns))

# set access_id as index
test_data = test_data.set_index('Access_ID')

test_data.head()

Unnamed: 0_level_0,Date,AccountMng_Pages,AccountMng_Duration,FAQ_Pages,FAQ_Duration,Product_Pages,Product_Duration,GoogleAnalytics_BounceRate,GoogleAnalytics_ExitRate,GoogleAnalytics_PageValue,Browser,Type_of_Traffic,OSn,CountryN,Type_of_VisitorN
Access_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
798519314,1.598227e+18,0,0.0,0,0.0,86,5332.5478,0.0139,0.0654,0.0,2,1,6,0,2
798663101,1.590365e+18,2,37.5,0,0.0,55,1420.4725,0.0012,0.0058,0.0,1,4,0,6,2
798663221,1.604275e+18,0,0.0,0,0.0,36,788.5,0.0,0.025,0.0,6,3,6,6,2
798760918,1.589414e+18,0,0.0,0,0.0,2,689.0,0.0,0.1,0.0,2,2,6,6,0
798856982,1.604534e+18,12,225.25,1,0.0,129,3367.0413,0.0014,0.0185,1.0353,4,10,6,5,2


In [11]:
# Predict 'Buy' for the test set with the fitted linear model.
# Select exactly the training feature columns, in training order: this keeps a 'Buy'
# column left by a previous run of this cell out of the model input, so the cell can be
# re-run safely, and fails loudly (KeyError) if a training feature is missing.
test_features = test_data[X_train.columns]
predictions = lin_model.predict(test_features)

# NOTE: these are the raw regression outputs; they are not rounded to 0/1 labels.
test_data['Buy'] = predictions
test_data

In [None]:
# Show the predicted 'Buy' values that are greater than 1
above_one = test_data['Buy'] > 1
test_data.loc[above_one, 'Buy']

In [None]:
# List the distinct predicted 'Buy' values
test_data['Buy'].unique()

In [None]:
# Build the submission file: one row per Access_ID with its predicted 'Buy' value.
submission = test_data.copy()
# Access_ID is set as the index of test_data during cleanup, so it is not a column;
# restore it before selecting, otherwise the selection below raises KeyError.
if 'Access_ID' not in submission.columns:
    submission = submission.reset_index()
submission = submission[['Access_ID', 'Buy']]
submission.to_csv('linsubmission1.csv', index=False)