In [1]:
# import libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

In [2]:
# IMPORT AND CLEANUP DATA
#import data
techscape = pd.read_csv('train.csv')

# transform Date column into datetime type and then as float
techscape.Date = pd.to_datetime(techscape.Date).values.astype(float)

# transforming non-numerical variables into numerical, so that they can be parsed into the model
enc = LabelEncoder()
# turning OS
enc.fit(techscape['OS'])
techscape['OSn'] = enc.transform(techscape['OS'])
# turning country
enc.fit(techscape['Country'])
techscape['CountryN'] = enc.transform(techscape['Country'])
# turning type of visitor
enc.fit(techscape['Type_of_Visitor'])
techscape['Type_of_VisitorN'] = enc.transform(techscape['Type_of_Visitor'])

# drop non numerical columns
techscape = techscape.drop(['OS', 'Country', 'Type_of_Visitor'], axis=1)

# set access_id as index
techscape = techscape.set_index('Access_ID')

techscape.head()

Unnamed: 0_level_0,Date,AccountMng_Pages,AccountMng_Duration,FAQ_Pages,FAQ_Duration,Product_Pages,Product_Duration,GoogleAnalytics_BounceRate,GoogleAnalytics_ExitRate,GoogleAnalytics_PageValue,Browser,Type_of_Traffic,Buy,OSn,CountryN,Type_of_VisitorN
Access_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
102863333,1.584749e+18,0,0.0,0,0.0,3,64.0,0.0,0.0667,0.0,2,1,0,3,5,2
103117814,1.589933e+18,0,0.0,0,0.0,23,684.5,0.0217,0.0449,0.0,2,6,0,6,1,2
103201891,1.60704e+18,0,0.0,0,0.0,8,95.0,0.025,0.0583,0.0,4,1,0,6,3,2
103226087,1.608595e+18,0,0.0,0,0.0,9,608.75,0.0,0.025,42.4225,2,2,1,6,8,2
103234445,1.606176e+18,0,0.0,2,386.0,36,1609.9397,0.0,0.0093,12.5033,2,3,1,6,6,2


In [3]:
# SCALE DATA

scaler = MinMaxScaler()
techscapeScaled = scaler.fit_transform(techscape)
techscapeScaled = pd.DataFrame(techscapeScaled)

techscapeScaled.columns = techscape.columns
techscapeScaled

Unnamed: 0,Date,AccountMng_Pages,AccountMng_Duration,FAQ_Pages,FAQ_Duration,Product_Pages,Product_Duration,GoogleAnalytics_BounceRate,GoogleAnalytics_ExitRate,GoogleAnalytics_PageValue,Browser,Type_of_Traffic,Buy,OSn,CountryN,Type_of_VisitorN
0,0.146707,0.000000,0.000000,0.000000,0.00000,0.004255,0.001000,0.0000,0.3335,0.000000,0.083333,0.000000,0.0,0.428571,0.625,1.0
1,0.326347,0.000000,0.000000,0.000000,0.00000,0.032624,0.010700,0.1085,0.2245,0.000000,0.083333,0.357143,0.0,0.857143,0.125,1.0
2,0.919162,0.000000,0.000000,0.000000,0.00000,0.011348,0.001485,0.1250,0.2915,0.000000,0.250000,0.000000,0.0,0.857143,0.375,1.0
3,0.973054,0.000000,0.000000,0.000000,0.00000,0.012766,0.009516,0.0000,0.1250,0.117266,0.083333,0.071429,1.0,0.857143,1.000,1.0
4,0.889222,0.000000,0.000000,0.083333,0.15141,0.051064,0.025166,0.0000,0.0465,0.034562,0.083333,0.142857,1.0,0.857143,0.750,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9994,0.290419,0.111111,0.046782,0.083333,0.02177,0.032624,0.017197,0.0000,0.0555,0.000000,0.083333,0.928571,0.0,0.428571,0.000,1.0
9995,0.500000,0.222222,0.051519,0.000000,0.00000,0.004255,0.000431,0.0000,0.0555,0.000000,0.750000,0.285714,0.0,0.857143,0.750,0.0
9996,0.152695,0.000000,0.000000,0.000000,0.00000,0.038298,0.010067,0.0385,0.2595,0.000000,0.083333,0.142857,0.0,0.428571,0.125,1.0
9997,0.314371,0.000000,0.000000,0.000000,0.00000,0.075177,0.011184,0.1130,0.1815,0.000000,0.083333,0.142857,0.0,0.857143,0.375,1.0


In [4]:
# SPLIT DATA AND RUN MODEL
#split data
target = techscapeScaled['Buy']
data = techscapeScaled.drop(['Buy'],axis=1)
X_train, X_test, y_train, y_test = train_test_split(data,target, test_size=0.2, random_state=5, stratify=target)

# create model
log_model = LogisticRegression()
# fit the model to train data
log_model.fit(X_train, y_train)

LogisticRegression()

In [5]:
# calculate F1 score

y_pred = log_model.predict(X_test)

f1 = f1_score(y_test, y_pred)
f1

0.3627450980392157

In [6]:
test_data = pd.read_csv('test.csv')

# cleanup dataset
# transforming Date column into datetime type and then as float
test_data.Date = pd.to_datetime(test_data.Date).values.astype(float)

# transforming non-numerical variables into numerical,  so that they can be parsed into the model
enc = LabelEncoder()
# turning OS
enc.fit(test_data['OS'])
test_data['OSn'] = enc.transform(test_data['OS'])
# turning country
enc.fit(test_data['Country'])
test_data['CountryN'] = enc.transform(test_data['Country'])
# turning type of visitor
enc.fit(test_data['Type_of_Visitor'])
test_data['Type_of_VisitorN'] = enc.transform(test_data['Type_of_Visitor'])

#dropping non-numerical columns
test_data = test_data.drop(['OS', 'Country', 'Type_of_Visitor'], axis=1)

# set access_id as index
test_data = test_data.set_index('Access_ID')

# scale data
scaler = MinMaxScaler()
test_data_scaled = scaler.fit_transform(test_data)
test_data_scaled = pd.DataFrame(test_data_scaled)

test_data_scaled.columns = test_data.columns

test_data_scaled.head()

Unnamed: 0,Date,AccountMng_Pages,AccountMng_Duration,FAQ_Pages,FAQ_Duration,Product_Pages,Product_Duration,GoogleAnalytics_BounceRate,GoogleAnalytics_ExitRate,GoogleAnalytics_PageValue,Browser,Type_of_Traffic,OSn,CountryN,Type_of_VisitorN
0,0.613772,0.0,0.0,0.0,0.0,0.161049,0.288182,0.0695,0.327,0.0,0.083333,0.0,0.857143,0.0,1.0
1,0.341317,0.086957,0.013784,0.0,0.0,0.102996,0.076765,0.006,0.029,0.0,0.0,0.214286,0.0,0.75,1.0
2,0.823353,0.0,0.0,0.0,0.0,0.067416,0.042612,0.0,0.125,0.0,0.416667,0.142857,0.857143,0.75,1.0
3,0.308383,0.0,0.0,0.0,0.0,0.003745,0.037235,0.0,0.5,0.0,0.083333,0.071429,0.857143,0.75,0.0
4,0.832335,0.521739,0.082797,0.083333,0.0,0.241573,0.181962,0.007,0.0925,0.004004,0.25,0.642857,0.857143,0.625,1.0


In [7]:
y_pred = log_model.predict(test_data_scaled)
y_pred.mean()

0.07565217391304348

In [16]:
predictions = log_model.predict(test_data_scaled)
buy = np.around(predictions,0)
buy = buy.astype(int)

test_data['Buy'] = buy
test_data.reset_index(inplace=True)
test_data

Unnamed: 0,level_0,index,Access_ID,Date,AccountMng_Pages,AccountMng_Duration,FAQ_Pages,FAQ_Duration,Product_Pages,Product_Duration,GoogleAnalytics_BounceRate,GoogleAnalytics_ExitRate,GoogleAnalytics_PageValue,Browser,Type_of_Traffic,OSn,CountryN,Type_of_VisitorN,Buy
0,0,0,798519314,1.598227e+18,0,0.00,0,0.000,86,5332.5478,0.0139,0.0654,0.0000,2,1,6,0,2,0
1,1,1,798663101,1.590365e+18,2,37.50,0,0.000,55,1420.4725,0.0012,0.0058,0.0000,1,4,0,6,2,0
2,2,2,798663221,1.604275e+18,0,0.00,0,0.000,36,788.5000,0.0000,0.0250,0.0000,6,3,6,6,2,0
3,3,3,798760918,1.589414e+18,0,0.00,0,0.000,2,689.0000,0.0000,0.1000,0.0000,2,2,6,6,0,0
4,4,4,798856982,1.604534e+18,12,225.25,1,0.000,129,3367.0413,0.0014,0.0185,1.0353,4,10,6,5,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2295,2295,2295,962042726,1.606435e+18,10,720.50,2,88.725,220,10580.3448,0.0048,0.0221,0.0000,2,1,6,5,2,0
2296,2296,2296,962063064,1.584662e+18,0,0.00,0,0.000,3,55.0000,0.0667,0.1000,0.0000,2,3,3,6,2,0
2297,2297,2297,962147749,1.606003e+18,2,45.50,0,0.000,22,535.0000,0.0000,0.0250,9.6503,2,11,6,6,2,0
2298,2298,2298,962182167,1.605744e+18,8,166.70,0,0.000,9,171.7000,0.0167,0.0500,0.0000,2,3,6,5,2,0


In [17]:
submission = test_data.copy()
submission = submission[['Access_ID', 'Buy']]
submission
submission.to_csv('logsubmission1.csv', index=False)

Unnamed: 0,Access_ID,Buy
0,798519314,0
1,798663101,0
2,798663221,0
3,798760918,0
4,798856982,0
...,...,...
2295,962042726,0
2296,962063064,0
2297,962147749,0
2298,962182167,0


In [18]:
test_data.Buy.unique()

array([0, 1])