In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
%cd '/content/drive/My Drive/UF/Intro to Data Science/Project/Modelling'

/content/drive/My Drive/UF/Intro to Data Science/Project/Modelling


In [3]:
%ls

11152019_test_final.csv  11152019_train_final.csv  linear_clf.pickle
11152019_test_final.zip  11152019_train_final.zip  modelling_corr.ipynb


In [0]:
import numpy as np
import pandas as pd
import sklearn.svm as svm
from sklearn import preprocessing
import pickle

In [5]:
train = pd.read_csv("11152019_train_final.csv")
test = pd.read_csv("11152019_test_final.csv")
train.shape, test.shape

((1537503, 42), (361429, 42))

In [6]:
train.columns

Index(['channelGrouping', 'device.browser', 'device.deviceCategory',
       'device.isMobile', 'device.operatingSystem', 'fullVisitorId',
       'geoNetwork.city', 'geoNetwork.continent', 'geoNetwork.country',
       'geoNetwork.networkDomain', 'geoNetwork.region',
       'geoNetwork.subContinent', 'totals.bounces', 'totals.hits',
       'totals.newVisits', 'totals.pageviews', 'totals.sessionQualityDim',
       'totals.timeOnSite', 'totals.totalTransactionRevenue',
       'totals.transactionRevenue', 'totals.transactions',
       'trafficSource.adContent',
       'trafficSource.adwordsClickInfo.adNetworkType',
       'trafficSource.adwordsClickInfo.gclId',
       'trafficSource.adwordsClickInfo.page',
       'trafficSource.adwordsClickInfo.slot', 'trafficSource.campaign',
       'trafficSource.isTrueDirect', 'trafficSource.keyword',
       'trafficSource.medium', 'trafficSource.referralPath',
       'trafficSource.source', 'visitId', 'visitNumber', 'visitStartTime',
       'new.returni

In [7]:
train.isna().sum(axis=0)

channelGrouping                                 0
device.browser                                  0
device.deviceCategory                           0
device.isMobile                                 0
device.operatingSystem                          0
fullVisitorId                                   0
geoNetwork.city                                 0
geoNetwork.continent                            0
geoNetwork.country                              0
geoNetwork.networkDomain                        0
geoNetwork.region                               0
geoNetwork.subContinent                         0
totals.bounces                                  0
totals.hits                                     0
totals.newVisits                                0
totals.pageviews                                0
totals.sessionQualityDim                        0
totals.timeOnSite                               0
totals.totalTransactionRevenue                  0
totals.transactionRevenue                       0


In [8]:
y = train['new.returningCustomer']
X = train.drop(columns=['new.returningCustomer'])
keep_cols = ['totals.bounces',
             'totals.hits',
             'totals.newVisits',
             'totals.pageviews',
             'totals.sessionQualityDim',
             'totals.timeOnSite',
             'trafficSource.isTrueDirect',
             'visitNumber',
             'country_dem',
             'latitude',
             'longitude']

X = X[keep_cols]

X.columns

Index(['totals.bounces', 'totals.hits', 'totals.newVisits', 'totals.pageviews',
       'totals.sessionQualityDim', 'totals.timeOnSite',
       'trafficSource.isTrueDirect', 'visitNumber', 'country_dem', 'latitude',
       'longitude'],
      dtype='object')

In [9]:
y.name

'new.returningCustomer'

In [0]:
def preprocess(df):

  y = df['new.returningCustomer']
  X = df.drop(columns=['new.returningCustomer'])
  keep_cols = ['totals.bounces',
              'totals.hits',
              'totals.newVisits',
              'totals.pageviews',
              'totals.sessionQualityDim',
              'totals.timeOnSite',
              'trafficSource.isTrueDirect',
              'visitNumber',
              'country_dem',
              'latitude',
              'longitude']

  X = X[keep_cols]

  return X,y

def encode(X):

  lb = preprocessing.LabelBinarizer()
  X['trafficSource.isTrueDirect'] = lb.fit_transform(X['trafficSource.isTrueDirect'])

  # Not in keep_cols
  # X['device.isMobile'] = lb.transform(X['device.isMobile'])

  # Not in keep_cols
  # X['visitStartTime'].dtype == np.int64

  ignoreColumns = {''}
  for column in X.columns:
      if (X[column].dtype != np.int64 and X[column].dtype != np.float64):
          print(f"Converting column: {column}")

          le = preprocessing.LabelEncoder()
          X[column] = le.fit_transform(X[column])
          del le

  return X

def scale(X):
  scaler = preprocessing.StandardScaler()
  X[X.columns] = scaler.fit_transform(X[X.columns])
  return X

In [0]:
X, y = preprocess(train)
X = encode(X)
X = scale(X)

In [12]:
X.shape, y.shape

((1537503, 11), (1537503,))

In [13]:
X.dtypes

totals.bounces                float64
totals.hits                   float64
totals.newVisits              float64
totals.pageviews              float64
totals.sessionQualityDim      float64
totals.timeOnSite             float64
trafficSource.isTrueDirect    float64
visitNumber                   float64
country_dem                   float64
latitude                      float64
longitude                     float64
dtype: object

In [14]:
linear_svc = svm.LinearSVC(verbose=1)
linear_svc

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=1)

In [15]:
linear_svc.fit(X, y)

[LibLinear]



LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=1)

In [16]:
linear_svc.score(X, y)

0.888315665075125

In [17]:
X_test, y_test = preprocess(test)
X_test.head()

Unnamed: 0,totals.bounces,totals.hits,totals.newVisits,totals.pageviews,totals.sessionQualityDim,totals.timeOnSite,trafficSource.isTrueDirect,visitNumber,country_dem,latitude,longitude
0,-1.0,4,0.0,3.0,1,973.0,True,2,26043,20.593684,78.96288
1,-1.0,4,0.0,3.0,1,49.0,True,166,162791,37.09024,-95.712891
2,-1.0,4,0.0,3.0,1,24.0,True,2,162791,37.09024,-95.712891
3,-1.0,5,0.0,4.0,1,25.0,True,4,162791,37.09024,-95.712891
4,-1.0,5,1.0,4.0,1,49.0,False,1,162791,37.09024,-95.712891


In [18]:
X_test = encode(X_test)
X_test.dtypes

totals.bounces                float64
totals.hits                     int64
totals.newVisits              float64
totals.pageviews              float64
totals.sessionQualityDim        int64
totals.timeOnSite             float64
trafficSource.isTrueDirect      int64
visitNumber                     int64
country_dem                     int64
latitude                      float64
longitude                     float64
dtype: object

In [19]:
X_test = scale(X_test)
X_test.describe()

Unnamed: 0,totals.bounces,totals.hits,totals.newVisits,totals.pageviews,totals.sessionQualityDim,totals.timeOnSite,trafficSource.isTrueDirect,visitNumber,country_dem,latitude,longitude
count,361429.0,361429.0,361429.0,361429.0,361429.0,361429.0,361429.0,361429.0,361429.0,361429.0,361429.0
mean,8.440292e-16,-3.907622e-15,-2.566713e-15,-8.597852e-15,-7.528747e-15,2.846723e-15,-1.193545e-14,-1.636408e-15,3.051309e-14,-9.451451e-15,1.187592e-14
std,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001
min,-0.9139619,-0.4559527,-1.573631,-0.495416,-0.2977721,-0.3847782,-0.7659207,-0.1385967,-1.013661,-3.998602,-1.74732
25%,-0.9139619,-0.4559527,-1.573631,-0.495416,-0.2977721,-0.3847782,-0.7659207,-0.1385967,-0.9488859,-0.3342659,-0.8531999
50%,-0.9139619,-0.3481594,0.6354729,-0.3414586,-0.2977721,-0.3586454,-0.7659207,-0.1385967,-0.6753372,0.2214188,-0.6045244
75%,1.094137,0.08301387,0.6354729,0.1204136,-0.2977721,-0.1077706,1.305618,-0.04550683,1.101223,0.2214188,0.6668346
max,1.094137,53.33291,0.6354729,76.32933,7.250969,41.75174,1.305618,48.45431,1.101223,2.094501,2.45235


In [20]:
linear_svc.score(X_test, y_test)

0.8637713077810579

In [0]:
with open("linear_clf.pickle", "wb") as file_handle:
  pickle.dump(linear_svc, file_handle, protocol=pickle.HIGHEST_PROTOCOL)

In [0]:
!ls

11152019_test_final.csv  11152019_train_final.csv  linear_clf.pickle
11152019_test_final.zip  11152019_train_final.zip  modelling_corr.ipynb


In [0]:
with open("linear_clf.pickle", "rb") as file_handle:
  clf = pickle.load(file_handle)

clf == linear_svc

False