In [4]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import random
from sklearn.metrics import mean_squared_error

In [5]:
# Load the preprocessed train and test sets.

# Single place to point the notebook at the data; edit this to run elsewhere.
DATA_DIR = "/Users/ying/2018Fall/CS539/Project/CS539_ML-master-2"
# Read visitor ids as text so the long digit strings are not parsed as numbers.
ID_DTYPE = {'fullVisitorId': 'str'}

processed_train_df = pd.read_csv(DATA_DIR + "/processed_train_df.csv", dtype=ID_DTYPE)
processed_test_df = pd.read_csv(DATA_DIR + "/processed_test_df.csv", dtype=ID_DTYPE)

In [6]:
# Drop the index column.

# errors='ignore' keeps this cell re-runnable: once 'Unnamed: 0' has been
# removed, a second execution is a no-op instead of raising KeyError.
processed_train_df = processed_train_df.drop(columns='Unnamed: 0', errors='ignore')
processed_test_df = processed_test_df.drop(columns='Unnamed: 0', errors='ignore')

print("Train DataFrame Shape: " + str(processed_train_df.shape))
processed_train_df.head()

Train DataFrame Shape: (903653, 22)


Unnamed: 0,fullVisitorId,totals.transactionRevenue,num.totals.hits,num.totals.pageviews,num.visitNumber,cat.date,cat.visitStartTime,cat.totals.bounces,cat.totals.newVisits,cat.channelGrouping,...,cat.geoNetwork.country,cat.geoNetwork.metro,cat.geoNetwork.networkDomain,cat.geoNetwork.region,cat.geoNetwork.subContinent,cat.trafficSource.campaign,cat.trafficSource.keyword,cat.trafficSource.medium,cat.trafficSource.referralPath,cat.trafficSource.source
0,1131660440785968503,0.0,1,1.0,1,8,15,0,1,4,...,210,0,37454,193,21,4,11,5,3196,208
1,377306020877927890,0.0,1,1.0,1,8,5,0,1,4,...,12,122,10098,482,1,4,11,5,3196,208
2,3895546263509774583,0.0,1,1.0,1,8,1,0,1,4,...,185,0,38725,99,19,4,11,5,3196,208
3,4763447161404445595,0.0,1,1.0,1,8,5,0,1,4,...,94,122,38725,482,16,4,1607,5,3196,208
4,27294437909732085,0.0,1,1.0,2,8,13,0,0,4,...,217,122,38725,482,13,4,11,5,3196,208


## Utilities 

In [7]:
# K-fold cross validation over visitors: the unique visitor ids are shuffled
# and partitioned into `fold` groups (id_cv[i] holds the ids of fold i).

fold = 5

unique_visitorId = processed_train_df['fullVisitorId'].unique()
random.seed(123)  # fixed seed so the fold assignment is reproducible
random.shuffle(unique_visitorId)
no_cust = len(unique_visitorId)
print(no_cust)

# Derive the fold size from `fold` (previously a hard-coded 5) so the
# partition stays correct if the number of folds is changed.
fold_size = no_cust // fold
id_cv = []
for i in range(fold):
    start = i * fold_size
    # The last fold also absorbs the remainder of the integer division.
    stop = (i + 1) * fold_size if i < fold - 1 else no_cust
    cur_cv = unique_visitorId[start:stop]
    id_cv.append(cur_cv)

715119


In [8]:
# Calculate the MSE (RMSE is its square root) between the natural log of the actual and the predicted total revenue per customer.

def _visitor_level_mse(frame, log_pred):
    """Return the MSE between actual and predicted log1p total revenue per visitor.

    Parameters
    ----------
    frame : DataFrame with 'fullVisitorId' and 'totals.transactionRevenue'
        columns, one row per prediction in `log_pred`.
    log_pred : sequence of per-row predictions on the log1p(revenue) scale.
    """
    per_row = pd.DataFrame({
        'fullVisitorId': list(frame['fullVisitorId']),
        # expm1 is the exact inverse of log1p and avoids the cancellation
        # error of exp(x) - 1 for predictions close to zero.
        'PredictedRevenue': np.expm1(np.asarray(log_pred, dtype=float)),
        'ActualRevenue': list(frame['totals.transactionRevenue']),
    })
    # Aggregate both columns in one groupby so that predicted and actual
    # totals are paired by visitor id rather than by implicit sort order.
    per_visitor = per_row.groupby('fullVisitorId')[['PredictedRevenue', 'ActualRevenue']].sum()
    log_totals = np.log1p(per_visitor)
    return mean_squared_error(log_totals['ActualRevenue'], log_totals['PredictedRevenue'])


def getMse(x_tr, train, val, log_y_tr_pred, log_y_val_pred):
    """Compute visitor-level MSE for the train and validation predictions.

    Row-level predictions (log1p scale) are converted back to revenue, summed
    per 'fullVisitorId', log1p-transformed again and compared with the
    equally aggregated 'totals.transactionRevenue'.

    Parameters
    ----------
    x_tr : unused; kept so existing callers keep working.
    train, val : DataFrames holding 'fullVisitorId' and
        'totals.transactionRevenue' for the rows that were predicted.
    log_y_tr_pred, log_y_val_pred : per-row predictions for `train` / `val`
        on the log1p(revenue) scale.

    Returns
    -------
    (mse_tr, mse_val) : train and validation mean squared errors.

    Prints the validation MSE and RMSE as a side effect.
    """
    mse_tr = _visitor_level_mse(train, log_y_tr_pred)
    mse_val = _visitor_level_mse(val, log_y_val_pred)

    print('val_mse', mse_val)
    print('val_rmse', np.sqrt(mse_val))
    return mse_tr, mse_val


# predict testset and save the results as a file

def testset_pred(model, filename):
    """Fit `model` on the full training frame, predict the test frame and save a submission.

    Parameters
    ----------
    model : estimator exposing fit(X, y) and predict(X).
    filename : output path without extension; '.csv' is appended.

    Reads the module-level `processed_train_df` / `processed_test_df`, prints
    the shape of the submission and writes it with the columns
    'fullVisitorId' and 'PredictedLogRevenue' (log1p of the summed predicted
    revenue per visitor).
    """
    id_col = 'fullVisitorId'
    target_col = 'totals.transactionRevenue'
    # Select features by name, not by position: 'clf_label' is appended to
    # processed_train_df elsewhere in the notebook and must neither leak into
    # the model nor break the train/test column alignment.
    feature_cols = [c for c in processed_train_df.columns
                    if c not in (id_col, target_col, 'clf_label')]

    x_tr = processed_train_df[feature_cols]
    log_y_tr = np.log1p(processed_train_df[target_col])
    model.fit(x_tr, log_y_tr)

    # Assumes the test frame carries the same feature column names as the
    # train frame; a missing column raises KeyError instead of mis-aligning.
    x_te = processed_test_df[feature_cols]
    te_id = processed_test_df[id_col]

    # log1p(revenue) cannot be negative, so clamp negative predictions to 0.
    y_te_pred = np.maximum(model.predict(x_te), 0)
    revenue = np.expm1(y_te_pred)

    # Sum the predicted revenue per visitor, then return to the log1p scale.
    per_row = pd.DataFrame({'fullVisitorId': te_id, 'PredictedRevenue': revenue})
    summed = per_row.groupby('fullVisitorId')['PredictedRevenue'].sum()
    submit = pd.DataFrame({
        'fullVisitorId': summed.index,
        'PredictedLogRevenue': np.log1p(summed.values),
    })

    # Write submit file
    print(submit.shape)
    submit.to_csv(filename+'.csv', index=False)

## Baseline1 -- Linear Regression 

In [9]:
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression

# 5-fold, visitor-level cross validation of a plain linear regression on
# log1p(revenue).

train_mse = []
train_rmse = []
val_mse = []
val_rmse = []

# Select features by name rather than with a positional `iloc[:, 2:]`: the
# 'clf_label' column is appended to processed_train_df in place further down
# the notebook and would otherwise be fed to the model when this cell is re-run.
feature_list = [k for k in list(processed_train_df) if k not in ['fullVisitorId', 'totals.transactionRevenue', 'clf_label']]

for i in range(fold):
    print('\n\nfold:', i)
    in_val = processed_train_df['fullVisitorId'].isin(id_cv[i])
    val = processed_train_df[in_val]
    train = processed_train_df[~in_val]
    x_tr = train[feature_list]
    log_y_tr = np.log1p(train['totals.transactionRevenue'])
    x_val = val[feature_list]

    # --- INSERT YOUR MODEL -----
    model = LinearRegression().fit(x_tr, log_y_tr)
    # ---------------------------

    # log1p(revenue) cannot be negative, so clamp negative predictions to 0.
    log_y_tr_pred = np.maximum(model.predict(x_tr), 0)
    log_y_val_pred = np.maximum(model.predict(x_val), 0)

    mse_tr, mse_val = getMse(x_tr, train, val, log_y_tr_pred, log_y_val_pred)
    train_mse.append(mse_tr)
    train_rmse.append(np.sqrt(mse_tr))
    val_mse.append(mse_val)
    val_rmse.append(np.sqrt(mse_val))


print('\n\nAverage:')
print('train_mse_5fold', np.mean(train_mse))
print('train_rmse_5fold', np.mean(train_rmse))
print('val_mse_5fold', np.mean(val_mse))
print('val_rmse_5fold', np.mean(val_rmse))



fold: 0
val_mse 3.3618812552441875
val_rmse 1.8335433606119567


fold: 1
val_mse 3.375392865948387
val_rmse 1.837224228543807


fold: 2
val_mse 3.472337652773637
val_rmse 1.8634209542595674


fold: 3
val_mse 3.282168156019326
val_rmse 1.8116755106859854


fold: 4
val_mse 3.3503185469529138
val_rmse 1.8303875400998866


Average:
train_mse_5fold 3.367800814939482
train_rmse_5fold 1.8351522014577792
val_mse_5fold 3.36841969538769
val_rmse_5fold 1.8352503188402405


## Baseline2 -- Polynomial Regression

In [11]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures

# 5-fold, visitor-level cross validation of a degree-2 polynomial regression
# on log1p(revenue).

train_mse = []
train_rmse = []
val_mse = []
val_rmse = []

# Select features by name rather than with a positional `iloc[:, 2:]`: the
# 'clf_label' column is appended to processed_train_df in place further down
# the notebook and would otherwise be fed to the model when this cell is re-run.
feature_list = [k for k in list(processed_train_df) if k not in ['fullVisitorId', 'totals.transactionRevenue', 'clf_label']]

for i in range(fold):
    print('\n\nfold:', i)
    in_val = processed_train_df['fullVisitorId'].isin(id_cv[i])
    val = processed_train_df[in_val]
    train = processed_train_df[~in_val]
    x_tr = train[feature_list]
    log_y_tr = np.log1p(train['totals.transactionRevenue'])
    x_val = val[feature_list]

    # --- INSERT YOUR MODEL -----
    # PolynomialFeatures emits a constant bias column by default, so the
    # linear step is fitted without its own intercept.
    model_pipeline = Pipeline([('poly',PolynomialFeatures(degree=2)),
                  ('linear', LinearRegression(fit_intercept=False))])
    model = model_pipeline.fit(x_tr, log_y_tr)
    # ---------------------------

    # log1p(revenue) cannot be negative, so clamp negative predictions to 0.
    log_y_tr_pred = np.maximum(model.predict(x_tr), 0)
    log_y_val_pred = np.maximum(model.predict(x_val), 0)

    mse_tr, mse_val = getMse(x_tr, train, val, log_y_tr_pred, log_y_val_pred)
    train_mse.append(mse_tr)
    train_rmse.append(np.sqrt(mse_tr))
    val_mse.append(mse_val)
    val_rmse.append(np.sqrt(mse_val))


print('\n\nAverage:')
print('train_mse_5fold', np.mean(train_mse))
print('train_rmse_5fold', np.mean(train_rmse))
print('val_mse_5fold', np.mean(val_mse))
print('val_rmse_5fold', np.mean(val_rmse))



fold: 0
val_mse 2.961645111918738
val_rmse 1.7209430879371748


fold: 1
val_mse 2.9533518148788023
val_rmse 1.7185318777604337


fold: 2
val_mse 3.046064816613147
val_rmse 1.7452979162919857


fold: 3
val_mse 2.883753615400953
val_rmse 1.6981618342787452


fold: 4
val_mse 2.941866376119926
val_rmse 1.715186979929572


Average:
train_mse_5fold 2.9507632457828876
train_rmse_5fold 1.7177734882747795
val_mse_5fold 2.957336346986313
val_rmse_5fold 1.7196243392395825


## Regression Tree

In [12]:
from sklearn.tree import DecisionTreeRegressor
import matplotlib.pyplot as plt

# 5-fold, visitor-level cross validation of a depth-limited regression tree
# on log1p(revenue).

train_mse = []
train_rmse = []
val_mse = []
val_rmse = []

# Select features by name rather than with a positional `iloc[:, 2:]`: the
# 'clf_label' column is appended to processed_train_df in place further down
# the notebook and would otherwise be fed to the model when this cell is re-run.
feature_list = [k for k in list(processed_train_df) if k not in ['fullVisitorId', 'totals.transactionRevenue', 'clf_label']]

for i in range(fold):
    print('\n\nfold:', i)
    in_val = processed_train_df['fullVisitorId'].isin(id_cv[i])
    val = processed_train_df[in_val]
    train = processed_train_df[~in_val]
    x_tr = train[feature_list]
    log_y_tr = np.log1p(train['totals.transactionRevenue'])
    x_val = val[feature_list]

    # --- INSERT YOUR MODEL -----
    # random_state pins the tie-breaking between equally good splits so the
    # fitted tree (and the reported scores) are reproducible.
    model = DecisionTreeRegressor(max_depth=10, random_state=123)
    model.fit(x_tr, log_y_tr)
    # ---------------------------

    # log1p(revenue) cannot be negative, so clamp negative predictions to 0.
    log_y_tr_pred = np.maximum(model.predict(x_tr), 0)
    log_y_val_pred = np.maximum(model.predict(x_val), 0)

    mse_tr, mse_val = getMse(x_tr, train, val, log_y_tr_pred, log_y_val_pred)
    train_mse.append(mse_tr)
    train_rmse.append(np.sqrt(mse_tr))
    val_mse.append(mse_val)
    val_rmse.append(np.sqrt(mse_val))


print('\n\nAverage:')
print('train_mse_5fold', np.mean(train_mse))
print('train_rmse_5fold', np.mean(train_rmse))
print('val_mse_5fold', np.mean(val_mse))
print('val_rmse_5fold', np.mean(val_rmse))



fold: 0
val_mse 2.8428839174078973
val_rmse 1.6860853825971855


fold: 1
val_mse 2.9116198418790518
val_rmse 1.7063469289329916


fold: 2
val_mse 2.9485667324031497
val_rmse 1.717139112711358


fold: 3
val_mse 2.8259075295792253
val_rmse 1.6810435834859325


fold: 4
val_mse 2.840854597541982
val_rmse 1.6854834907355165


Average:
train_mse_5fold 2.503370674153782
train_rmse_5fold 1.5822033801046382
val_mse_5fold 2.8739665237622614
val_rmse_5fold 1.6952196996925966


## Preclassed Regression

In [13]:
# Add classification labels: rows with nonzero revenue get 1, rows with zero revenue get 0.
processed_train_df['clf_label'] = (processed_train_df['totals.transactionRevenue'] != 0.0).astype(int)
processed_train_df.head()

Unnamed: 0,fullVisitorId,totals.transactionRevenue,num.totals.hits,num.totals.pageviews,num.visitNumber,cat.date,cat.visitStartTime,cat.totals.bounces,cat.totals.newVisits,cat.channelGrouping,...,cat.geoNetwork.metro,cat.geoNetwork.networkDomain,cat.geoNetwork.region,cat.geoNetwork.subContinent,cat.trafficSource.campaign,cat.trafficSource.keyword,cat.trafficSource.medium,cat.trafficSource.referralPath,cat.trafficSource.source,clf_label
0,1131660440785968503,0.0,1,1.0,1,8,15,0,1,4,...,0,37454,193,21,4,11,5,3196,208,0
1,377306020877927890,0.0,1,1.0,1,8,5,0,1,4,...,122,10098,482,1,4,11,5,3196,208,0
2,3895546263509774583,0.0,1,1.0,1,8,1,0,1,4,...,0,38725,99,19,4,11,5,3196,208,0
3,4763447161404445595,0.0,1,1.0,1,8,5,0,1,4,...,122,38725,482,16,4,1607,5,3196,208,0
4,27294437909732085,0.0,1,1.0,2,8,13,0,0,4,...,122,38725,482,13,4,11,5,3196,208,0


In [14]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
from sklearn.tree import DecisionTreeRegressor

# 5-fold, visitor-level cross validation of a two-stage model: a classifier
# decides which sessions have nonzero revenue, and a regressor predicts
# log1p(revenue) only for the sessions the classifier flags; all other
# sessions are predicted as 0.

train_mse = []
train_rmse = []
val_mse = []
val_rmse = []
feature_list = [k for k in list(processed_train_df) if k not in ['fullVisitorId', 'totals.transactionRevenue', 'clf_label']]

# Seeded generator so the random undersampling (and therefore every score
# reported below) is reproducible from run to run.
rng = np.random.RandomState(123)

for i in range(fold):
    print('\n\nfold:', i)
    in_val = processed_train_df['fullVisitorId'].isin(id_cv[i])
    val = processed_train_df[in_val]
    train = processed_train_df[~in_val]

    x_val = val[feature_list]

    # Balance the classifier's training set: keep every nonzero-revenue row
    # and draw an equally sized random sample of the zero-revenue rows.
    nonzero_sample = train[train['totals.transactionRevenue'] != 0.0]
    zero_indices = train[train['totals.transactionRevenue'] == 0.0].index
    n_zero = min(nonzero_sample.shape[0], len(zero_indices))
    random_indices = rng.choice(zero_indices, n_zero, replace=False)
    zero_sample = train.loc[random_indices]
    undersampled_train_df = pd.concat([nonzero_sample, zero_sample])

    x_tr = undersampled_train_df[feature_list]
    y_clf_tr = undersampled_train_df['clf_label']

    # ----- Insert Classification Model Here-----
    clf_model = DecisionTreeClassifier(max_depth=8, random_state=123)
#     clf_model = RandomForestClassifier(n_estimators=150, max_depth=15)
#     clf_model = LogisticRegression(class_weight="balanced", solver='liblinear')
    # -------------------------------------------

    clf_model.fit(x_tr, y_clf_tr)
    y_clf_tr_pred = clf_model.predict(x_tr)
    y_clf_val_pred = clf_model.predict(x_val)

    # Positional indices of the rows the classifier flags as nonzero revenue.
    nonzero_index_tr = np.flatnonzero(y_clf_tr_pred != 0)
    nonzero_index_val = np.flatnonzero(y_clf_val_pred != 0)

    x_tr1 = train[feature_list]
    log_y_tr1 = np.log1p(train['totals.transactionRevenue'])

    # ----- Insert Regression Model Here-----
    # NOTE(review): the regressor is fitted on every training row (zeros
    # included), not only on rows flagged by the classifier - confirm intent.
    model = DecisionTreeRegressor(max_depth=8, random_state=123).fit(x_tr1, log_y_tr1)
#     model_pipeline = Pipeline([('poly',PolynomialFeatures(degree=2)),
#                   ('linear', LinearRegression(fit_intercept=False))])
#     model = model_pipeline.fit(x_tr1, log_y_tr1)
#     model = LinearRegression().fit(x_tr1, log_y_tr1)
    # ---------------------------------------

    # Rows the classifier rejects keep a prediction of 0; flagged rows get the
    # regressor's output, clamped at 0 because log1p(revenue) is never negative.
    # The length guards avoid calling predict() on an empty selection.
    tr_pred = np.zeros(len(x_tr))
    if len(nonzero_index_tr):
        tr_pred[nonzero_index_tr] = np.maximum(model.predict(x_tr.iloc[nonzero_index_tr]), 0)

    val_pred = np.zeros(len(x_val))
    if len(nonzero_index_val):
        val_pred[nonzero_index_val] = np.maximum(model.predict(x_val.iloc[nonzero_index_val]), 0)

    mse_tr, mse_val = getMse(x_tr, undersampled_train_df, val, tr_pred, val_pred)
    train_mse.append(mse_tr)
    train_rmse.append(np.sqrt(mse_tr))
    val_mse.append(mse_val)
    val_rmse.append(np.sqrt(mse_val))


print('\n\nAverage:')
# Train scores here are measured on the undersampled frame (not on the full
# training fold); only the validation scores are printed.
print('val_mse_5fold', np.mean(val_mse))
print('val_rmse_5fold', np.mean(val_rmse))



fold: 0
val_mse 2.80878747043265
val_rmse 1.6759437551518994


fold: 1
val_mse 2.8004942198934404
val_rmse 1.6734677229912265


fold: 2
val_mse 2.8680186697277277
val_rmse 1.6935225625092


fold: 3
val_mse 2.7524058046671547
val_rmse 1.659037614000103


fold: 4
val_mse 2.785543008411191
val_rmse 1.668994610060557


Average:
val_mse_5fold 2.8030498346264325
val_rmse_5fold 1.674193252942597
