In [1]:
import numpy as np 
import pandas as pd
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder



In [2]:
people = pd.read_csv("people.csv", dtype={'people_id': np.str, 'activity_id': np.str, 'char_38': np.int32}, parse_dates=['date'])
people[:2]

Unnamed: 0,people_id,char_1,group_1,char_2,date,char_3,char_4,char_5,char_6,char_7,...,char_29,char_30,char_31,char_32,char_33,char_34,char_35,char_36,char_37,char_38
0,ppl_100,type 2,group 17304,type 2,2021-06-29,type 5,type 5,type 5,type 3,type 11,...,False,True,True,False,False,True,True,True,False,36
1,ppl_100002,type 2,group 8688,type 3,2021-01-06,type 28,type 9,type 5,type 3,type 11,...,False,True,True,True,True,True,True,True,False,76


In [3]:
train = pd.read_csv("act_train.csv",dtype={'people_id': np.str, 'activity_id': np.str, 'outcome': np.int8}, parse_dates=['date'])
train[:2]

Unnamed: 0,people_id,activity_id,date,activity_category,char_1,char_2,char_3,char_4,char_5,char_6,char_7,char_8,char_9,char_10,outcome
0,ppl_100,act2_1734928,2023-08-26,type 4,,,,,,,,,,type 76,0
1,ppl_100,act2_2434093,2022-09-27,type 2,,,,,,,,,,type 1,0


In [4]:
test = pd.read_csv("act_test.csv",dtype={'people_id': np.str, 'activity_id': np.str}, parse_dates=['date'])
test[:2]

Unnamed: 0,people_id,activity_id,date,activity_category,char_1,char_2,char_3,char_4,char_5,char_6,char_7,char_8,char_9,char_10
0,ppl_100004,act1_249281,2022-07-20,type 1,type 5,type 10,type 5,type 1,type 6,type 1,type 1,type 7,type 4,
1,ppl_100004,act2_230855,2022-07-20,type 5,,,,,,,,,,type 682


In [5]:
print("Train data shape: " + format(train.shape))
print("Test data shape: " + format(test.shape))
print("People data shape: " + format(people.shape))

Train data shape: (2197291, 15)
Test data shape: (498687, 14)
People data shape: (189118, 41)


In [6]:
# people list the people number and their characteristics
# act_train, act_test set list the train and test set respectively
# we need to join the people with the train and test set so that
# their characteristics are captured as features.

In [7]:
X_train = train.merge(people, on='people_id', how='left', left_index=True)
X_test  = test.merge(people, on='people_id', how='left', left_index=True)

In [8]:
X_train[:2]

Unnamed: 0,people_id,activity_id,date_x,activity_category,char_1_x,char_2_x,char_3_x,char_4_x,char_5_x,char_6_x,...,char_29,char_30,char_31,char_32,char_33,char_34,char_35,char_36,char_37,char_38
0,ppl_100,act2_1734928,2023-08-26,type 4,,,,,,,...,False,True,True,False,False,True,True,True,False,36
0,ppl_100,act2_2434093,2022-09-27,type 2,,,,,,,...,False,True,True,False,False,True,True,True,False,36


In [9]:
list(X_train)

['people_id',
 'activity_id',
 'date_x',
 'activity_category',
 'char_1_x',
 'char_2_x',
 'char_3_x',
 'char_4_x',
 'char_5_x',
 'char_6_x',
 'char_7_x',
 'char_8_x',
 'char_9_x',
 'char_10_x',
 'outcome',
 'char_1_y',
 'group_1',
 'char_2_y',
 'date_y',
 'char_3_y',
 'char_4_y',
 'char_5_y',
 'char_6_y',
 'char_7_y',
 'char_8_y',
 'char_9_y',
 'char_10_y',
 'char_11',
 'char_12',
 'char_13',
 'char_14',
 'char_15',
 'char_16',
 'char_17',
 'char_18',
 'char_19',
 'char_20',
 'char_21',
 'char_22',
 'char_23',
 'char_24',
 'char_25',
 'char_26',
 'char_27',
 'char_28',
 'char_29',
 'char_30',
 'char_31',
 'char_32',
 'char_33',
 'char_34',
 'char_35',
 'char_36',
 'char_37',
 'char_38']

In [10]:
X_train.shape

(2197291, 55)

In [11]:
# outcome is the y value of the training set

In [12]:
# char_10_y, char_11 => char_37 are boolean
# char_38 are numerical => no need to do anything
# char_1_y => char_9_y are categorical => do a one-hot encoding to get sparse matrix
# char_1_x => char_10_x are categorical => do a one-hot encoding to get sparse matrix
# group_1 is categorical
# date_x is the activity date, date_y is possibly the user registration date.
# activity_categorical is categorical
# people_id and activity_id will not contribute to the decision => can be used as index

In [13]:
used_separately = ['people_id', 'activity_id', 'date_x', 'date_y', 'outcome', 'char_38']
categorical = ['char_1_y', 'char_2_y', 'char_3_y', 'char_4_y', 'char_5_y',
               'char_6_y', 'char_7_y', 'char_8_y', 'char_9_y',
               'char_1_x', 'char_2_x', 'char_3_x', 'char_4_x', 'char_5_x',
               'char_6_x', 'char_7_x', 'char_8_x', 'char_9_x', 'char_10_x',
               'group_1', 'activity_category']
not_categorical = [x for x in X_train.columns if x not in categorical \
                   and x not in used_separately]

In [14]:
def preprocessing(dataset):
    local_data = dataset
    
    for col in list(local_data.columns):
        # fill the categorical NA with type 0,
        # for each categorical col, we only take the number, and convert to int32
        if col in categorical:
            local_data[col].fillna('type 0', inplace=True)
            local_data[col] = local_data[col].apply(
                lambda x: x.split(' ')[1]).astype(np.int32)

        if col in not_categorical:
            # boolean values are converted to int aswell
            local_data[col] = local_data[col].astype(np.int8)
        # this doesnot deal with date time data
    
    # date_x is the activity date, date_y is possibly the reg date.
    local_data['year_x'] = local_data['date_x'].dt.year
    local_data['month_x'] = local_data['date_x'].dt.month
    local_data['day_x'] = local_data['date_x'].dt.day
    local_data['isweekend_x'] = (local_data['date_x'].dt.weekday >= 5).astype(int)
    # drop date_x after splitting its features
    local_data = local_data.drop('date_x', axis = 1)
    # we may not need date_y as its only reg date
    local_data = local_data.drop('date_y', axis = 1)
    return local_data

In [15]:
X_train = preprocessing(X_train)
X_train[:5]

Unnamed: 0,people_id,activity_id,activity_category,char_1_x,char_2_x,char_3_x,char_4_x,char_5_x,char_6_x,char_7_x,...,char_33,char_34,char_35,char_36,char_37,char_38,year_x,month_x,day_x,isweekend_x
0,ppl_100,act2_1734928,4,0,0,0,0,0,0,0,...,0,1,1,1,0,36,2023,8,26,1
0,ppl_100,act2_2434093,2,0,0,0,0,0,0,0,...,0,1,1,1,0,36,2022,9,27,0
0,ppl_100,act2_3404049,2,0,0,0,0,0,0,0,...,0,1,1,1,0,36,2022,9,27,0
0,ppl_100,act2_3651215,2,0,0,0,0,0,0,0,...,0,1,1,1,0,36,2023,8,4,0
0,ppl_100,act2_4109017,2,0,0,0,0,0,0,0,...,0,1,1,1,0,36,2023,8,26,1


In [16]:
X_train=X_train.sort_values(['people_id'], ascending=[1])
X_train[:5]

Unnamed: 0,people_id,activity_id,activity_category,char_1_x,char_2_x,char_3_x,char_4_x,char_5_x,char_6_x,char_7_x,...,char_33,char_34,char_35,char_36,char_37,char_38,year_x,month_x,day_x,isweekend_x
0,ppl_100,act2_1734928,4,0,0,0,0,0,0,0,...,0,1,1,1,0,36,2023,8,26,1
0,ppl_100,act2_2434093,2,0,0,0,0,0,0,0,...,0,1,1,1,0,36,2022,9,27,0
0,ppl_100,act2_3404049,2,0,0,0,0,0,0,0,...,0,1,1,1,0,36,2022,9,27,0
0,ppl_100,act2_3651215,2,0,0,0,0,0,0,0,...,0,1,1,1,0,36,2023,8,4,0
0,ppl_100,act2_4109017,2,0,0,0,0,0,0,0,...,0,1,1,1,0,36,2023,8,26,1


In [17]:
X_test = preprocessing(X_test)
X_test[:5]

Unnamed: 0,people_id,activity_id,activity_category,char_1_x,char_2_x,char_3_x,char_4_x,char_5_x,char_6_x,char_7_x,...,char_33,char_34,char_35,char_36,char_37,char_38,year_x,month_x,day_x,isweekend_x
3,ppl_100004,act1_249281,1,5,10,5,1,6,1,1,...,1,1,1,1,1,76,2022,7,20,0
3,ppl_100004,act2_230855,5,0,0,0,0,0,0,0,...,1,1,1,1,1,76,2022,7,20,0
5,ppl_10001,act1_240724,1,12,1,5,4,6,1,1,...,1,1,1,1,1,90,2022,10,14,0
5,ppl_10001,act1_83552,1,20,10,5,4,6,1,1,...,1,1,1,1,1,90,2022,11,27,1
5,ppl_10001,act2_1043301,5,0,0,0,0,0,0,0,...,1,1,1,1,1,90,2022,10,15,1


In [18]:
X_test=X_test.sort_values(['people_id'], ascending=[1])
X_test[:5]

Unnamed: 0,people_id,activity_id,activity_category,char_1_x,char_2_x,char_3_x,char_4_x,char_5_x,char_6_x,char_7_x,...,char_33,char_34,char_35,char_36,char_37,char_38,year_x,month_x,day_x,isweekend_x
3,ppl_100004,act1_249281,1,5,10,5,1,6,1,1,...,1,1,1,1,1,76,2022,7,20,0
3,ppl_100004,act2_230855,5,0,0,0,0,0,0,0,...,1,1,1,1,1,76,2022,7,20,0
5,ppl_10001,act2_688604,5,0,0,0,0,0,0,0,...,1,1,1,1,1,90,2022,11,28,0
5,ppl_10001,act2_659237,5,0,0,0,0,0,0,0,...,1,1,1,1,1,90,2022,10,16,1
5,ppl_10001,act2_649143,5,0,0,0,0,0,0,0,...,1,1,1,1,1,90,2022,11,27,1


In [19]:
y_train = X_train.outcome
X_train=X_train.drop('outcome',axis=1)

In [20]:
y_train[:5]

0    0
0    0
0    0
0    0
0    0
Name: outcome, dtype: int8

In [21]:
# now we need to take care of categorial data using one-hot-encoding.
# but the number of dimension resulted using one-hot-ending will equal the max number 
# in the categorical data.

In [22]:
# thus we need to reduce the number of dimension by checking only those numbers that
# appear in either train or test set.

# the following is a cheat because we're not supposed to use the data from the 
# test set for the preprocessing of the training set

# join the two set so the one hot encoding will have all the values for each categorical
# column
All = pd.concat([X_train, X_test], ignore_index=True)

In [23]:
All[categorical][:5]

Unnamed: 0,char_1_y,char_2_y,char_3_y,char_4_y,char_5_y,char_6_y,char_7_y,char_8_y,char_9_y,char_1_x,...,char_3_x,char_4_x,char_5_x,char_6_x,char_7_x,char_8_x,char_9_x,char_10_x,group_1,activity_category
0,2,2,5,5,5,3,11,2,2,0,...,0,0,0,0,0,0,0,76,17304,4
1,2,2,5,5,5,3,11,2,2,0,...,0,0,0,0,0,0,0,1,17304,2
2,2,2,5,5,5,3,11,2,2,0,...,0,0,0,0,0,0,0,1,17304,2
3,2,2,5,5,5,3,11,2,2,0,...,0,0,0,0,0,0,0,1,17304,2
4,2,2,5,5,5,3,11,2,2,0,...,0,0,0,0,0,0,0,1,17304,2


In [24]:
# # then fit the one hot encoding using both sets
# enc = OneHotEncoder(handle_unknown='ignore')
# enc=enc.fit(All[categorical])
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

for category in categorical:
   # Encoding only categorical variables
   le.fit(All[category])
   X_train[category]=le.transform(X_train[category])
   X_test[category]=le.transform(X_test[category])

In [25]:
X_train[:5]

Unnamed: 0,people_id,activity_id,activity_category,char_1_x,char_2_x,char_3_x,char_4_x,char_5_x,char_6_x,char_7_x,...,char_33,char_34,char_35,char_36,char_37,char_38,year_x,month_x,day_x,isweekend_x
0,ppl_100,act2_1734928,3,0,0,0,0,0,0,0,...,0,1,1,1,0,36,2023,8,26,1
0,ppl_100,act2_2434093,1,0,0,0,0,0,0,0,...,0,1,1,1,0,36,2022,9,27,0
0,ppl_100,act2_3404049,1,0,0,0,0,0,0,0,...,0,1,1,1,0,36,2022,9,27,0
0,ppl_100,act2_3651215,1,0,0,0,0,0,0,0,...,0,1,1,1,0,36,2023,8,4,0
0,ppl_100,act2_4109017,1,0,0,0,0,0,0,0,...,0,1,1,1,0,36,2023,8,26,1


In [26]:
X_train.shape

(2197291, 56)

In [27]:
X_test.shape

(498687, 56)

In [28]:
not_categorical.extend(['char_38', 'year_x',
       'month_x', 'day_x', 'isweekend_x'])
not_categorical

['char_10_y',
 'char_11',
 'char_12',
 'char_13',
 'char_14',
 'char_15',
 'char_16',
 'char_17',
 'char_18',
 'char_19',
 'char_20',
 'char_21',
 'char_22',
 'char_23',
 'char_24',
 'char_25',
 'char_26',
 'char_27',
 'char_28',
 'char_29',
 'char_30',
 'char_31',
 'char_32',
 'char_33',
 'char_34',
 'char_35',
 'char_36',
 'char_37',
 'char_38',
 'year_x',
 'month_x',
 'day_x',
 'isweekend_x']

In [29]:
print("Training data: " + format(X_train.shape))
print("Test data: " + format(X_test.shape))

Training data: (2197291, 56)
Test data: (498687, 56)


In [30]:
categorical

['char_1_y',
 'char_2_y',
 'char_3_y',
 'char_4_y',
 'char_5_y',
 'char_6_y',
 'char_7_y',
 'char_8_y',
 'char_9_y',
 'char_1_x',
 'char_2_x',
 'char_3_x',
 'char_4_x',
 'char_5_x',
 'char_6_x',
 'char_7_x',
 'char_8_x',
 'char_9_x',
 'char_10_x',
 'group_1',
 'activity_category']

In [31]:
from xgboost import XGBRegressor
from sklearn.metrics import accuracy_score

In [32]:
X_train=X_train.drop('people_id',axis=1)
X_train=X_train.drop('activity_id',axis=1)
X_test=X_test.drop('people_id',axis=1)
X_test=X_test.drop('activity_id',axis=1)

In [33]:
dtrain = xgb.DMatrix(X_train,label=y_train)
dtest = xgb.DMatrix(X_test)

param = {'max_depth':10, 'eta':0.02, 'silent':1, 'objective':'binary:logistic',
         'nthread':-1, 'eval_metric':'auc', 'subsample':0.7, 'colsample_bytree':0.7,
         'min_child_weight':0, 'booster':"gbtree"}

watchlist  = [(dtrain,'train')]
num_round = 300
early_stopping_rounds=15
bst = xgb.train(param, dtrain, num_round, watchlist ,early_stopping_rounds=early_stopping_rounds)

ypred_xgboost = bst.predict(dtest)
output_xgboost = pd.DataFrame({ 'activity_id' : test['activity_id'], 'outcome': ypred_xgboost })
output_xgboost.to_csv('result_xgboost_le_tree.csv', index = False)

[0]	train-auc:0.94067
Will train until train-auc hasn't improved in 15 rounds.
[1]	train-auc:0.950897
[2]	train-auc:0.95446
[3]	train-auc:0.954679
[4]	train-auc:0.957208
[5]	train-auc:0.957325
[6]	train-auc:0.957075
[7]	train-auc:0.958705
[8]	train-auc:0.958271
[9]	train-auc:0.958109
[10]	train-auc:0.95907
[11]	train-auc:0.958851
[12]	train-auc:0.958791
[13]	train-auc:0.958773
[14]	train-auc:0.959303
[15]	train-auc:0.959313
[16]	train-auc:0.960504
[17]	train-auc:0.96124
[18]	train-auc:0.961313
[19]	train-auc:0.9617
[20]	train-auc:0.961607
[21]	train-auc:0.962057
[22]	train-auc:0.962495
[23]	train-auc:0.96283
[24]	train-auc:0.962797
[25]	train-auc:0.962783
[26]	train-auc:0.963066
[27]	train-auc:0.963462
[28]	train-auc:0.963871
[29]	train-auc:0.963991
[30]	train-auc:0.964139
[31]	train-auc:0.964188
[32]	train-auc:0.964307
[33]	train-auc:0.964387
[34]	train-auc:0.964432
[35]	train-auc:0.964768
[36]	train-auc:0.965012
[37]	train-auc:0.965113
[38]	train-auc:0.965391
[39]	train-auc:0.965467


In [34]:
# Logistics Regression fit
from sklearn.linear_model import LogisticRegression, SGDRegressor
logreg = LogisticRegression(C=1e5, verbose=5)
logreg.fit(X_train, y_train)

[LibLinear]

LogisticRegression(C=100000.0, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=5, warm_start=False)

In [35]:
# Logistics Regression predict
ypred_logreg = logreg.predict(X_test)
ypred_logreg = pd.DataFrame({ 'activity_id' : test['activity_id'], 'outcome': ypred_logreg })
ypred_logreg.to_csv('result_logreg_le.csv', index = False)

In [36]:
# SGD Classifier
from sklearn.linear_model import SGDClassifier
sgd = SGDClassifier(verbose=1, n_iter=10, loss='log')
sgd.fit(X_train, y_train) 

-- Epoch 1
Norm: 15300.37, NNZs: 54, Bias: 35.350023, T: 2197291, Avg. loss: 1221843.271585
Total training time: 0.57 seconds.
-- Epoch 2
Norm: 8390.21, NNZs: 54, Bias: 35.316840, T: 4394582, Avg. loss: 655865.679512
Total training time: 1.12 seconds.
-- Epoch 3
Norm: 5826.46, NNZs: 54, Bias: 35.306490, T: 6591873, Avg. loss: 454830.485759
Total training time: 1.67 seconds.
-- Epoch 4
Norm: 4511.11, NNZs: 54, Bias: 35.310257, T: 8789164, Avg. loss: 350470.577902
Total training time: 2.22 seconds.
-- Epoch 5
Norm: 3711.90, NNZs: 54, Bias: 35.300310, T: 10986455, Avg. loss: 286178.184657
Total training time: 2.78 seconds.
-- Epoch 6
Norm: 3172.50, NNZs: 54, Bias: 35.296865, T: 13183746, Avg. loss: 242422.711583
Total training time: 3.33 seconds.
-- Epoch 7
Norm: 2797.34, NNZs: 54, Bias: 35.297137, T: 15381037, Avg. loss: 210650.401705
Total training time: 3.87 seconds.
-- Epoch 8
Norm: 2500.23, NNZs: 54, Bias: 35.291379, T: 17578328, Avg. loss: 186482.061326
Total training time: 4.46 sec

SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='log', n_iter=10, n_jobs=1,
       penalty='l2', power_t=0.5, random_state=None, shuffle=True,
       verbose=1, warm_start=False)

In [37]:
# # SGD predict
ypred_sgd = sgd.predict(X_test)
ypred_sgd = pd.DataFrame({ 'activity_id' : test['activity_id'], 'outcome': ypred_sgd })
ypred_sgd.to_csv('result_sgd_le.csv', index = False)

In [39]:
# Adaboost fit 
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor
adaboost = AdaBoostRegressor(DecisionTreeRegressor(max_depth=4), n_estimators=100)
adaboost.fit(X_train, y_train) 

AdaBoostRegressor(base_estimator=DecisionTreeRegressor(criterion='mse', max_depth=4, max_features=None,
           max_leaf_nodes=None, min_impurity_split=1e-07,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, presort=False, random_state=None,
           splitter='best'),
         learning_rate=1.0, loss='linear', n_estimators=100,
         random_state=None)

In [40]:
# Adaboost predict 
ypred_adaboost = adaboost.predict(X_test)
ypred_adaboost = pd.DataFrame({ 'activity_id' : test['activity_id'], 'outcome': ypred_adaboost })
ypred_adaboost.to_csv('result_adaboost_le.csv', index = False)

In [41]:
# RandomForest fit (too long for sparse matrix)
from sklearn.ensemble import RandomForestRegressor
random_forest = RandomForestRegressor(n_estimators=50, n_jobs=-1, verbose=1)
random_forest.fit(X_train, y_train) 

[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  3.7min finished


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=50, n_jobs=-1, oob_score=False, random_state=None,
           verbose=1, warm_start=False)

In [42]:
# RandomForest predict
ypred_random_forest = random_forest.predict(X_test)
ypred_random_forest = pd.DataFrame({ 'activity_id' : test['activity_id'], 'outcome': ypred_random_forest })
ypred_random_forest.to_csv('result_random_forest_le.csv', index = False)

[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.4s
[Parallel(n_jobs=8)]: Done  50 out of  50 | elapsed:    0.6s finished


In [45]:
# Gradient Boosting fit
from sklearn.ensemble import GradientBoostingRegressor
params = {'n_estimators': 50, 'max_depth': 10,
          'learning_rate': 0.01, 'loss': 'ls', 'verbose':1}
gradient_boosting = GradientBoostingRegressor(**params)
gradient_boosting.fit(X_train, y_train) 

      Iter       Train Loss   Remaining Time 
         1           0.2437          117.36m
         2           0.2405          111.93m
         3           0.2375          110.81m
         4           0.2344          108.30m
         5           0.2315          105.36m
         6           0.2286          103.29m
         7           0.2257           99.62m
         8           0.2229           96.30m
         9           0.2202           93.27m
        10           0.2175           90.43m
        20           0.1934           66.27m
        30           0.1735           44.67m
        40           0.1569           22.41m
        50           0.1429            0.00s


GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.01, loss='ls', max_depth=10,
             max_features=None, max_leaf_nodes=None,
             min_impurity_split=1e-07, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=50, presort='auto', random_state=None,
             subsample=1.0, verbose=1, warm_start=False)

In [None]:
# Gradient Boosting predict
ypred_gradient_boosting = gradient_boosting.predict(X_test)
ypred_gradient_boosting = pd.DataFrame({ 'activity_id' : test['activity_id'], 'outcome': ypred_gradient_boosting })
ypred_gradient_boosting.to_csv('result_gradient_boosting_le.csv', index = False)

In [None]:
# # SVM polynomial fit (large data cannot run finish)
# # Cannot scale with data set > 10000
# from sklearn.svm import SVR
# svr_poly = SVR(kernel='poly', C=1e3, degree=3)
# svr_poly.fit(X_train, y_train)

In [None]:
# # SVM polynomial train
# ypred_svr_poly = svr_poly.predict(X_test)
# ypred_svr_poly = pd.DataFrame({ 'activity_id' : test['activity_id'], 'outcome': ypred_svr_poly })
# ypred_svr_poly.to_csv('result_svr_poly_le.csv', index = False)

In [33]:
# Naive Bayes 
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(X_train, y_train)
y_pred_gnb = gnb.predict(X_test)
y_pred_gnb = pd.DataFrame({ 'activity_id' : test['activity_id'], 'outcome': y_pred_gnb })
y_pred_gnb.to_csv('result_gnb_le.csv', index = False)

In [39]:
# Neural Net fit (too large the dimension for keras)
from keras.models import Sequential
from keras.layers import Dense
import numpy
dnn = Sequential()
dnn.add(Dense(150, input_dim=54, kernel_initializer='uniform', activation='relu'))
dnn.add(Dense(20, kernel_initializer='uniform', activation='relu'))
dnn.add(Dense(1, kernel_initializer='uniform', activation='sigmoid'))
dnn.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
dnn.fit(np.array(X_train), y_train, epochs=100, batch_size=1000,  verbose=5)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.History at 0x7f8d720a5c18>

In [42]:
# Deep Neural Net predict
# calculate predictions
ypred_dnn = dnn.predict(np.array(X_test))
# round predictions
ypred_dnn_rounded = [round(x[0]) for x in ypred_dnn]
ypred_dnn_rounded = pd.DataFrame({ 'activity_id' : test['activity_id'], 'outcome': ypred_dnn_rounded })
ypred_dnn_rounded.to_csv('result_dnn_le.csv', index = False)