In [1]:
import numpy as np 
import pandas as pd
import xgboost as xgb
from sklearn.preprocessing import OneHotEncoder



In [2]:
people = pd.read_csv("people.csv", dtype={'people_id': np.str, 'activity_id': np.str, 'char_38': np.int32}, parse_dates=['date'])
people[:2]

Unnamed: 0,people_id,char_1,group_1,char_2,date,char_3,char_4,char_5,char_6,char_7,...,char_29,char_30,char_31,char_32,char_33,char_34,char_35,char_36,char_37,char_38
0,ppl_100,type 2,group 17304,type 2,2021-06-29,type 5,type 5,type 5,type 3,type 11,...,False,True,True,False,False,True,True,True,False,36
1,ppl_100002,type 2,group 8688,type 3,2021-01-06,type 28,type 9,type 5,type 3,type 11,...,False,True,True,True,True,True,True,True,False,76


In [3]:
train = pd.read_csv("act_train.csv",dtype={'people_id': np.str, 'activity_id': np.str, 'outcome': np.int8}, parse_dates=['date'])
train[:2]

Unnamed: 0,people_id,activity_id,date,activity_category,char_1,char_2,char_3,char_4,char_5,char_6,char_7,char_8,char_9,char_10,outcome
0,ppl_100,act2_1734928,2023-08-26,type 4,,,,,,,,,,type 76,0
1,ppl_100,act2_2434093,2022-09-27,type 2,,,,,,,,,,type 1,0


In [4]:
test = pd.read_csv("act_test.csv",dtype={'people_id': np.str, 'activity_id': np.str}, parse_dates=['date'])
test[:2]

Unnamed: 0,people_id,activity_id,date,activity_category,char_1,char_2,char_3,char_4,char_5,char_6,char_7,char_8,char_9,char_10
0,ppl_100004,act1_249281,2022-07-20,type 1,type 5,type 10,type 5,type 1,type 6,type 1,type 1,type 7,type 4,
1,ppl_100004,act2_230855,2022-07-20,type 5,,,,,,,,,,type 682


In [5]:
print("Train data shape: " + format(train.shape))
print("Test data shape: " + format(test.shape))
print("People data shape: " + format(people.shape))

Train data shape: (2197291, 15)
Test data shape: (498687, 14)
People data shape: (189118, 41)


In [6]:
# people list the people number and their characteristics
# act_train, act_test set list the train and test set respectively
# we need to join the people with the train and test set so that
# their characteristics are captured as features.

In [7]:
X_train = train.merge(people, on='people_id', how='left', left_index=True)
X_test  = test.merge(people, on='people_id', how='left', left_index=True)

In [8]:
X_train[:2]

Unnamed: 0,people_id,activity_id,date_x,activity_category,char_1_x,char_2_x,char_3_x,char_4_x,char_5_x,char_6_x,...,char_29,char_30,char_31,char_32,char_33,char_34,char_35,char_36,char_37,char_38
0,ppl_100,act2_1734928,2023-08-26,type 4,,,,,,,...,False,True,True,False,False,True,True,True,False,36
0,ppl_100,act2_2434093,2022-09-27,type 2,,,,,,,...,False,True,True,False,False,True,True,True,False,36


In [9]:
list(X_train)

['people_id',
 'activity_id',
 'date_x',
 'activity_category',
 'char_1_x',
 'char_2_x',
 'char_3_x',
 'char_4_x',
 'char_5_x',
 'char_6_x',
 'char_7_x',
 'char_8_x',
 'char_9_x',
 'char_10_x',
 'outcome',
 'char_1_y',
 'group_1',
 'char_2_y',
 'date_y',
 'char_3_y',
 'char_4_y',
 'char_5_y',
 'char_6_y',
 'char_7_y',
 'char_8_y',
 'char_9_y',
 'char_10_y',
 'char_11',
 'char_12',
 'char_13',
 'char_14',
 'char_15',
 'char_16',
 'char_17',
 'char_18',
 'char_19',
 'char_20',
 'char_21',
 'char_22',
 'char_23',
 'char_24',
 'char_25',
 'char_26',
 'char_27',
 'char_28',
 'char_29',
 'char_30',
 'char_31',
 'char_32',
 'char_33',
 'char_34',
 'char_35',
 'char_36',
 'char_37',
 'char_38']

In [10]:
X_train.shape

(2197291, 55)

In [11]:
# outcome is the y value of the training set

In [12]:
# char_10_y, char_11 => char_37 are boolean
# char_38 are numerical => no need to do anything
# char_1_y => char_9_y are categorical => do a one-hot encoding to get sparse matrix
# char_1_x => char_10_x are categorical => do a one-hot encoding to get sparse matrix
# group_1 is categorical
# date_x is the activity date, date_y is possibly the user registration date.
# activity_categorical is categorical
# people_id and activity_id will not contribute to the decision => can be used as index

In [13]:
used_separately = ['people_id', 'activity_id', 'date_x', 'date_y', 'outcome', 'char_38']
categorical = ['char_1_y', 'char_2_y', 'char_3_y', 'char_4_y', 'char_5_y',
               'char_6_y', 'char_7_y', 'char_8_y', 'char_9_y',
               'char_1_x', 'char_2_x', 'char_3_x', 'char_4_x', 'char_5_x',
               'char_6_x', 'char_7_x', 'char_8_x', 'char_9_x', 'char_10_x',
               'group_1', 'activity_category']
not_categorical = [x for x in X_train.columns if x not in categorical \
                   and x not in used_separately]
not_categorical

['char_10_y',
 'char_11',
 'char_12',
 'char_13',
 'char_14',
 'char_15',
 'char_16',
 'char_17',
 'char_18',
 'char_19',
 'char_20',
 'char_21',
 'char_22',
 'char_23',
 'char_24',
 'char_25',
 'char_26',
 'char_27',
 'char_28',
 'char_29',
 'char_30',
 'char_31',
 'char_32',
 'char_33',
 'char_34',
 'char_35',
 'char_36',
 'char_37']

In [14]:
def preprocessing(dataset):
    local_data = dataset
    
    for col in list(local_data.columns):
        # fill the categorical NA with type 0,
        # for each categorical col, we only take the number, and convert to int32
        if col in categorical:
            local_data[col].fillna('type 0', inplace=True)
            local_data[col] = local_data[col].apply(
                lambda x: x.split(' ')[1]).astype(np.int32)

        if col in not_categorical:
            # boolean values are converted to int aswell
            local_data[col] = local_data[col].astype(np.int8)
        # this doesnot deal with date time data
    
    # date_x is the activity date, date_y is possibly the reg date.
    local_data['year_x'] = local_data['date_x'].dt.year
    local_data['month_x'] = local_data['date_x'].dt.month
    local_data['day_x'] = local_data['date_x'].dt.day
    local_data['isweekend_x'] = (local_data['date_x'].dt.weekday >= 5).astype(int)
    # drop date_x after splitting its features
    local_data = local_data.drop('date_x', axis = 1)
    # we may not need date_y as its only reg date
    local_data = local_data.drop('date_y', axis = 1)
    return local_data

In [15]:
X_train = preprocessing(X_train)
X_train[:5]

Unnamed: 0,people_id,activity_id,activity_category,char_1_x,char_2_x,char_3_x,char_4_x,char_5_x,char_6_x,char_7_x,...,char_33,char_34,char_35,char_36,char_37,char_38,year_x,month_x,day_x,isweekend_x
0,ppl_100,act2_1734928,4,0,0,0,0,0,0,0,...,0,1,1,1,0,36,2023,8,26,1
0,ppl_100,act2_2434093,2,0,0,0,0,0,0,0,...,0,1,1,1,0,36,2022,9,27,0
0,ppl_100,act2_3404049,2,0,0,0,0,0,0,0,...,0,1,1,1,0,36,2022,9,27,0
0,ppl_100,act2_3651215,2,0,0,0,0,0,0,0,...,0,1,1,1,0,36,2023,8,4,0
0,ppl_100,act2_4109017,2,0,0,0,0,0,0,0,...,0,1,1,1,0,36,2023,8,26,1


In [16]:
X_train=X_train.sort_values(['people_id'], ascending=[1])
X_train[:5]

Unnamed: 0,people_id,activity_id,activity_category,char_1_x,char_2_x,char_3_x,char_4_x,char_5_x,char_6_x,char_7_x,...,char_33,char_34,char_35,char_36,char_37,char_38,year_x,month_x,day_x,isweekend_x
0,ppl_100,act2_1734928,4,0,0,0,0,0,0,0,...,0,1,1,1,0,36,2023,8,26,1
0,ppl_100,act2_2434093,2,0,0,0,0,0,0,0,...,0,1,1,1,0,36,2022,9,27,0
0,ppl_100,act2_3404049,2,0,0,0,0,0,0,0,...,0,1,1,1,0,36,2022,9,27,0
0,ppl_100,act2_3651215,2,0,0,0,0,0,0,0,...,0,1,1,1,0,36,2023,8,4,0
0,ppl_100,act2_4109017,2,0,0,0,0,0,0,0,...,0,1,1,1,0,36,2023,8,26,1


In [17]:
X_test = preprocessing(X_test)
X_test[:5]

Unnamed: 0,people_id,activity_id,activity_category,char_1_x,char_2_x,char_3_x,char_4_x,char_5_x,char_6_x,char_7_x,...,char_33,char_34,char_35,char_36,char_37,char_38,year_x,month_x,day_x,isweekend_x
3,ppl_100004,act1_249281,1,5,10,5,1,6,1,1,...,1,1,1,1,1,76,2022,7,20,0
3,ppl_100004,act2_230855,5,0,0,0,0,0,0,0,...,1,1,1,1,1,76,2022,7,20,0
5,ppl_10001,act1_240724,1,12,1,5,4,6,1,1,...,1,1,1,1,1,90,2022,10,14,0
5,ppl_10001,act1_83552,1,20,10,5,4,6,1,1,...,1,1,1,1,1,90,2022,11,27,1
5,ppl_10001,act2_1043301,5,0,0,0,0,0,0,0,...,1,1,1,1,1,90,2022,10,15,1


In [18]:
X_test=X_test.sort_values(['people_id'], ascending=[1])
X_test[:5]

Unnamed: 0,people_id,activity_id,activity_category,char_1_x,char_2_x,char_3_x,char_4_x,char_5_x,char_6_x,char_7_x,...,char_33,char_34,char_35,char_36,char_37,char_38,year_x,month_x,day_x,isweekend_x
3,ppl_100004,act1_249281,1,5,10,5,1,6,1,1,...,1,1,1,1,1,76,2022,7,20,0
3,ppl_100004,act2_230855,5,0,0,0,0,0,0,0,...,1,1,1,1,1,76,2022,7,20,0
5,ppl_10001,act2_688604,5,0,0,0,0,0,0,0,...,1,1,1,1,1,90,2022,11,28,0
5,ppl_10001,act2_659237,5,0,0,0,0,0,0,0,...,1,1,1,1,1,90,2022,10,16,1
5,ppl_10001,act2_649143,5,0,0,0,0,0,0,0,...,1,1,1,1,1,90,2022,11,27,1


In [19]:
y_train = X_train.outcome
X_train=X_train.drop('outcome',axis=1)

In [20]:
y_train[:5]

0    0
0    0
0    0
0    0
0    0
Name: outcome, dtype: int8

In [21]:
# now we need to take care of categorial data using one-hot-encoding.
# but the number of dimension resulted using one-hot-ending will equal the max number 
# in the categorical data.

In [22]:
# thus we need to reduce the number of dimension by checking only those numbers that
# appear in either train or test set.

# the following is a cheat because we're not supposed to use the data from the 
# test set for the preprocessing of the training set

# join the two set so the one hot encoding will have all the values for each categorical
# column
All = pd.concat([X_train, X_test], ignore_index=True)

In [23]:
All[categorical][:5]

Unnamed: 0,char_1_y,char_2_y,char_3_y,char_4_y,char_5_y,char_6_y,char_7_y,char_8_y,char_9_y,char_1_x,...,char_3_x,char_4_x,char_5_x,char_6_x,char_7_x,char_8_x,char_9_x,char_10_x,group_1,activity_category
0,2,2,5,5,5,3,11,2,2,0,...,0,0,0,0,0,0,0,76,17304,4
1,2,2,5,5,5,3,11,2,2,0,...,0,0,0,0,0,0,0,1,17304,2
2,2,2,5,5,5,3,11,2,2,0,...,0,0,0,0,0,0,0,1,17304,2
3,2,2,5,5,5,3,11,2,2,0,...,0,0,0,0,0,0,0,1,17304,2
4,2,2,5,5,5,3,11,2,2,0,...,0,0,0,0,0,0,0,1,17304,2


In [24]:
# then fit the one hot encoding using both sets
enc = OneHotEncoder(handle_unknown='ignore')
enc=enc.fit(All[categorical])


In [25]:
X_train_ohe=enc.transform(X_train[categorical])
X_train_ohe[:5]

<5x41499 sparse matrix of type '<class 'numpy.float64'>'
	with 105 stored elements in Compressed Sparse Row format>

In [26]:
X_train_ohe.shape

(2197291, 41499)

In [27]:
X_test_ohe=enc.transform(X_test[categorical])
X_test_ohe.shape

(498687, 41499)

In [28]:
not_categorical.extend(['char_38', 'year_x',
       'month_x', 'day_x', 'isweekend_x'])
not_categorical

['char_10_y',
 'char_11',
 'char_12',
 'char_13',
 'char_14',
 'char_15',
 'char_16',
 'char_17',
 'char_18',
 'char_19',
 'char_20',
 'char_21',
 'char_22',
 'char_23',
 'char_24',
 'char_25',
 'char_26',
 'char_27',
 'char_28',
 'char_29',
 'char_30',
 'char_31',
 'char_32',
 'char_33',
 'char_34',
 'char_35',
 'char_36',
 'char_37',
 'char_38',
 'year_x',
 'month_x',
 'day_x',
 'isweekend_x']

In [29]:
All.columns

Index(['people_id', 'activity_id', 'activity_category', 'char_1_x', 'char_2_x',
       'char_3_x', 'char_4_x', 'char_5_x', 'char_6_x', 'char_7_x', 'char_8_x',
       'char_9_x', 'char_10_x', 'char_1_y', 'group_1', 'char_2_y', 'char_3_y',
       'char_4_y', 'char_5_y', 'char_6_y', 'char_7_y', 'char_8_y', 'char_9_y',
       'char_10_y', 'char_11', 'char_12', 'char_13', 'char_14', 'char_15',
       'char_16', 'char_17', 'char_18', 'char_19', 'char_20', 'char_21',
       'char_22', 'char_23', 'char_24', 'char_25', 'char_26', 'char_27',
       'char_28', 'char_29', 'char_30', 'char_31', 'char_32', 'char_33',
       'char_34', 'char_35', 'char_36', 'char_37', 'char_38', 'year_x',
       'month_x', 'day_x', 'isweekend_x'],
      dtype='object')

In [30]:
from scipy.sparse import hstack


In [31]:
X_train_all=hstack((X_train[not_categorical], X_train_ohe))
X_train_all.shape

(2197291, 41532)

In [32]:
X_test_all=hstack((X_test[not_categorical], X_test_ohe))
X_test_all.shape

(498687, 41532)

In [33]:
print("Training data: " + format(X_train_all.shape))
print("Test data: " + format(X_test_all.shape))
print("###########")
print("One Hot enconded Test Dataset Script")

Training data: (2197291, 41532)
Test data: (498687, 41532)
###########
One Hot enconded Test Dataset Script


In [34]:
from xgboost import XGBRegressor
from sklearn.metrics import accuracy_score

In [35]:
dtrain = xgb.DMatrix(X_train_all,label=y_train)
dtest = xgb.DMatrix(X_test_all)

param = {'max_depth':10, 'eta':0.02, 'silent':1, 'objective':'binary:logistic',
         'nthread':-1, 'eval_metric':'auc', 'subsample':0.7, 'colsample_bytree':0.7,
         'min_child_weight':0, 'booster':"gblinear"}

watchlist  = [(dtrain,'train')]
num_round = 500
early_stopping_rounds=20
bst = xgb.train(param, dtrain, num_round, watchlist ,early_stopping_rounds=early_stopping_rounds)

ypred_xgboost = bst.predict(dtest)
output_xgboost = pd.DataFrame({ 'activity_id' : test['activity_id'], 'outcome': ypred_xgboost })
output_xgboost.to_csv('result_xgboost.csv', index = False)

[0]	train-auc:0.89192
Will train until train-auc hasn't improved in 20 rounds.
[1]	train-auc:0.900562
[2]	train-auc:0.909177
[3]	train-auc:0.917468
[4]	train-auc:0.925206
[5]	train-auc:0.932294
[6]	train-auc:0.938923
[7]	train-auc:0.945178
[8]	train-auc:0.951067
[9]	train-auc:0.956569
[10]	train-auc:0.961622
[11]	train-auc:0.966171
[12]	train-auc:0.970196
[13]	train-auc:0.973708
[14]	train-auc:0.976735
[15]	train-auc:0.979315
[16]	train-auc:0.981501
[17]	train-auc:0.983343
[18]	train-auc:0.984893
[19]	train-auc:0.986192
[20]	train-auc:0.987287
[21]	train-auc:0.988213
[22]	train-auc:0.989003
[23]	train-auc:0.989683
[24]	train-auc:0.990276
[25]	train-auc:0.990798
[26]	train-auc:0.991263
[27]	train-auc:0.991677
[28]	train-auc:0.99205
[29]	train-auc:0.992387
[30]	train-auc:0.992694
[31]	train-auc:0.992975
[32]	train-auc:0.993234
[33]	train-auc:0.993471
[34]	train-auc:0.99369
[35]	train-auc:0.993892
[36]	train-auc:0.994079
[37]	train-auc:0.994254
[38]	train-auc:0.994416
[39]	train-auc:0.994

In [36]:
# TODO:
# KNN
# Logistics Regression
# SGD Regression
# Adaboost
# Bagging
# Gradient Boosting
# RandomForest
# Deep Neural Net

In [45]:
# # KNN taking too long for sparse matrix
# # KNN fit
# from sklearn.neighbors import KNeighborsRegressor
# knn = KNeighborsRegressor(n_neighbors=5, n_jobs=-1)
# knn.fit(X_train_all, y_train) 

In [42]:
# # KNN predict
# ypred_knn = knn.predict(X_test_all)
# ypred_knn = pd.DataFrame({ 'activity_id' : test['activity_id'], 'outcome': ypred_knn })
# ypred_knn.to_csv('result_knn.csv', index = False)

In [44]:
# Logistics Regression fit
from sklearn.linear_model import LogisticRegression, SGDRegressor
logreg = LogisticRegression(C=1e5, verbose=5)
logreg.fit(X_train_all, y_train) 

[LibLinear]

LogisticRegression(C=100000.0, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=5, warm_start=False)

In [47]:
# Logistics Regression predict
ypred_logreg = logreg.predict(X_test_all)
ypred_logreg = pd.DataFrame({ 'activity_id' : test['activity_id'], 'outcome': ypred_logreg })
ypred_logreg.to_csv('result_logreg.csv', index = False)

In [37]:
# SGD Classifier
from sklearn.linear_model import SGDClassifier
sgd = SGDClassifier(verbose=1, n_iter=10, loss='log')
sgd.fit(X_train_all, y_train) 

-- Epoch 1
Norm: 763.52, NNZs: 31604, Bias: -0.023258, T: 2197291, Avg. loss: 20576.356453
Total training time: 0.97 seconds.
-- Epoch 2
Norm: 608.07, NNZs: 33881, Bias: -0.023205, T: 4394582, Avg. loss: 11106.347728
Total training time: 1.63 seconds.
-- Epoch 3
Norm: 544.77, NNZs: 34775, Bias: -0.023239, T: 6591873, Avg. loss: 7715.185509
Total training time: 2.28 seconds.
-- Epoch 4
Norm: 498.18, NNZs: 35257, Bias: -0.023265, T: 8789164, Avg. loss: 5948.849123
Total training time: 3.04 seconds.
-- Epoch 5
Norm: 469.18, NNZs: 35579, Bias: -0.023267, T: 10986455, Avg. loss: 4858.916162
Total training time: 3.81 seconds.
-- Epoch 6
Norm: 445.98, NNZs: 35806, Bias: -0.023277, T: 13183746, Avg. loss: 4116.448675
Total training time: 4.56 seconds.
-- Epoch 7
Norm: 426.76, NNZs: 35994, Bias: -0.023283, T: 15381037, Avg. loss: 3576.655968
Total training time: 5.32 seconds.
-- Epoch 8
Norm: 411.85, NNZs: 36119, Bias: -0.023290, T: 17578328, Avg. loss: 3165.894116
Total training time: 6.02 sec

SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='log', n_iter=10, n_jobs=1,
       penalty='l2', power_t=0.5, random_state=None, shuffle=True,
       verbose=1, warm_start=False)

In [41]:
# # SGD predict
ypred_sgd = sgd.predict(X_test_all)
ypred_sgd = pd.DataFrame({ 'activity_id' : test['activity_id'], 'outcome': ypred_sgd })
ypred_sgd.to_csv('result_sgd.csv', index = False)

In [42]:
# Adaboost fit 
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor
adaboost = AdaBoostRegressor(DecisionTreeRegressor(max_depth=4), n_estimators=100)
adaboost.fit(X_train_all, y_train) 

AdaBoostRegressor(base_estimator=DecisionTreeRegressor(criterion='mse', max_depth=4, max_features=None,
           max_leaf_nodes=None, min_impurity_split=1e-07,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, presort=False, random_state=None,
           splitter='best'),
         learning_rate=1.0, loss='linear', n_estimators=100,
         random_state=None)

In [43]:
# Adaboost predict 
ypred_adaboost = adaboost.predict(X_test_all)
ypred_adaboost = pd.DataFrame({ 'activity_id' : test['activity_id'], 'outcome': ypred_adaboost })
ypred_adaboost.to_csv('result_adaboost.csv', index = False)

In [38]:
# RandomForest fit (too long for sparse matrix)
# from sklearn.ensemble import RandomForestRegressor
# random_forest = RandomForestRegressor(n_estimators=10, max_depth=100,
#                                       n_jobs=-1, verbose=2)
# random_forest.fit(X_train_all, y_train) 

In [39]:
# RandomForest predict
# ypred_random_forest = random_forest.predict(X_test_all)
# ypred_random_forest = pd.DataFrame({ 'activity_id' : test['activity_id'], 'outcome': ypred_random_forest })
# ypred_random_forest.to_csv('result_random_forest.csv', index = False)

In [35]:
# Gradient Boosting fit
from sklearn.ensemble import GradientBoostingRegressor
params = {'n_estimators': 100, 'max_depth': 10,
          'learning_rate': 0.01, 'loss': 'ls'}
gradient_boosting = GradientBoostingRegressor(**params)
gradient_boosting.fit(X_train_all, y_train) 

KeyboardInterrupt: 

In [None]:
# Gradient Boosting predict
ypred_gradient_boosting = gradient_boosting.predict(X_test_all)
ypred_gradient_boosting = pd.DataFrame({ 'activity_id' : X_test_all['activity_id'], 'outcome': ypred_gradient_boosting })
ypred_gradient_boosting.to_csv('result_gradient_boosting.csv', index = False)

In [None]:
# # Neural Net fit (too large the dimension for keras)
# from keras.models import Sequential
# from keras.layers import Dense
# import numpy
# dnn = Sequential()
# dnn.add(Dense(80000, input_dim=41532, init='uniform', activation='relu'))
# dnn.add(Dense(10000, init='uniform', activation='relu'))
# dnn.add(Dense(1000, init='uniform', activation='relu'))
# dnn.add(Dense(1, init='uniform', activation='sigmoid'))
# dnn.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# dnn.fit(X_train_all, y_train, nb_epoch=150, batch_size=10,  verbose=2)


In [None]:
# # Deep Neural Net predict
# # calculate predictions
# ypred_dnn = dnn.predict(X_test_all)
# # round predictions
# ypred_dnn_rounded = [round(x[0]) for x in ypred_dnn]
# ypred_dnn_rounded = pd.DataFrame({ 'activity_id' : test['activity_id'], 'outcome': ypred_dnn_rounded })
# ypred_dnn_rounded.to_csv('result_dnn.csv', index = False)

In [None]:
# # SVM polynomial fit (large data cannot run finish)
# from sklearn.svm import SVR
# svr_poly = SVR(kernel='poly', C=1e3, degree=3)
# svr_poly.fit(X_train_all, y_train)

In [None]:
# # SVM polynomial train
# ypred_svr_poly = svr_poly.predict(X_test_all)
# ypred_svr_poly = pd.DataFrame({ 'activity_id' : test['activity_id'], 'outcome': ypred_svr_poly })
# ypred_svr_poly.to_csv('result_svr_poly.csv', index = False)

In [37]:
# # Naive Bayes (not enough memory)
# from sklearn.naive_bayes import GaussianNB
# gnb = GaussianNB()
# gnb.fit(X_train_all.toarray(), y_train)
# y_pred_gnb = gnb.predict(X_test_all)
# y_pred_gnb = pd.DataFrame({ 'activity_id' : test['activity_id'], 'outcome': y_pred_gnb })
# y_pred_gnb.to_csv('result_gnb.csv', index = False)

MemoryError: 