In [41]:
import pandas as pd
import os
import time
import utils

import numpy as np

from sklearn import linear_model
from sklearn import tree
from sklearn import neighbors
from sklearn import naive_bayes
from sklearn import svm
from sklearn import ensemble

from sklearn import model_selection as ms
from sklearn import metrics

In [36]:
# Folder that holds every CSV this notebook reads or writes.
DATA_DIR = "data/"

# Paths of the training and test CSVs, both resolved relative to DATA_DIR.
DATA_FILE_TRAIN, DATA_FILE_TEST = (
    os.path.join(DATA_DIR, relative_path)
    for relative_path in ("bow/train_bow.csv", "bow/test_bow.csv")
)

In [25]:
# Read the training CSV through the project helper, then preview three rows.
df = utils.csv_to_dataframe(DATA_FILE_TRAIN)
df.head(n=3)

Unnamed: 0,kw_ablaze,kw_accident,kw_aftershock,kw_airplaneaccident,kw_ambulance,kw_annihilated,kw_annihilation,kw_apocalypse,kw_armageddon,kw_army,...,ûïthe,ûïwe,ûïwhen,ûò,ûó,û÷extrem,û÷polit,åè,åê,target.1
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [26]:
# Relabel the columns with positional integers 0..n-1.
# The number of labels is taken from the frame itself rather than a
# hard-coded width, so the cell keeps working when the feature set changes.
column_labels = list(range(df.shape[1]))
df.columns = column_labels
df.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3157,3158,3159,3160,3161,3162,3163,3164,3165,3166
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [27]:
# Split the training frame into features/targets and hold out a validation set.
array = df.values
# The target is the last column; everything before it is a feature.
# Slicing relative to the end avoids hard-coding the column count.
x = array[:, :-1]   # feature values
y = array[:, -1]    # targets
validation_size = 0.10
seed = 11
x_train, x_validation, y_train, y_validation = ms.train_test_split(
    x, y, test_size=validation_size, random_state=seed
)

print(x.shape)
print(y.shape)
print('')
print(x_train.shape)
print(x_validation.shape)
print('')
print(y_train.shape)
print(y_validation.shape)

print(type(x_train))
print(type(y_train))

(7613, 3166)
(7613,)

(6851, 3166)
(762, 3166)

(6851,)
(762,)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>


(7613, 3166)
(7613,)

(6851, 3166)
(762, 3166)

(6851,)
(762,)


In [13]:
# Candidate classifiers as (label, estimator) pairs.
# Hyperparameters are library defaults except for the ones passed explicitly
# below. Commented-out entries are candidates that are currently disabled.

models = [
    ('LogReg', linear_model.LogisticRegression(solver='liblinear')),
    ('GaussNB', naive_bayes.GaussianNB()),
    #('KNN', neighbors.KNeighborsClassifier()),
    #('CART', tree.DecisionTreeClassifier()),
    #('SVM', svm.SVC(gamma='auto')),
]

# Random forests are stochastic: without a fixed random_state the
# cross-validation scores change on every run. `seed` is the notebook-wide
# seed assigned in an earlier cell.
ensembles = [
    ('RandomF50', ensemble.RandomForestClassifier(n_estimators=50, random_state=seed)),
    ('RandomF100', ensemble.RandomForestClassifier(n_estimators=100, random_state=seed)),
    #('AdaBoost', ensemble.AdaBoostClassifier()),
    #('GradBoost', ensemble.GradientBoostingClassifier()),
]


In [14]:
# Settings read by the cross-validation loops: fold count, RNG seed, metric.
num_folds, seed, scoring = 10, 11, 'accuracy'

# Accumulators the loops append to: per-model score arrays and model labels.
results, names = [], []

In [15]:
# Cross-validate every base model on the training split, recording the
# per-fold scores and how long each model took.

# The splitter depends only on the configuration, so build it once.
# With an integer random_state it yields the same folds for every model.
kfold = ms.KFold(n_splits=num_folds, random_state=seed, shuffle=True)

for name, model in models:
    # perf_counter is the clock intended for measuring elapsed intervals;
    # unlike time.time() it is unaffected by system clock adjustments.
    start = time.perf_counter()
    cv_results = ms.cross_val_score(model, x_train, y_train, cv=kfold, scoring=scoring)
    elapsed_time = time.perf_counter() - start

    results.append(cv_results)
    names.append(name)

    # print name, mean score, standard deviation of score, seconds taken
    print(f'{name}: \t {cv_results.mean()} \t {cv_results.std()} \t {elapsed_time}')

LogReg: 	 0.7965274201442829 	 0.014668674135782078 	 1.9800314903259277
GaussNB: 	 0.7009233683045689 	 0.0163427215362928 	 3.028402805328369


In [16]:
# Cross-validate every ensemble model on the training split, recording the
# per-fold scores and how long each model took.

# The splitter depends only on the configuration, so build it once.
# With an integer random_state it yields the same folds for every model.
kfold = ms.KFold(n_splits=num_folds, random_state=seed, shuffle=True)

for name, model in ensembles:
    # perf_counter is the clock intended for measuring elapsed intervals;
    # unlike time.time() it is unaffected by system clock adjustments.
    start = time.perf_counter()
    cv_results = ms.cross_val_score(model, x_train, y_train, cv=kfold, scoring=scoring)
    elapsed_time = time.perf_counter() - start

    results.append(cv_results)
    names.append(name)

    # print name, mean score, standard deviation of score, seconds taken
    print(f'{name}: \t {cv_results.mean()} \t {cv_results.std()} \t {elapsed_time}')

RandomF50: 	 0.774922644761763 	 0.013687268877683317 	 126.58327054977417
RandomF100: 	 0.777550594794748 	 0.010753699720533291 	 258.33744716644287


In [33]:
# Fit logistic regression on the training split and score it on the
# held-out validation split.
model_logreg = linear_model.LogisticRegression(solver='liblinear').fit(x_train, y_train)
predictions = model_logreg.predict(x_validation)

# Every metric takes the true validation labels first, then the predictions.
score, matrix, report = (
    metric(y_validation, predictions)
    for metric in (
        metrics.accuracy_score,
        metrics.confusion_matrix,
        metrics.classification_report,
    )
)

# One item per line: title, accuracy, confusion matrix, per-class report.
print('Logistic Regression', score, matrix, report, sep='\n')

Logistic Regression
0.8070866141732284
[[370  68]
 [ 79 245]]
              precision    recall  f1-score   support

           0       0.82      0.84      0.83       438
           1       0.78      0.76      0.77       324

    accuracy                           0.81       762
   macro avg       0.80      0.80      0.80       762
weighted avg       0.81      0.81      0.81       762



### **KAGGLE TEST SET**

In [20]:
# Load the training CSV again, this time keeping its original column names,
# and preview three rows.
df_train = utils.csv_to_dataframe(DATA_FILE_TRAIN)
df_train.head(n=3)

Unnamed: 0,kw_ablaze,kw_accident,kw_aftershock,kw_airplaneaccident,kw_ambulance,kw_annihilated,kw_annihilation,kw_apocalypse,kw_armageddon,kw_army,...,ûïthe,ûïwe,ûïwhen,ûò,ûó,û÷extrem,û÷polit,åè,åê,target.1
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [22]:
# Load the test CSV through the project helper and preview three rows.
df_test = utils.csv_to_dataframe(DATA_FILE_TEST)
df_test.head(n=3)



Unnamed: 0,kw_ablaze,kw_accident,kw_aftershock,kw_airplaneaccident,kw_ambulance,kw_annihilated,kw_annihilation,kw_apocalypse,kw_armageddon,kw_army,...,ûïa,ûïthe,ûïwe,ûïwhen,ûò,ûó,û÷extrem,û÷polit,åè,åê
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [23]:
# Relabel the test columns with positional integers 0..n-1.
# The number of labels is taken from the frame itself rather than a
# hard-coded width, so the cell keeps working when the feature set changes.
column_labels = list(range(df_test.shape[1]))
df_test.columns = column_labels
df_test.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3156,3157,3158,3159,3160,3161,3162,3163,3164,3165
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [32]:
# Build the feature matrix for the test set.
array_test = df_test.values
# Every column of the test frame is used as a feature, so no hard-coded
# column bound is needed.
x_test = array_test   # feature values

# Fail early with a clear message if the test set does not have the same
# number of feature columns as the training data the model was fitted on.
if x_test.shape[1] != x_train.shape[1]:
    raise ValueError(
        f'test set has {x_test.shape[1]} feature columns, '
        f'but the training set has {x_train.shape[1]}'
    )

print(type(x_test))
print(x_test.shape)

<class 'numpy.ndarray'>
(3263, 3166)


In [60]:
# Predict a label for every test row and convert the result to a plain list.
predictions = model_logreg.predict(x_test)
predictions_list = np.asarray(predictions).tolist()

# Sanity output, one item per line: container type, length, first ten labels.
print(type(predictions_list), len(predictions_list), predictions_list[0:10], sep='\n')

<class 'list'>
3263
[1, 0, 1, 1, 1, 1, 0, 0, 0, 0]


In [62]:
# CSV holding the ids of the test rows.
DATA_FILE_TEST_ID = os.path.join(DATA_DIR, "test_id.csv")

# Load the ids through the project helper that returns them as strings.
test_id_list = utils.csv_to_list_of_strings(DATA_FILE_TEST_ID)

# Sanity output, one item per line: number of ids, then the first ten.
print(len(test_id_list), test_id_list[0:10], sep='\n')

3263
['0', '2', '3', '9', '11', '12', '21', '22', '27', '29']


In [79]:
# Load the same id file as a DataFrame and preview ten rows.
df_test_id = utils.csv_to_dataframe(DATA_FILE_TEST_ID)
df_test_id.head(n=10)

Unnamed: 0,id
0,0
1,2
2,3
3,9
4,11
5,12
6,21
7,22
8,27
9,29


In [80]:
# Wrap the predicted labels in a single-column frame. The column is given
# its final name, 'target', directly instead of being created under a
# placeholder name and renamed afterwards.
df_test_predict = pd.DataFrame({'target': predictions_list})
df_test_predict.head(10)

Unnamed: 0,target
0,1
1,0
2,1
3,1
4,1
5,1
6,0
7,0
8,0
9,0


In [82]:
# Join ids and predictions side by side into the submission frame.
# pd.concat(axis=1) aligns on the index and pads with NaN when the frames
# differ in length, so check the row counts first to avoid silently writing
# an incomplete submission.
if len(df_test_id) != len(df_test_predict):
    raise ValueError(
        f'id frame has {len(df_test_id)} rows but prediction frame has '
        f'{len(df_test_predict)} rows'
    )

df_submit = pd.concat([df_test_id, df_test_predict], axis=1)
df_submit.head(10)

Unnamed: 0,id,target
0,0,1
1,2,0
2,3,1
3,9,1
4,11,1
5,12,1
6,21,0
7,22,0
8,27,0
9,29,0


In [83]:
# Destination of the submission CSV, inside DATA_DIR.
DATA_FILE_SUBMIT = os.path.join(DATA_DIR, "mysubmission.csv")
# Write the id/target frame out through the project helper.
utils.dataframe_to_csv(df_submit, DATA_FILE_SUBMIT)