In [1]:
import time
import os
import pandas as pd

from sklearn import model_selection
from sklearn import metrics

from xgboost import XGBClassifier

import matplotlib
from matplotlib import pyplot
matplotlib.use('Agg')

import utils

In [5]:
DATA_DIR = "data/"
DATA_FILE_TRAIN = os.path.join(DATA_DIR, "bowfinal/train_bow.csv")
DATA_FILE_TEST = os.path.join(DATA_DIR, "bowfinal/test_bow.csv")

In [6]:
df_train = pd.read_csv(DATA_FILE_TRAIN)
c = len(df_train.columns)

array_train = df_train.values       # np array

In [7]:
df_train.head(3)

Unnamed: 0,kw_ablaz,kw_accid,kw_aftershock,kw_airplaneaccid,kw_ambul,kw_annihil,kw_apocalyps,kw_armageddon,kw_armi,kw_arson,...,youtub,youûªv,yr,zombi,zone,û,ûïwhen,ûò,ûó,target.1
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [8]:
print(array_train)
print(type(array_train))

array_train.shape

[[0 0 0 ... 0 0 1]
 [0 0 0 ... 0 0 1]
 [0 0 0 ... 0 0 1]
 ...
 [0 0 0 ... 0 0 1]
 [0 0 0 ... 0 0 1]
 [0 0 0 ... 0 0 1]]
<class 'numpy.ndarray'>


(7613, 2468)

In [9]:
x = array_train[:,0:c-1]   # feature values
y = array_train[:,c-1]     # targets

x_train = x
y_train = y

print(x.shape)
print(y.shape)
print('')
print(x_train.shape)
print(y_train.shape)
print('')
print(x_train)
print(y_train)

(7613, 2467)
(7613,)

(7613, 2467)
(7613,)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
[1 1 1 ... 1 1 1]


### **XGBoost: TUNE PARAMETERS**

In [10]:
num_folds = 5
seed = 11
scoring = 'f1'

model_xgb = XGBClassifier(colsample_bylevel=0.5, 
                          colsample_bytree=0.5, 
                          gamma=0, 
                          learning_rate=0.1, 
                          max_depth=8, 
                          min_child_weight=1.0, 
                          n_estimators=500, 
                          subsample=1)

#results = []
#names = []

In [11]:
start = time.time()
kfold = model_selection.KFold(n_splits=num_folds, random_state=seed, shuffle=True)
cv_results = model_selection.cross_val_score(model_xgb, x_train, y_train, cv=kfold, scoring=scoring)
elapsed_time = time.time() - start
    
#results.append(cv_results)
#names.append(name)
    
# print name, mean F1, standard deviation of accuracy, time taken
print(f'F1 (mean, std): \t {cv_results.mean()} \t {cv_results.std()} \t Time: {elapsed_time}')

Accuracy (mean, std): 	 0.7973192476637248 	 0.006967339520531227 	 Time: 90.09188580513


### **TRAIN MODEL ON ENTIRE TRAINING DATASET**

In [None]:
model_xgb.fit(x_train, y_train)

#predict_x = model_xgb.predict(x_validation)
#predict_round = [round(value) for value in predict_x]
#accuracy_x = metrics.accuracy_score(y_validation, predict_x)
#print(f'XGBoost Accuracy: {accuracy_x}')

### **KAGGLE TEST SET**

In [170]:
df_test = utils.csv_to_dataframe(DATA_FILE_TEST)
df_test.head(3)    

Unnamed: 0,kw_ablaze,kw_accident,kw_aftershock,kw_airplaneaccident,kw_ambulance,kw_annihilated,kw_annihilation,kw_apocalypse,kw_armageddon,kw_army,...,york,young,youth,youtub,yr,zone,û,ûïwhen,ûò,ûó
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [171]:
column_labels = list(range(0, c-1))
df_test.columns = column_labels
df_test.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1820,1821,1822,1823,1824,1825,1826,1827,1828,1829
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [172]:
array_test = df_test.values
x_test = array_test[:,0:c]   # feature values
print(type(x_test))
print(x_test.shape)

<class 'numpy.ndarray'>
(3263, 1830)


### **XGBoost PREDICTIONS**

In [173]:
predict_xgb = model_xgb.predict(x_test)
predict_xgb_round = [round(value) for value in predict_xgb]

In [175]:
predictions_list = predict_xgb_round

print(type(predictions_list))
print(len(predictions_list))
print(predictions_list[0:10])

<class 'list'>
3263
[1, 1, 1, 1, 1, 1, 0, 0, 0, 0]


### **PREPARE SUBMISSION FILE**

In [176]:
DATA_FILE_TEST_ID = os.path.join(DATA_DIR, "test_id.csv")

test_id_list = utils.csv_to_list_of_strings(DATA_FILE_TEST_ID)

print(len(test_id_list))
print(test_id_list[0:10])

3263
['0', '2', '3', '9', '11', '12', '21', '22', '27', '29']


In [177]:
df_test_id = utils.csv_to_dataframe(DATA_FILE_TEST_ID)
df_test_id.head(10)

Unnamed: 0,id
0,0
1,2
2,3
3,9
4,11
5,12
6,21
7,22
8,27
9,29


In [178]:
#predictions_list = predict_xgb_round

df_test_predict = pd.DataFrame({'col':predictions_list})
df_test_predict.columns = ['target']
df_test_predict.head(10)

Unnamed: 0,target
0,1
1,1
2,1
3,1
4,1
5,1
6,0
7,0
8,0
9,0


In [179]:
df_submit = pd.concat([df_test_id, df_test_predict], axis=1)
df_submit.head(10)

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,1
4,11,1
5,12,1
6,21,0
7,22,0
8,27,0
9,29,0


In [181]:
DATA_FILE_SUBMIT = os.path.join(DATA_DIR, "submission.csv")

utils.dataframe_to_csv(df_submit, DATA_FILE_SUBMIT)