# Submission Notebook

# Import Libraries

In [None]:
# Standard library
import gc
import os

# Import numpy, pandas, and matplotlib using the standard aliases.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Import joblib
import joblib

# Import the following tools from sklearn:
#     Pipeline, SimpleImputer, ColumnTransformer, OneHotEncoder, StandardScaler
#     LogisticRegression, DecisionTreeClassifier, RandomForestClassifier, GridSearchCV
#     train_test_split, log_loss
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import log_loss

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# Import Data (test preview and sample submission)

In [None]:
# Open the test CSV lazily (5 rows per chunk) and pull only the first chunk,
# so the file's layout can be inspected without loading all of it.
test_iterator = pd.read_csv(
    '../input/reducing-commercial-aviation-fatalities/test.csv',
    chunksize=5,
)
test_top = next(test_iterator)
test_top

In [None]:
# Load the sample submission and show ten randomly chosen rows of it.
submission = pd.read_csv(
    "../input/reducing-commercial-aviation-fatalities/sample_submission.csv"
)
submission.sample(n=10)

# Decision Tree

In [None]:
# Read the full training set, then shuffle every row (frac=1) with a fixed
# seed so the row order is random but reproducible.
train = (
    pd.read_csv('../input/reducing-commercial-aviation-fatalities/train.csv')
    .sample(frac=1, random_state=1)
)
print(train.shape)

In [None]:
# Pull the target out as a NumPy array before its column is removed.
y_train = train['event'].values

# Remove the target and the four non-feature columns from `train` itself.
# NOTE: the drop is in place, so running this cell a second time fails
# because 'event' (and the other columns) are already gone.
train.drop(
    columns=['crew', 'experiment', 'time', 'seat', 'event'],
    inplace=True,
)

In [None]:
# Feature matrix: at most the first 27 columns of `train`, taken by position
# (a slice past the last column simply returns every column that exists).
x_train = train.iloc[:, :27]
x_train.head(5)

In [None]:
# Build one stratified hold-out split over row positions.
# test_size=0.8 sends 80% of the rows to the validation side, so each grid
# candidate is fit on the remaining 20%.
row_positions = range(len(x_train))
train_idx, valid_idx = train_test_split(
    row_positions,
    test_size=0.8,
    random_state=1,
    stratify=y_train,
)

# Report the size of each side of the split.
for fold in (train_idx, valid_idx):
    print(len(fold))

# A one-element list of (train, validation) index pairs, used later as `cv`.
indices = [(train_idx, valid_idx)]

In [None]:
%%time 

dt_clf = DecisionTreeClassifier(random_state=1)

dt_parameters = {
    'max_depth': [8,16, 24, 32, 40],
    'min_samples_leaf': [8, 16, 24, 32, 40]
}
#we do 3 max depth by 2 min samples = 6

dt_grid = GridSearchCV(dt_clf, dt_parameters, cv=indices, refit='True', n_jobs=-1, verbose=10, scoring='accuracy')
dt_grid.fit(x_train, y_train)
#tell you which fit it is on #10

dt_model = dt_grid.best_estimator_

print('Best Parameters:', dt_grid.best_params_)
print('Best CV Score:  ', dt_grid.best_score_)
print('Training Acc:   ', dt_model.score(x_train, y_train))

In [None]:
# Display the per-class probabilities the tuned tree assigns to each training row.
dt_model.predict_proba(X=x_train)

In [None]:
# Log loss of the tuned tree on the training rows (log_loss is imported in
# the notebook's import cell rather than mid-notebook).
# This is a training-set figure: dt_model was fit on these same rows.
log_loss(y_train, dt_model.predict_proba(x_train))

# Test

In [None]:
# Read just the first five rows of the test CSV (one chunk of size 5) to
# check its columns before scoring the whole file.
test_iterator = pd.read_csv(
    '../input/reducing-commercial-aviation-fatalities/test.csv',
    chunksize=5,
)
test_top = next(test_iterator)
test_top

In [None]:
# Smoke-test the model on the preview rows. The first five columns are
# skipped by position — presumably id, crew, experiment, time and seat;
# confirm against the test_top display.
print(
    dt_model.predict_proba(test_top.iloc[:, 5:])
)

In [None]:
%%time

cs = 1000000
i = 0

for test in pd.read_csv('../input/reducing-commercial-aviation-fatalities/test.csv', chunksize=cs):
    
    print('--Iteration',i, 'is started')
    
    test_pred = dt_model.predict_proba(test.iloc[:,5:])
    
    partial_submission = pd.DataFrame({
        'id':test.id,
        'A':test_pred[:,0],
        'B':test_pred[:,1],
        'C':test_pred[:,2],
        'D':test_pred[:,3]
    })
    
    if i == 0:
        submission = partial_submission.copy()
    else:
        submission = submission.append(partial_submission, ignore_index=True)
        
    del test
    print('++Iteration', i, 'is done!')
    i +=1

In [None]:
# Show the first five rows of the assembled submission as a sanity check.
submission.head(5)

In [None]:
# Write the submission file to the working directory, without the index column.
submission.to_csv(path_or_buf="submission.csv", index=False)