# Submission Notebook

# Import Libraries

In [None]:
# Import numpy, pandas, and matplotlib using the standard aliases.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Import the following tools from sklearn:
#     Pipeline, SimpleImputer, ColumnTransformer, OneHotEncoder, StandardScaler, train_test_split
#     LogisticRegression, DecisionTreeClassifier, RandomForestClassifier, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier



from sklearn.model_selection import GridSearchCV
# Import joblib
import joblib
import os
import gc

# Load Training Data

In [None]:
# Read the full training set and shuffle its rows; the fixed seed keeps the
# shuffled order the same on every run.
train = (
    pd.read_csv('../input/reducing-commercial-aviation-fatalities/train.csv')
    .sample(frac=1, random_state=1)
)
print(train.shape)

In [None]:
# Pull the target out first: the 'event' column is removed from `train` by the
# drop below.
y_train = train['event'].values

# Remove the target and the four columns that are not used as features.
# `train` is modified in place, so this cell cannot be re-run without first
# reloading the data.
non_feature_cols = ['crew', 'experiment', 'time', 'seat', 'event']
train.drop(columns=non_feature_cols, inplace=True)

In [None]:
# Feature matrix: the first 27 columns of the trimmed frame (all of them if it
# has fewer than 27), selected by position.
x_train = train.iloc[:, :27]
x_train.head()

# Train Model

In [None]:
# Fit a single decision tree with a fixed seed; fit() returns the estimator
# itself, so construction and fitting are chained.
tree_params = {'random_state': 1, 'max_depth': 192, 'min_samples_leaf': 155}
dt_clf = DecisionTreeClassifier(**tree_params).fit(x_train, y_train)

# Accuracy on the same data the tree was fitted on (not a held-out estimate).
dt_clf.score(x_train, y_train)

# Test

In [None]:
%%time

cs = 1000000
i = 0

for test in pd.read_csv('../input/reducing-commercial-aviation-fatalities/test.csv', chunksize=cs):
    
    print('--Iteration',i, 'is started')
    
    test_pred = dt_clf.predict_proba(test.iloc[:,5:])
    
    partial_submission = pd.DataFrame({
        'id':test.id,
        'A':test_pred[:,0],
        'B':test_pred[:,1],
        'C':test_pred[:,2],
        'D':test_pred[:,3]
    })
    
    if i == 0:
        submission = partial_submission.copy()
    else:
        submission = submission.append(partial_submission, ignore_index=True)
        
    del test
    print('++Iteration', i, 'is done!')
    i +=1

In [None]:
# Histogram of the predicted probabilities, one panel per probability column
# (the four columns after 'id'). Each panel is titled with its column name so
# the classes can be told apart.
plt.figure(figsize=[8, 4])
for panel, class_col in enumerate(submission.columns[1:5], start=1):
    plt.subplot(2, 2, panel)
    plt.hist(submission[class_col], bins=20, edgecolor='k')
    plt.title(class_col)
plt.tight_layout()
plt.show()

In [None]:
# Preview the first five rows of the assembled submission.
submission.head(n=5)

In [None]:
# Write the submission file without the DataFrame index column.
submission.to_csv(path_or_buf="submission.csv", index=False)