# Import Statements

In [None]:
# Import numpy, pandas, and matplotlib using the standard aliases.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Import the following tools from sklearn: 
#     Pipeline, SimpleImputer, ColumnTransformer, OneHotEncoder, StandardScaler
#     LogisticRegression, DecisionTreeClassifier, RandomForestClassifier, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier



from sklearn.model_selection import GridSearchCV
# Import joblib
import joblib
import os
import gc

In [None]:
# Show every column when a DataFrame is displayed (None = no limit).
# The fully qualified key is used on purpose: option names are matched as
# patterns, and on recent pandas versions the bare 'max_columns' matches more
# than one option (e.g. 'display.max_columns' and 'styler.render.max_columns'),
# which makes set_option raise OptionError.
pd.set_option('display.max_columns', None)

# Load Training Data

In [None]:
# Read the competition training file and shuffle its rows with a fixed seed
# (sample(frac=1) returns every row, in random order).
train = pd.read_csv(
    '../input/reducing-commercial-aviation-fatalities/train.csv'
).sample(frac=1, random_state=1)
print(train.shape)
# Shape recorded by the author: 4,867,421 observations x 28 columns.

In [None]:
# Total in-memory size of the frame in MiB (bytes / 1024**2). deep=True also
# counts the contents of object columns; index=True includes the index.
mb = train.memory_usage(index=True, deep=True).sum() / 1024**2
print(mb)

In [None]:
# Preview the first rows of the shuffled training frame.
train.head()

In [None]:
# Force a full garbage-collection pass before the next memory-heavy step.
gc.collect()

# Reducing Memory Usage

In [None]:
def _smallest_fitting_dtype(lo, hi, candidates, limits):
    """Return the first dtype in *candidates* whose range contains [lo, hi].

    *limits* is ``np.iinfo`` or ``np.finfo``. Returns ``None`` when nothing
    fits (which also covers NaN bounds, since every comparison is then False).
    """
    for candidate in candidates:
        bounds = limits(candidate)
        # Inclusive on both ends: a value equal to the dtype's own min/max
        # is representable and must not force the next wider type.
        if bounds.min <= lo and hi <= bounds.max:
            return candidate
    return None


def reduce_mem_usage(df, *, allow_float16=True):
    """Downcast the columns of *df* in place to shrink its memory footprint.

    Signed and unsigned integer columns are cast to the narrowest integer
    type of the same signedness that holds their min and max. Float columns
    are cast to the narrowest float type whose range holds their min and max.
    Object and pandas string columns become ``category``. Every other dtype
    (bool, datetime, existing categoricals, nullable extension types, ...)
    is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    allow_float16 : bool, keyword-only, default True
        When True (the historical behaviour) floats may be cast to float16.
        The choice is made on value *range* only, so precision can be lost;
        pass False to stop at float32.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.

    Notes
    -----
    Prints the memory usage before and after, and the percentage saved.
    """
    start_mem = df.memory_usage(index=True, deep=True).sum() / 1024**2
    print(f'Initial memory usage of dataframe is {start_mem:.2f} MB')

    signed_ladder = (np.int8, np.int16, np.int32, np.int64)
    unsigned_ladder = (np.uint8, np.uint16, np.uint32, np.uint64)
    float_ladder = (np.float16, np.float32) if allow_float16 else (np.float32,)

    for col in df.columns:
        col_type = df[col].dtype

        # Text-like columns are stored as categoricals.
        if col_type == object or isinstance(col_type, pd.StringDtype):
            df[col] = df[col].astype('category')
            continue

        # Only plain NumPy numeric dtypes are downcast; extension dtypes and
        # non-numeric kinds (bool, datetime, ...) have no meaningful ladder.
        kind = col_type.kind if isinstance(col_type, np.dtype) else None
        if kind not in ('i', 'u', 'f'):
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if kind == 'i':
            target = _smallest_fitting_dtype(c_min, c_max, signed_ladder, np.iinfo)
        elif kind == 'u':
            target = _smallest_fitting_dtype(c_min, c_max, unsigned_ladder, np.iinfo)
        else:
            target = _smallest_fitting_dtype(c_min, c_max, float_ladder, np.finfo)

        # Cast only when it actually narrows the column; never widen.
        if target is not None and np.dtype(target).itemsize < col_type.itemsize:
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage(index=True, deep=True).sum() / 1024**2
    # Guard the percentage against a zero-sized starting frame.
    reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print(f'Memory usage after optimization is: {end_mem:.2f} MB')
    print(f'Decreased by {reduction:.1f}%')

    return df

# Downcast the training frame's columns and rebind `train` to the returned frame.
train = reduce_mem_usage(train)

# Check Missing Values

In [None]:
# Count the nulls in every column and lay the counts out as one wide row.
train.isnull().sum().to_frame().T
# Author's observation from the output: the data contains no missing values.

# Check Label Distribution

In [None]:
# Fraction of rows carrying each `event` label (class counts / total rows).
(train.event.value_counts() / len(train)).to_frame()

In [None]:
# Pull the label out first, then remove it from `train` together with the
# first four columns (crew, experiment, time, seat), which are not used as
# features.
y_train = train['event'].values
train.drop(columns=['crew', 'experiment', 'time', 'seat', 'event'], inplace=True)

In [None]:
# Take the feature columns by position. The slice starts at 0 rather than 4
# because the four leading columns have already been dropped from `train`.
# NOTE(review): if the raw frame had 28 columns as recorded at load time, only
# 23 remain here, so the upper bound 27 just selects all of them - confirm.
x_train = train.iloc[:,0:27]
x_train.head()

In [None]:
# Force a full garbage-collection pass before splitting and model fitting.
gc.collect()

# Split The Data

In [None]:
# Split row positions (not the data itself) into a training part and a
# validation part, stratified on the label. With test_size=0.8 the second
# part (valid_idx) receives 80% of the rows.
train_idx, valid_idx = train_test_split(
    range(len(x_train)),
    test_size=0.8,
    random_state=1,
    stratify=y_train,
)

print(len(train_idx), len(valid_idx), sep='\n')

# A one-element list of (train, validation) index pairs, in the form that
# GridSearchCV accepts for its `cv` argument.
indices = [(train_idx, valid_idx)]

# Model Selection

## Decision Tree

In [None]:
%%time 

# Tune a decision tree over max_depth x min_samples_leaf using the single
# predefined train/validation split stored in `indices`.
dt_clf = DecisionTreeClassifier(random_state=1)

dt_parameters = {
    'max_depth': [8, 16, 24, 32, 40],
    'min_samples_leaf': [8, 16, 24, 32, 40],
}
# 5 max_depth values x 5 min_samples_leaf values = 25 candidates, each fitted
# once because `indices` holds exactly one split.

# refit takes the boolean True. The string 'True' is accepted only because
# any non-empty string is truthy; for refit, a string is meant to name a scorer.
# verbose=10 makes the search report progress for each fit.
dt_grid = GridSearchCV(dt_clf, dt_parameters, cv=indices, refit=True, n_jobs=-1, verbose=10, scoring='accuracy')
dt_grid.fit(x_train, y_train)

# With refit=True the best candidate is retrained on all of x_train.
dt_model = dt_grid.best_estimator_

print('Best Parameters:', dt_grid.best_params_)
print('Best CV Score:  ', dt_grid.best_score_)
# Scored on the same full data the refit model was trained on, so this is a
# training accuracy, not a held-out estimate.
print('Training Acc:   ', dt_model.score(x_train, y_train))

In [None]:
# One row per grid candidate: its parameters plus its validation score.
dt_summary = pd.DataFrame(dt_grid.cv_results_['params']).assign(
    cv_score=dt_grid.cv_results_['mean_test_score']
)

# Draw one validation-score curve per min_samples_leaf value, against depth.
for ms in dt_parameters['min_samples_leaf']:
    rows = dt_summary[dt_summary['min_samples_leaf'] == ms]
    plt.plot(rows['max_depth'], rows['cv_score'], label=ms)
plt.xlabel('Maximum Depth')
plt.ylabel('CV Score')
plt.legend(title='Min Samples')
plt.grid()
plt.show()

print(dt_summary.to_string(index=False))

## Random Forest

In [None]:
%%time 

# Tune a 50-tree random forest over the same 5 x 5 grid as the decision tree,
# again using the single predefined train/validation split in `indices`.
rf_clf = RandomForestClassifier(random_state=1, n_estimators=50)

rf_parameters = {
    'max_depth': [8, 16, 24, 32, 40],
    'min_samples_leaf': [8, 16, 24, 32, 40],
}

# refit takes the boolean True. The string 'True' is accepted only because
# any non-empty string is truthy; for refit, a string is meant to name a scorer.
rf_grid = GridSearchCV(rf_clf, rf_parameters, cv=indices, refit=True, n_jobs=-1, verbose=0, scoring='accuracy')
rf_grid.fit(x_train, y_train)

# With refit=True the best candidate is retrained on all of x_train.
rf_model = rf_grid.best_estimator_

print('Best Parameters:', rf_grid.best_params_)
print('Best CV Score:  ', rf_grid.best_score_)
# Scored on the same full data the refit model was trained on, so this is a
# training accuracy, not a held-out estimate.
print('Training Acc:   ', rf_model.score(x_train, y_train))

In [None]:
# One row per grid candidate: its parameters plus its validation score.
rf_summary = pd.DataFrame(rf_grid.cv_results_['params']).assign(
    cv_score=rf_grid.cv_results_['mean_test_score']
)

# Draw one validation-score curve per min_samples_leaf value, against depth.
for ms in rf_parameters['min_samples_leaf']:
    rows = rf_summary[rf_summary['min_samples_leaf'] == ms]
    plt.plot(rows['max_depth'], rows['cv_score'], label=ms)
plt.xlabel('Maximum Depth')
plt.ylabel('CV Score')
plt.legend(title='Min Samples')
plt.grid()
plt.show()

print(rf_summary.to_string(index=False))

# Save Model

In [None]:
# Train the final decision tree on all of x_train with the hyper-parameters
# the grid search selected. They are taken from dt_grid.best_params_ directly
# (not retyped by hand) so the saved model cannot drift from the search result.
print(dt_grid.best_params_)
final_model = DecisionTreeClassifier(random_state=1, **dt_grid.best_params_)
final_model.fit(x_train, y_train)
# Training accuracy on the full data. The author recorded 0.988669153541475
# here for max_depth=40, min_samples_leaf=8.
print(final_model.score(x_train, y_train))
# Persist the fitted model; download the file and upload it as a Kaggle dataset.
# NOTE(review): the original note said "both files", but this cell writes only
# one file - confirm what the second file was meant to be.
joblib.dump(final_model, 'aviation_model_01.joblib')
print('Model written to file.')