# Notebook for feature engineering

# Import necessary libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import optuna 
import optuna.visualization as optvis
from sklearn.model_selection import cross_val_score

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import joblib

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

In [None]:
def opt_plot(study, plot):
    """Return the Optuna visualization selected by the integer code ``plot``.

    The code selects a function from ``optuna.visualization`` (``optvis``):

        0 -> plot_optimization_history
        1 -> plot_slice
        2 -> plot_parallel_coordinate
        3 -> plot_contour
        4 -> plot_param_importances

    The chosen function is called with ``study`` and its result is returned.
    Any other value of ``plot`` falls through and ``None`` is returned.
    """
    # The position of each name in this tuple is the code that selects it.
    plotter_names = (
        'plot_optimization_history',
        'plot_slice',
        'plot_parallel_coordinate',
        'plot_contour',
        'plot_param_importances',
    )
    for code, plotter_name in enumerate(plotter_names):
        if plot == code:
            # Resolve the plotting function only once a code has matched.
            return getattr(optvis, plotter_name)(study)
    return None

# Import Training Data

In [None]:
# Load the full training set from the Kaggle input directory.
train = pd.read_csv('../input/reducing-commercial-aviation-fatalities/train.csv')
# Uncomment to work on a 1% random sample of the rows instead of the full file.
#train = train.sample(frac=0.01)
# Report (rows, columns) of the loaded frame.
print(train.shape)

In [None]:
# Count missing values per column and display the counts as a single wide row.
train.isna().sum(axis=0).to_frame().T


# Feature Engineering

I am going to engineer features from three of the EEG montages shown in the following example notebook:
https://www.kaggle.com/code/stuartbman/introduction-to-physiological-data

## Montage 1

In [None]:
def add_montage_1(df):
    """Add the six montage-1 difference columns to ``df`` and return it.

    For every electrode pair ``(a, b)`` listed below, a column named
    ``'<a>_<b>'`` is written holding ``df['eeg_<a>'] - df['eeg_<b>']``.

    ``df`` is modified in place; the same object is returned so the call can
    be used in an assignment.
    """
    electrode_pairs = (
        ('f7', 'f8'),
        ('f3', 'f4'),
        ('t3', 't4'),
        ('c3', 'c4'),
        ('t5', 't6'),
        ('o1', 'o2'),
    )
    for first, second in electrode_pairs:
        df[f'{first}_{second}'] = df[f'eeg_{first}'] - df[f'eeg_{second}']
    return df

## Montage 2

In [None]:
def add_montage_2(df):
    """Add the eighteen montage-2 difference columns to ``df`` and return it.

    The montage is described as five electrode chains. For each pair of
    neighbouring electrodes ``(a, b)`` along a chain, a column ``'<a>_<b>'``
    is written holding ``df['eeg_<a>'] - df['eeg_<b>']``.

    ``df`` is modified in place; the same object is returned.
    """
    electrode_chains = (
        ('fp1', 'f7', 't3', 't5', 'o1'),
        ('fp1', 'f3', 'c3', 'p3', 'o1'),
        ('fz', 'cz', 'pz'),
        ('fp2', 'f8', 't4', 't6', 'o2'),
        ('fp2', 'f4', 'c4', 'p4', 'o2'),
    )
    for chain in electrode_chains:
        # Walk the chain and difference each electrode with its successor.
        for first, second in zip(chain, chain[1:]):
            df[f'{first}_{second}'] = df[f'eeg_{first}'] - df[f'eeg_{second}']
    return df


## Montage 3

In [None]:
def add_montage_3(df):
    """Add the fourteen montage-3 difference columns to ``df`` and return it.

    The montage is described as five electrode chains. For each pair of
    neighbouring electrodes ``(a, b)`` along a chain, a column ``'<a>_<b>'``
    is written holding ``df['eeg_<a>'] - df['eeg_<b>']``.

    ``'o1_o2'`` is also produced by ``add_montage_1``; if that column already
    exists it is simply overwritten with the same difference.

    ``df`` is modified in place; the same object is returned.
    """
    electrode_chains = (
        ('fp1', 'fp2'),
        ('f7', 'f3', 'fz', 'f4', 'f8'),
        ('t3', 'c3', 'cz', 'c4', 't4'),
        ('t5', 'p3', 'pz', 'p4', 't6'),
        ('o1', 'o2'),
    )
    for chain in electrode_chains:
        # Walk the chain and difference each electrode with its successor.
        for first, second in zip(chain, chain[1:]):
            df[f'{first}_{second}'] = df[f'eeg_{first}'] - df[f'eeg_{second}']
    return df

## Add New Features

In [None]:
# Run the three montage builders over the training frame, one after another.
# Each builder adds its difference columns and hands the frame back.
for add_montage in (add_montage_1, add_montage_2, add_montage_3):
    train = add_montage(train)


In [None]:
# Feature groups selected by column position.
# NOTE(review): presumably columns 4..26 are the raw sensor channels and
# columns 24..26 the non-EEG ones — verify against the train.csv header.
Original_Features = train.columns[4:27].to_list()
Other_Features = train.columns[24:27].to_list()

# Names of the engineered columns, listed in the order each add_montage_*
# function creates them.
Montage1 = ['f7_f8', 'f3_f4', 't3_t4', 'c3_c4', 't5_t6', 'o1_o2']
Montage2 = [
    'fp1_f7', 'f7_t3', 't3_t5', 't5_o1',
    'fp1_f3', 'f3_c3', 'c3_p3', 'p3_o1',
    'fz_cz', 'cz_pz',
    'fp2_f8', 'f8_t4', 't4_t6', 't6_o2',
    'fp2_f4', 'f4_c4', 'c4_p4', 'p4_o2',
]
# The second entry is 'f7_f3' (the column add_montage_3 creates); it was
# previously a second copy of 't3_c3', so 'f7_f3' was never selected.
Montage3 = [
    'fp1_fp2',
    'f7_f3', 'f3_fz', 'fz_f4', 'f4_f8',
    't3_c3', 'c3_cz', 'cz_c4', 'c4_t4',
    't5_p3', 'p3_pz', 'pz_p4', 'p4_t6',
    'o1_o2',
]

In [None]:
# Final feature list fed to the model. The montage lists overlap (for example
# 'o1_o2' is in both Montage1 and Montage3), and selecting a repeated label
# from a DataFrame yields repeated columns. dict.fromkeys drops the repeats
# while keeping first-seen order.
selected_columns = list(dict.fromkeys(Other_Features + Montage1 + Montage2 + Montage3))

In [None]:
# Feature matrix: only the columns named in selected_columns.
x_train = train[selected_columns]

In [None]:
# Target labels as a NumPy array, taken before the 'event' column is dropped.
y_train = train.event.values
# Drop the 'crew', 'experiment', 'time', 'seat' and 'event' columns in place.
# NOTE(review): x_train was already selected above and train is deleted in the
# next cell, so this drop looks redundant — confirm before removing.
train.drop(['crew', 'experiment', 'time', 'seat', 'event'], axis=1, inplace=True)

In [None]:
# Release the full training frame; only x_train and y_train are used below.
del train

# Reduce Memory Usage

def _smallest_fitting_dtype(c_min, c_max, candidates, info):
    """Return the first dtype in ``candidates`` whose range holds [c_min, c_max].

    ``info`` is ``np.iinfo`` or ``np.finfo``. Returns ``None`` when no
    candidate fits, including when the bounds are NaN or infinite.
    """
    for candidate in candidates:
        limits = info(candidate)
        if c_min >= limits.min and c_max <= limits.max:
            return candidate
    return None


def reduce_mem_usage(df):
    """Downcast the columns of ``df`` to smaller dtypes and return ``df``.

    The frame is modified in place and also returned. Memory usage before and
    after, and the percentage saved, are printed.

    Per-column behaviour, keyed on the NumPy dtype kind:

    * signed integers   -> smallest of int8 / int16 / int32 that holds the
      column's min and max (bounds inclusive);
    * unsigned integers -> smallest of uint8 / uint16 / uint32, likewise;
    * floats            -> float16 if the range fits, else float32;
    * object            -> pandas ``category``;
    * anything else (bool, datetime, timedelta, pandas extension dtypes)
      is left untouched.

    A column is never widened: if no smaller type fits, it keeps its dtype.

    Note that the float rule looks only at the value *range*. float16 keeps
    roughly three significant decimal digits, so float columns are rounded.
    """
    start_mem = df.memory_usage(index=True, deep=True).sum() / 1024**2
    print(f'Initial memory usage of dataframe is {start_mem:.2f} MB')

    # Candidate targets per dtype kind, ordered smallest first.
    downcast_plan = {
        'i': ((np.int8, np.int16, np.int32), np.iinfo),
        'u': ((np.uint8, np.uint16, np.uint32), np.iinfo),
        'f': ((np.float16, np.float32), np.finfo),
    }

    for col in df.columns:
        col_type = df[col].dtype

        # Extension dtypes (category, nullable ints, strings, ...) have no
        # NumPy kind to dispatch on; leave them as they are.
        if not isinstance(col_type, np.dtype):
            continue

        if col_type.kind == 'O':
            df[col] = df[col].astype('category')
            continue

        if col_type.kind not in downcast_plan:
            # bool, datetime, timedelta, complex, ...: comparing their min/max
            # with numeric limits is meaningless or raises, so skip them.
            continue

        candidates, info = downcast_plan[col_type.kind]
        # pandas min()/max() skip NaN; an empty or all-NaN column yields NaN,
        # which fails every range check below and leaves the column unchanged.
        target = _smallest_fitting_dtype(df[col].min(), df[col].max(), candidates, info)
        if target is not None:
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage(index=True, deep=True).sum() / 1024**2
    # Guard the percentage against a zero-sized starting footprint.
    reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print(f'Memory usage after optimization is: {end_mem:.2f} MB')
    print(f'Decreased by {reduction:.1f}%')

    return df

# Downcast the training features to smaller dtypes to cut memory use.
# NOTE(review): the test chunks in the prediction loop are not passed through
# reduce_mem_usage, so train and test features may differ in precision — confirm
# this is intended.
x_train = reduce_mem_usage(x_train)

# Check Training Data Shapes

In [None]:
# Sanity check: report the dimensions of the feature matrix and label vector.
for label, data in (('x_train', x_train), ('y_train', y_train)):
    print(f'{label} shape:', data.shape)

# Train Model

In [None]:
%%time
# Fit an extra-trees classifier with hand-fixed hyperparameters.
# fit() returns the estimator itself, so construction and fitting are chained.
ex_clf = ExtraTreesClassifier(
    n_estimators=115,
    criterion='entropy',
    max_depth=96,
    min_samples_leaf=1,
    random_state=1,
).fit(x_train, y_train)
# Accuracy on the training data itself (not a held-out estimate).
ex_clf.score(x_train, y_train)

# Test

In [None]:
%%time

# Number of test.csv rows read and scored per iteration.
# NOTE(review): 100 is very small for a chunk size; 1000000 was left here as
# the alternative value — confirm which one is intended for a full run.
cs = 100
#1000000

# Per-chunk prediction frames are collected here and concatenated once after
# the loop. DataFrame.append was removed in pandas 2.0, and growing a frame
# inside a loop re-copies all accumulated rows on every iteration.
partial_submissions = []

for i, test in enumerate(
    pd.read_csv('../input/reducing-commercial-aviation-fatalities/test.csv', chunksize=cs)
):
    # Build the same engineered columns that were added to the training data.
    test = add_montage_1(test)
    test = add_montage_2(test)
    test = add_montage_3(test)

    x_test = test[selected_columns]
    print(test.shape)

    print('--Iteration',i, 'is started')

    # One probability column per class.
    # NOTE(review): assumes ex_clf.classes_ holds exactly four classes in the
    # order A, B, C, D — confirm against the fitted model.
    test_pred = ex_clf.predict_proba(x_test)

    partial_submissions.append(pd.DataFrame({
        'id':test.id,
        'A':test_pred[:,0],
        'B':test_pred[:,1],
        'C':test_pred[:,2],
        'D':test_pred[:,3]
    }))

    # Drop the reference to the chunk before the next one is read.
    del test
    print('++Iteration', i, 'is done!')

# Stack all chunks into one frame with a fresh 0..N-1 index.
# pd.concat raises ValueError if test.csv produced no chunks at all.
submission = pd.concat(partial_submissions, ignore_index=True)

In [None]:
# Histograms of the predicted values, one panel per submission column 1..4
# (column 0 is skipped), laid out on a 2x2 grid.
plt.figure(figsize=[8, 4])
for panel in range(1, 5):
    plt.subplot(2, 2, panel)
    plt.hist(submission.iloc[:, panel], bins=20, edgecolor='k')
plt.tight_layout()
plt.show()

In [None]:
# Preview the first rows of the assembled submission.
submission.head()

In [None]:
# Write the submission to the working directory without the DataFrame index.
submission.to_csv("submission.csv", index=False)