In [None]:
!pip install biosppy

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from biosppy.signals import ecg, resp, eeg, eda
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import log_loss

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

In [None]:
# Load the competition CSV files (training frame first, then the test frame).
train = pd.read_csv("../input/reducing-commercial-aviation-fatalities/train.csv")
test = pd.read_csv("../input/reducing-commercial-aviation-fatalities/test.csv")

# Reduce memory usage

In [None]:
def reduce_mem_usage(df, use_float16=False):
    """Downcast the columns of ``df`` in place and report the memory saving.

    * Signed-integer columns are cast to the narrowest of int8/int16/int32
      whose range contains the column's min and max (bounds inclusive).
    * Float columns are cast to float32 when their values fit.  float16 is
      only tried when ``use_float16`` is true: half precision keeps roughly
      three significant decimal digits, which is enough to merge neighbouring
      timestamps and distort sensor readings.
    * Object columns become ``category``.
    * Every other dtype (bool, unsigned, datetime, category, pandas extension
      dtypes) is left untouched, so calling the function again on its own
      output is safe.

    A column is never widened, and a column whose min/max cannot be compared
    (all-NaN or empty) keeps its dtype.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink.  It is modified in place.
    use_float16 : bool, default False
        Allow float columns to be stored as float16 when their range fits.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after downcasting.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    # Candidate target dtypes, narrowest first.
    int_candidates = (np.int8, np.int16, np.int32)
    float_candidates = (np.float16, np.float32) if use_float16 else (np.float32,)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object:
            df[col] = df[col].astype('category')
            continue

        # Only plain NumPy signed-int / float columns are downcast.
        if not isinstance(col_type, np.dtype) or col_type.kind not in ('i', 'f'):
            continue

        if col_type.kind == 'i':
            candidates, limits = int_candidates, np.iinfo
        else:
            candidates, limits = float_candidates, np.finfo

        c_min = df[col].min()
        c_max = df[col].max()
        for candidate in candidates:
            # Candidates are ordered by size: once one is not smaller than the
            # current dtype there is nothing left to gain.
            if np.dtype(candidate).itemsize >= col_type.itemsize:
                break
            # NaN bounds compare False, leaving the column unchanged.
            if c_min >= limits(candidate).min and c_max <= limits(candidate).max:
                df[col] = df[col].astype(candidate)
                break

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(saved_pct))

    return df


In [None]:
# Downcast both frames (train first, then test) to shrink their memory footprint.
train, test = [reduce_mem_usage(frame) for frame in (train, test)]

# Explore training data

In [None]:
# Peek at the first five rows of the training frame.
train.head(n=5)

In [None]:
# Row counts for every (experiment, event) combination.
pd.crosstab(index=train['experiment'], columns=train['event'])

In every experiment, a pilot's mental state is labelled either as the state that the experiment is designed to induce or as the baseline state.

In [None]:
# Row counts for every (experiment, crew) combination.
pd.crosstab(index=train['experiment'], columns=train['crew'])

All crews (except crew 1) have roughly the same number of samples from each experiment.

In [None]:
# Row counts for every (event, crew) combination.
pd.crosstab(index=train['event'], columns=train['crew'])

The total number of samples in each state differs between crews, but not by much.

In [None]:
# Row counts for every (event, seat) combination.
pd.crosstab(index=train['event'], columns=train['seat'])

The events are distributed quite evenly across the seats.

In [None]:
# Select one pilot (crew + seat) in one experiment and order the rows by time.
crew = 13
seat = 1
exp = 'SS'
ev = 'B'

sel = (train.crew == crew) & (train.experiment == exp) & (train.seat == seat)
pilot_info = train.loc[sel,:].sort_values(by='time')
# Rows of this recording that carry the highlighted event label.
event_rows = pilot_info.loc[pilot_info.event == ev,:]

# One panel per column at positions 4..26; samples labelled `ev` are drawn in red on top.
fig = plt.figure(figsize=[16,12])
for panel, col_pos in enumerate(range(4, 27), start=1):
    ax = fig.add_subplot(6, 4, panel)
    ax.plot(pilot_info.time, pilot_info.iloc[:, col_pos], zorder=1)
    ax.scatter(event_rows.time, event_rows.iloc[:, col_pos], c='red', zorder=2, s=1)
    ax.set_title(pilot_info.columns[col_pos])

plt.tight_layout()
plt.show()

# Explore the data and filter out noise

## Respiration

In [None]:
# Visualize data: one pilot (crew 4, seat 1) during the 'CA' experiment.
subset = train.loc[(train['crew'] == 4) & (train['seat'] == 1) & (train['experiment'] == 'CA')]

# sort_values returns a new frame, so its result has to be assigned; the index is
# reset so that plotting a column draws the samples in chronological order.
subset = subset.sort_values(by='time').reset_index(drop=True)


# Show the plot
plt.plot(subset['r'])
plt.show()

##### There is noise in the data. We will use the BioSPPy library to filter the signal and compute the respiration rate.

In [None]:
# Process the respiration signal of the selected pilot with BioSPPy and plot
# the respiration-rate series it returns.
reps_rate = resp.resp(signal=subset["r"], sampling_rate=256, show=False)

fig, ax = plt.subplots()
ax.plot(reps_rate['resp_rate_ts'], reps_rate['resp_rate'])
ax.set_xlabel('Time [s]')
ax.set_ylabel('Respiratory frequency [Hz]')
plt.show()

## ECG

In [None]:
# Raw ECG samples of the selected pilot, plotted against the frame index.
fig, ax = plt.subplots()
ax.plot(subset['ecg'])
plt.show()

In [None]:
# Process the ECG signal of the selected pilot with BioSPPy and plot the
# heart-rate series it returns.
heart_rate = ecg.ecg(signal=subset["ecg"].values, sampling_rate=256, show=False)

fig, ax = plt.subplots()
ax.plot(heart_rate['heart_rate_ts'], heart_rate['heart_rate'])
ax.set_xlabel('Time [s]')
ax.set_ylabel('Heart Rate (BPM)')
plt.show()

## EEG

In [None]:
# The 20 EEG channel columns; their order fixes the column order of the signal matrix.
eeg_features = [
    "eeg_fp1", "eeg_f7", "eeg_f8", "eeg_t4", "eeg_t6",
    "eeg_t5", "eeg_t3", "eeg_fp2", "eeg_o1", "eeg_p3",
    "eeg_pz", "eeg_f3", "eeg_fz", "eeg_f4", "eeg_c4",
    "eeg_p4", "eeg_poz", "eeg_c3", "eeg_cz", "eeg_o2",
]

# Band-power features for all channels of the selected pilot.
eeg_out = eeg.get_power_features(signal=subset[eeg_features].values, sampling_rate=256.)

# Raw signal of the first channel, for comparison with the band powers.
fig, ax = plt.subplots()
ax.plot(subset['eeg_fp1'].values)
ax.set_title('raw eeg fp1')
plt.show()

In [None]:
# One figure per frequency band, each showing column 0 of the band matrix
# (the first entry of eeg_features, i.e. fp1).
for band in ('theta', 'alpha_low', 'alpha_high', 'beta', 'gamma'):
    plt.figure()
    plt.plot(eeg_out['ts'], eeg_out[band][:,0])
    plt.title(band + ' eeg fp1')
plt.show()

# Creating features

In [None]:
# Pass the 256 Hz sampling rate explicitly (the same value used when exploring
# the single-pilot subset); otherwise BioSPPy falls back to its own default
# rate and both the rate values and their timestamps come out scaled.
train_resp_rate = resp.resp(signal = train["r"], sampling_rate=256, show=False)
train_resp_rate1 = train_resp_rate['resp_rate']
train_resp_rate2 = train_resp_rate['resp_rate_ts']

# The rate series is much shorter than the frame, so it has to be
# interpolated before it can be stored as a column.
print(len(train_resp_rate1))
print(len(train["time"]))

In [None]:
from scipy.interpolate import interp1d

def map_timestamped(df_times, new_feature_ts, new_feature_data):
    """Resample a sparsely timestamped 1-D feature onto the times in ``df_times``.

    With four or more samples a cubic interpolant is used, extrapolating
    outside the sampled range.  Cubic interpolation needs at least four
    points, so shorter series fall back to the highest order they support
    instead of raising: quadratic (3 samples), linear (2), a constant (1),
    or all-NaN when there are no samples at all.

    Parameters
    ----------
    df_times : array-like
        Times at which the feature is wanted.
    new_feature_ts : array-like
        Timestamps of the available feature samples.
    new_feature_data : array-like
        Feature values at ``new_feature_ts`` (1-D, same length).

    Returns
    -------
    numpy.ndarray
        Interpolated values, one per entry of ``df_times``.
    """
    times = np.asarray(df_times, dtype=float)
    ts = np.asarray(new_feature_ts, dtype=float)
    data = np.asarray(new_feature_data, dtype=float)

    n_points = ts.shape[0]
    if n_points == 0:
        return np.full(times.shape, np.nan)
    if n_points == 1:
        return np.full(times.shape, data[0])

    kind = {2: 'linear', 3: 'quadratic'}.get(n_points, 'cubic')
    f = interp1d(ts, data, kind=kind, fill_value="extrapolate")
    return f(times)

In [None]:
def create_new_df_col(df, sensor_list, time, sampling_rate=256.):
    """Add a pilot id, paired-electrode EEG differences and BioSPPy features to ``df``.

    Always added:

    * ``pilot_id`` - integer built from the digits of ``crew`` followed by
      the digits of ``seat`` (``crew`` and ``seat`` are left as strings).
    * ``f7_f8``, ``f3_f4``, ``t3_t4``, ``c3_c4``, ``p3_p4``, ``t5_t6``,
      ``o1_o2`` - difference of the two named EEG channels.

    Added depending on ``sensor_list``:

    * ``'r'``   -> ``resp_rate``, from ``resp.resp`` on column ``r``.
    * ``'ecg'`` -> ``heart_rate``, from ``ecg.ecg`` on column ``ecg``.
    * ``'eeg'`` -> ``<channel>_<band>`` for each of the 20 EEG channels and
      the bands theta, alpha_low, alpha_high, beta and gamma.

    Each BioSPPy output is resampled onto ``time`` with ``map_timestamped``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding ``crew``, ``seat``, the ``eeg_*`` channels and the
        requested sensor columns.  It is modified in place.
    sensor_list : container of str
        Any of ``'r'``, ``'ecg'``, ``'eeg'``.
    time : array-like
        Time of every row of ``df``; the features are interpolated onto it.
    sampling_rate : float, default 256.
        Sampling frequency passed to every BioSPPy call.  It must be passed
        explicitly: per the BioSPPy documentation the library otherwise
        assumes 1000 Hz, which scales both the rates and their timestamps.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the new columns.
    """
    df['crew'] = df['crew'].astype(str)
    df['seat'] = df['seat'].astype(str)
    # String concatenation: crew "13" and seat "1" give pilot_id 131.
    df['pilot_id'] = (df['crew'] + df['seat']).astype(int)

    # Differences between paired electrodes.
    electrode_pairs = (('f7', 'f8'), ('f3', 'f4'), ('t3', 't4'), ('c3', 'c4'),
                       ('p3', 'p4'), ('t5', 't6'), ('o1', 'o2'))
    for first, second in electrode_pairs:
        df[first + '_' + second] = df['eeg_' + first] - df['eeg_' + second]

    # NOTE(review): every signal below is processed as one continuous recording
    # covering all rows of df, and its timestamps are matched against `time`.
    # If df stacks several pilots/experiments whose `time` restarts, the
    # features should presumably be computed per recording - confirm.
    if 'r' in sensor_list:
        # float64 keeps the filtering independent of any downcast storage dtype.
        df_resp = resp.resp(signal=np.asarray(df['r'], dtype=float),
                            sampling_rate=sampling_rate, show=False)
        df['resp_rate'] = map_timestamped(time, df_resp['resp_rate_ts'], df_resp['resp_rate'])

    if 'ecg' in sensor_list:
        df_heart = ecg.ecg(signal=np.asarray(df['ecg'], dtype=float),
                           sampling_rate=sampling_rate, show=False)
        df['heart_rate'] = map_timestamped(time, df_heart['heart_rate_ts'], df_heart['heart_rate'])

    if 'eeg' in sensor_list:
        eeg_features = ["eeg_fp1", "eeg_f7", "eeg_f8", "eeg_t4", "eeg_t6", "eeg_t5",
                        "eeg_t3", "eeg_fp2", "eeg_o1", "eeg_p3", "eeg_pz", "eeg_f3",
                        "eeg_fz", "eeg_f4", "eeg_c4", "eeg_p4", "eeg_poz", "eeg_c3",
                        "eeg_cz", "eeg_o2"]
        df_eeg = eeg.get_power_features(signal=np.asarray(df[eeg_features], dtype=float),
                                        sampling_rate=sampling_rate)
        ts = df_eeg['ts']
        bands = ('theta', 'alpha_low', 'alpha_high', 'beta', 'gamma')
        # Column i of every band matrix belongs to channel eeg_features[i].
        for i, elt in enumerate(eeg_features):
            for band in bands:
                # Interpolate onto the caller-supplied `time`, like the other sensors.
                df[elt + '_' + band] = map_timestamped(time, ts, df_eeg[band][:, i])

    return df

In [None]:
# Add the engineered columns (respiration and ECG sensors) to the training frame.
train = create_new_df_col(df=train, sensor_list=['r', 'ecg'], time=train['time'])
train.head()

In [None]:
%%time
# Same feature engineering for the test frame.
test = create_new_df_col(df=test, sensor_list=['r', 'ecg'], time=test['time'])

test.head()

# Create training and test set

In [None]:
# Model inputs: drop the crew/seat/experiment identifiers and the raw 'r' and
# 'ecg' signals; the training frame additionally loses its label column.
# `columns=` is used because DataFrame.drop no longer accepts `axis` as a
# positional argument in pandas 2.x.
training_data = train.drop(columns=['crew', 'seat', 'experiment', 'r', 'ecg', 'event'])
training_label = train.event
test_data = test.drop(columns=['id', 'crew', 'seat', 'experiment', 'r', 'ecg'])
test_id = test.id
test_data.head()

In [None]:
# First five rows of the feature matrix used for training.
training_data.head(n=5)

In [None]:
# Hold out 10% of the rows for validation, stratified on the event label.
X_train, X_valid, y_train, y_valid = train_test_split(
    training_data,
    training_label,
    test_size=0.10,
    stratify=training_label,
    random_state=1,
)

In [None]:
# Distinct pilot ids present in the training split, in ascending order.
pilot_list = list(X_train['pilot_id'].unique())
pilot_list.sort()
print(pilot_list)

# Gradient Boosting

In [None]:
%%time

# Event labels in the order of the submission columns.  The models are trained
# on the integer position of each label in this list, because recent XGBoost
# releases reject non-numeric class labels.
event_classes = ['A', 'B', 'C', 'D']

# One model per pilot; model_list[k] belongs to pilot_list[k].
model_list = []
for pilot in pilot_list:
    train_rows = X_train['pilot_id'] == pilot
    valid_rows = X_valid['pilot_id'] == pilot

    X_tr = X_train[train_rows].drop('pilot_id', axis = 1)
    y_tr = pd.Categorical(y_train[train_rows], categories=event_classes).codes.astype(int)
    X_val = X_valid[valid_rows].drop('pilot_id', axis = 1)
    y_val = pd.Categorical(y_valid[valid_rows], categories=event_classes).codes.astype(int)

    # subsample=0.5 makes training stochastic, so the seed is fixed for reproducibility.
    gb_mod = XGBClassifier(learning_rate=0.3, max_depth=6, alpha=1, n_estimators=50,
                           subsample=0.5, random_state=1)
    model_list.append(gb_mod)

    gb_mod.fit(X_tr, y_tr)
    print('Training Accuracy of pilot   ' + str(pilot), gb_mod.score(X_tr, y_tr))
    print('Validation Accuracy of pilot ' + str(pilot), gb_mod.score(X_val, y_val))
    # `labels` ties the probability columns to the model's classes even when a
    # class happens to be missing from this pilot's validation rows.
    print('Log loss of pilot            ' + str(pilot),
          log_loss(y_val, gb_mod.predict_proba(X_val), labels=gb_mod.classes_))
    print('')

In [None]:
%%time
# Column 0 holds the row id; columns 1-4 receive the per-class probabilities.
all_results = np.zeros((test_id.shape[0], 5))
all_results[:, 0] = test_id

# model_list is aligned with pilot_list, so each pilot is scored by its own model.
for pilot, pilot_model in zip(pilot_list, model_list):
    pilot_rows = test_data['pilot_id'] == pilot

    X_test = test_data[pilot_rows].drop(['pilot_id'], axis=1)
    pilot_results = pilot_model.predict_proba(X_test)
    print('Pilot '+ str(pilot) + ' Done')

    all_results[pilot_rows, 1:5] = pilot_results

In [None]:
# Assemble the submission table; the ids were stored as floats in all_results,
# so they are converted back to integers.
submission = pd.DataFrame(all_results, columns=['id', 'A', 'B', 'C', 'D']).astype({'id': int})

submission.sample(10)

In [None]:
# Write the submission file without the frame index.
submission.to_csv(path_or_buf="submission.csv", index=False)