In [16]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline

In [23]:
aviation_data = '/Users/robertreynoso/Desktop/phase_03_project/data//encoded_aviation_data_3.csv'

df = pd.read_csv(aviation_data, encoding='latin-1')

In [24]:
df.columns

Index(['Unnamed: 0', 'target', 'location', 'aircraft_damage', 'make', 'model',
       'number_of_engines', 'engine_type', 'weather_conditions',
       'phase_of_flight', 'Year', 'Month', 'Day', 'injuries', 'pax_onboard',
       'fatality_percentage', 'survived', 'amateur_built'],
      dtype='object')

In [25]:
df = df.drop('Unnamed: 0', axis =1)


In [26]:
df.head()

Unnamed: 0,target,location,aircraft_damage,make,model,number_of_engines,engine_type,weather_conditions,phase_of_flight,Year,Month,Day,injuries,pax_onboard,fatality_percentage,survived,amateur_built
0,2,1725,0,2963,976,1,0,2,1,1982,6,13,1.0,1.0,0.0,1,0
1,2,7692,0,2963,3852,1,0,2,2,1982,7,1,2.0,2.0,0.0,2,1
2,0,4221,0,2963,6619,1,0,2,0,1982,7,16,1.0,1.0,100.0,0,1
3,2,12735,0,2963,4236,1,0,2,6,1982,8,21,0.0,1.0,0.0,1,1
4,2,11332,2,2963,5820,1,0,2,5,1982,8,24,0.0,1.0,0.0,1,1


## Spilt the data

In [27]:
# Split the predictor and target variables
y = df['target']
X = df.drop('target', axis=1)

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

## Standardize data

In [28]:
# Instantiate StandardScaler
scaler = StandardScaler()

# Transform the training and test sets
scaled_data_train = scaler.fit_transform(X_train)

scaled_data_test = scaler.transform(X_test)

# Convert into a DataFrame
scaled_df_train = pd.DataFrame(scaled_data_train, columns=X_train.columns)

scaled_df_train.head()

Unnamed: 0,location,aircraft_damage,make,model,number_of_engines,engine_type,weather_conditions,phase_of_flight,Year,Month,Day,injuries,pax_onboard,fatality_percentage,survived,amateur_built
0,0.627456,-1.63135,-1.001673,-0.835201,-0.354007,-0.347402,0.310273,-0.960724,-0.363939,-1.174099,1.614709,0.280682,-0.127755,2.258851,-0.211092,-0.298042
1,-0.96372,0.636283,-0.702078,-1.099189,-0.354007,-0.347402,0.310273,-0.960724,0.693717,0.465575,-1.323534,0.280682,-0.127755,-0.467773,-0.117392,-0.298042
2,1.029071,-1.63135,1.230538,1.09542,-0.354007,-0.347402,-1.51914,-0.960724,0.693717,-0.846164,0.032578,0.548594,-0.086872,2.258851,-0.211092,-0.298042
3,-1.564514,0.636283,0.245505,1.847671,-0.354007,3.020562,0.310273,0.33707,0.341165,0.465575,-0.080431,0.012771,-0.168638,-0.467773,-0.164242,-0.298042
4,-1.623642,0.636283,0.07982,0.395033,-0.354007,-0.347402,0.310273,0.33707,0.164889,0.465575,0.032578,-0.255141,-0.168638,-0.467773,-0.164242,-0.298042


In [29]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

import time
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score


In [30]:
def evaluate(model, name):
    
    output = {'model': name}
    start1 = time.time()
    model.fit(x_train, y_train)
    traintime = time.time() - start1
    
    # training metrics
    
    trainpred = model.predict(x_train)
    output['train_precision'] = precision_score(y_train, trainpred)
    output['train_recall'] = recall_score(y_train, trainpred)
    output['train_accuracy'] = accuracy_score(y_train, trainpred)
    output['train_f1'] = f1_score(y_train, trainpred)
    output['train_time'] = traintime
    
    # testing metrics
    
    start2 = time.time()
    pred = model.predict(x_test)
    testtime = time.time() - start2
    
    output['test_precision'] = precision_score(y_test, pred)
    output['test_recall'] = recall_score(y_test, pred)
    output['test_accuracy'] = accuracy_score(y_test, pred)
    output['test_f1'] = f1_score(y_test, pred)
    output['test_time'] = testtime
    
    # confusion matrix for test set
    
    conf = pd.crosstab(y_test, pred)
    
    return output, conf

## a. K Nearest Neighbors

In [31]:
# find optimal k 
def find_best_k(X_train, y_train, X_test, y_test, min_k=1, max_k=25):
    best_k = 0
    best_score = 0.0
    for k in range(min_k, max_k+1, 2):
        knn = KNeighborsClassifier(n_neighbors=k)
        knn.fit(X_train, y_train)
        preds = knn.predict(X_test)
        f1 = f1_score(y_test, preds)
        if f1 > best_score:
            best_k = k
            best_score = f1
    
    print("Best Value for k: {}".format(best_k))
    print("F1-Score: {}".format(best_score))

In [32]:
find_best_k(X_train, y_train, X_test, y_test)


ValueError: Target is multiclass but average='binary'. Please choose another average setting, one of [None, 'micro', 'macro', 'weighted'].

In [33]:
print(y_train.value_counts())
print(y_test.value_counts())

2    28440
0     6909
1     1154
Name: target, dtype: int64
2    9554
0    2241
1     373
Name: target, dtype: int64


In [34]:
from pandasgui import show
gui  = show(df)