# Mini-Lab: Logistic Regression and SVMs

Names:
Dylan Scott
Jobin Joseph
Nnenna Okpara
Satvik Ajmera

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style='darkgrid')

import plotly.express as px
import plotly.graph_objects as go

### Dataset add-on
From the first project we submitted we have since added on more data that we found on the NTSB website. We were able to merge in new columns using join as well as apend on more recent data. This will give us more vairables but we will have to clean up some of those added rows. This next section will be the clean up.

In [2]:
#Read in the Aviation Data
final_data = pd.read_csv("Data/final_data.csv",low_memory=False,dtype={'damage': str})
final_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 115706 entries, 0 to 115705
Data columns (total 35 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   Unnamed: 0         115706 non-null  int64  
 1   index              115706 non-null  int64  
 2   ev_id              115706 non-null  object 
 3   ntsb_no_x          115706 non-null  object 
 4   acft_make          115643 non-null  object 
 5   acft_model         115630 non-null  object 
 6   cert_max_gr_wt     98673 non-null   float64
 7   acft_category      115287 non-null  object 
 8   damage             113877 non-null  object 
 9   far_part           114925 non-null  object 
 10  afm_hrs_last_insp  60298 non-null   float64
 11  type_fly           108599 non-null  object 
 12  dprt_city          111864 non-null  object 
 13  dprt_state         108791 non-null  object 
 14  dprt_state.1       108791 non-null  object 
 15  rwy_len            64222 non-null   float64
 16  rw

# Checking Data Cleaning

In [3]:
#It looks like we have some missing values and have an inconsistant UNK vs UNK on flight damage
finaldamagecount = final_data["damage"].value_counts().reset_index()
finaldamagecount.head(50)


Unnamed: 0,index,damage
0,SUBS,87994
1,DEST,20892
2,MINR,3302
3,NONE,1600
4,UNK,45
5,UNK,44


In [4]:
final_data.loc[final_data['damage'].str.contains('UNK', na=False), 'damage'] = 'Unknown'
finaldamagecount = final_data["damage"].value_counts().reset_index()
finaldamagecount.head(50)

Unnamed: 0,index,damage
0,SUBS,87994
1,DEST,20892
2,MINR,3302
3,NONE,1600
4,Unknown,89


In [5]:
#rename the injuries columns to make them easier to read
final_data = final_data.rename(columns={"inj_tot_f": "Total.Fatal.Injuries", "inj_tot_s": "Total.Serious.Injuries","inj_tot_m":"Total.Minor.Injuries","inj_tot_n":'Total.Uninjured',"inj_tot_t":"Total.Injuries"})

#fill in 0s when there wasn't an injury in that category
final_data.update(final_data[['Total.Fatal.Injuries','Total.Serious.Injuries','Total.Minor.Injuries','Total.Uninjured','Total.Injuries']].fillna(0))
final_data.head()

Unnamed: 0.1,Unnamed: 0,index,ev_id,ntsb_no_x,acft_make,acft_model,cert_max_gr_wt,acft_category,damage,far_part,...,Total.Fatal.Injuries,Total.Minor.Injuries,Total.Uninjured,Total.Serious.Injuries,Total.Injuries,sky_cond_ceil,sky_cond_nonceil,wind_vel_ind,wx_int_precip,phase_flt_spec
0,0,0,20001204X00000,ANC99FA021,Cessna,207,3800.0,AIR,SUBS,135,...,0.0,1.0,0.0,0.0,1.0,BKN,UNK,UNK,UNK,Approach
1,1,1,20001204X00001,ANC99IA025,Boeing,747-100,750000.0,AIR,MINR,121,...,0.0,0.0,4.0,0.0,0.0,NONE,SCAT,CALM,UNK,Landing
2,2,2,20001204X00002,ANC99LA020,Piper,PA-31-350,7369.0,AIR,SUBS,135,...,0.0,0.0,6.0,0.0,0.0,OVC,SCAT,UNK,UNK,Unknown
3,3,3,20001204X00003,ANC99LA022,Cessna,172,2300.0,AIR,SUBS,91,...,0.0,0.0,1.0,0.0,0.0,BKN,UNK,UNK,LGT,Unknown
4,4,4,20001204X00004,ANC99LA023,Cessna,207,3800.0,AIR,SUBS,135,...,0.0,0.0,1.0,0.0,0.0,BKN,UNK,UNK,UNK,Descent


In [6]:
#set missing variables to Unknown in order to run our models
final_data.update(final_data.fillna("Unknown"))
final_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 115706 entries, 0 to 115705
Data columns (total 35 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   Unnamed: 0              115706 non-null  int64  
 1   index                   115706 non-null  int64  
 2   ev_id                   115706 non-null  object 
 3   ntsb_no_x               115706 non-null  object 
 4   acft_make               115706 non-null  object 
 5   acft_model              115706 non-null  object 
 6   cert_max_gr_wt          115706 non-null  object 
 7   acft_category           115706 non-null  object 
 8   damage                  115706 non-null  object 
 9   far_part                115706 non-null  object 
 10  afm_hrs_last_insp       115706 non-null  object 
 11  type_fly                115706 non-null  object 
 12  dprt_city               115706 non-null  object 
 13  dprt_state              115706 non-null  object 
 14  dprt_state.1        

We will be using code from this classes Github: 
https://github.com/jakemdrew/DataMiningNotebooks/blob/master/04.%20Logits%20and%20SVM.ipynb

In [7]:
#drop not needed columns: index, ev_id ntsb_no_x
del final_data["index"]
del final_data["ev_id"]
del final_data["ntsb_no_x"]
#final_data.info()

In [10]:
#injuries = final_data["Total.Injuries"].value_counts().reset_index()
#injuries.head(50)

In [11]:
#create a new column of injuried or not to get a binary response
#1 means someone was hurt 0 means someone was not
final_data['Injury'] = np.where(final_data['Total.Injuries'] >0,1,0)
injuries = final_data["Injury"].value_counts().reset_index()
injuries.head(50)

Unnamed: 0,index,Injury
0,1,79858
1,0,35848


# Train Test Split

In [12]:
from sklearn.model_selection import ShuffleSplit

# we want to predict the X and y data as follows:
if 'Injury' in final_data:
    y = final_data['Injury'].values # get the labels we want
    del final_data['Injury'] # get rid of the class label
    X = final_data.values # use everything else to predict!

    ## X and y are now numpy matrices, by calling 'values' on the pandas data frames we
    #    have converted them into simple matrices to use with scikit learn
    
    
# to use the cross validation object in scikit learn, we need to grab an instance
#    of the object and set it up. This object will be able to split our data into 
#    training and testing splits
num_cv_iterations = 3
num_instances = len(y)
cv_object = ShuffleSplit(n_splits=num_cv_iterations,
                         test_size  = 0.2)
                         
print(cv_object)

ShuffleSplit(n_splits=3, random_state=None, test_size=0.2, train_size=None)
