# Notebook for testing out Logestic Regression

In [78]:
#Import data cleaning library
import pandas as pd
import numpy as np

#Import Machine Learning Libraries
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

#Import the plotting libraries
import seaborn as sns
import matplotlib.pyplot as plt
import math

In [79]:
#Find the correlation between features in the encoded dataset
"""#Read the Trips Data Set w/o missing values
df = pd.read_csv("catEncodedDataset.csv")
pd.options.display.max_columns = None
display(df.head())
#Create a correlation matrix between the features
correlation_df = df.corr()
correlation_df
#Create a correlation heatmap for better visualization
plt.figure(figsize=(20, 20))
sns.heatmap(correlation_df)
#Create a correlation heatmap for just the household income
plt.figure(figsize=(8, 12))
heatmap = sns.heatmap(correlation_df[['hh_income']].sort_values(by='hh_income', ascending=False), vmin=-1, vmax=1, annot=True, cmap='BrBG')
heatmap.set_title('Features Correlating with Household Income', fontdict={'fontsize':18}, pad=16);"""

'#Read the Trips Data Set w/o missing values\ndf = pd.read_csv("catEncodedDataset.csv")\npd.options.display.max_columns = None\ndisplay(df.head())\n#Create a correlation matrix between the features\ncorrelation_df = df.corr()\ncorrelation_df\n#Create a correlation heatmap for better visualization\nplt.figure(figsize=(20, 20))\nsns.heatmap(correlation_df)\n#Create a correlation heatmap for just the household income\nplt.figure(figsize=(8, 12))\nheatmap = sns.heatmap(correlation_df[[\'hh_income\']].sort_values(by=\'hh_income\', ascending=False), vmin=-1, vmax=1, annot=True, cmap=\'BrBG\')\nheatmap.set_title(\'Features Correlating with Household Income\', fontdict={\'fontsize\':18}, pad=16);'

**Encoding Format for the nominal variables**
- trip_purp:['HBW', 'HBD', 'Non-HB']
- start_time:['Peak', 'Off_peak']
- sex:['F', 'M']
- driver_lic:['Y', 'N']
- tran_pass:['N', 'Other_agency', 'Y']
- emp_stat:['Employed', 'Not_employed', 'Work_at_Home']
- occupation:['Retail&Service', 'Not_employed', 'General_Office', 'Manufacturing']
- free_park:['Y', 'Not Applicable', 'N']
- stu_stat:['Not_student', 'Student']
- hh_dwell_type:['House', 'Townhouse', 'Apartment']
- mode_prime:['Drive', 'Other', 'Passenger', 'Transit', 'Walk', 'Bicycle']

In [80]:
#Read the Trips Data Set and explore the data
df_missing = pd.read_csv("TripsWithMissing.csv")
pd.options.display.max_columns = None
display(df_missing.tail())

Unnamed: 0.1,Unnamed: 0,hhld_num,pers_num,trip_num,trip_purp,start_time,mode_prime,trip_man_km,waterzn_orig,expf,age,sex,driver_lic,tran_pass,emp_stat,occupation,free_park,stu_stat,n_pers_trip,hh_dwell_type,hh_income,hh_size,hh_n_vehs
45203,43681,5508317,1,1,HBW,Peak,Drive,"(5.0, 6.0]",120674,16,"(65.0, 72.0]",F,Y,N,Employed,General_Office,Y,Not_student,"(0.999, 2.0]",House,60k-99k,"(1.0, 2.0]","(1.0, 2.0]"
45204,6721,5508822,2,1,HBD,Peak,Passenger,"(11.0, 16.0]",121537,10,"(41.0, 47.0]",F,N,N,Not_employed,Not_employed,Not Applicable,Not_student,"(0.999, 2.0]",House,15k-39k,"(1.0, 2.0]","(0.0, 1.0]"
45205,6722,5508822,2,2,HBD,Peak,Passenger,"(11.0, 16.0]",120322,10,"(41.0, 47.0]",F,N,N,Not_employed,Not_employed,Not Applicable,Not_student,"(0.999, 2.0]",House,15k-39k,"(1.0, 2.0]","(0.0, 1.0]"
45206,2874,5508943,1,1,HBW,Peak,Passenger,"(11.0, 16.0]",120677,8,"(53.0, 58.0]",F,N,N,Employed,Manufacturing,Y,Not_student,"(0.999, 2.0]",House,15k-39k,"(2.0, 3.0]","(-0.01, 0.0]"
45207,38033,5508943,1,2,HBW,Peak,Passenger,"(11.0, 16.0]",121631,8,"(53.0, 58.0]",F,N,N,Employed,Manufacturing,Y,Not_student,"(0.999, 2.0]",House,15k-39k,"(2.0, 3.0]","(-0.01, 0.0]"


In [81]:
%%time
#Add is Missing Column
income_missing = pd.DataFrame(columns=['NotMissing'])
for i in df_missing['hh_income']:
    if i is np.nan:
        #income_missing = pd.concat([pd.DataFrame([0], columns=['Missing'])],
          #ignore_index=True)
        income_missing = income_missing.append({'NotMissing': 0}, ignore_index=True)
    else:
        income_missing = income_missing.append({'NotMissing':1}, ignore_index=True)
    #print(math.isnan(i))
    #income_missing


CPU times: user 1min 53s, sys: 69.8 ms, total: 1min 53s
Wall time: 1min 53s


In [83]:
# Import the OneHotEncoder Class to encode the nominal encoder using the dummy variable method
from sklearn.preprocessing import OneHotEncoder

#Seperate the nominal features needeed to be encoded
df_to_encode = df_missing[['age', 'n_pers_trip', 'trip_man_km', 'hh_size', 'hh_n_vehs','trip_purp', 'start_time', 'sex', 'driver_lic','tran_pass', 'emp_stat', 'occupation', 'free_park', 'stu_stat', 'hh_dwell_type', 'mode_prime']]

In [84]:
#drop variables which do not need to be "none filled" aka do not have any missing values
none_filling = df_to_encode.drop(['trip_man_km', 'trip_purp', 'start_time', 'mode_prime'], axis=1)

In [85]:
%%time
df_filled = none_filling
for i in range(none_filling.shape[1]):
    print(i)
    for j in range(none_filling.shape[0]):
        if df_filled.iat[j,i] is np.nan:
            print("None_{}".format(df_filled.columns[i]))
            df_filled.iat[j,i] = "None_{}".format(df_filled.columns[i])
            #df_filled = df_filled.append({i:j}, ignore_index = True)  
            
print(df_filled.head())

0
None_age
None_age
None_age
None_age
None_age
None_age
None_age
None_age
None_age
None_age
None_age
None_age
None_age
None_age
None_age
None_age
None_age
None_age
None_age
None_age
None_age
None_age
None_age
None_age
None_age
None_age
None_age
None_age
None_age
None_age
None_age
None_age
None_age
None_age
None_age
None_age
None_age
None_age
None_age
None_age
None_age
None_age
None_age
None_age
None_age
None_age
None_age
None_age
None_age
None_age
None_age
None_age
None_age
None_age
None_age
None_age
None_age
None_age
None_age
None_age
None_age
None_age
None_age
None_age
None_age
None_age
None_age
None_age
None_age
None_age
None_age
None_age
None_age
None_age
None_age
None_age
None_age
None_age
None_age
None_age
None_age
None_age
None_age
None_age
None_age
None_age
None_age
None_age
None_age
None_age
None_age
None_age
None_age
None_age
None_age
None_age
None_age
None_age
None_age
None_age
None_age
None_age
None_age
None_age
None_age
None_age
None_age
None_age
None_age
None_age
None_age

None_n_pers_trip
None_n_pers_trip
None_n_pers_trip
None_n_pers_trip
None_n_pers_trip
None_n_pers_trip
None_n_pers_trip
None_n_pers_trip
None_n_pers_trip
None_n_pers_trip
None_n_pers_trip
None_n_pers_trip
None_n_pers_trip
None_n_pers_trip
None_n_pers_trip
None_n_pers_trip
None_n_pers_trip
None_n_pers_trip
None_n_pers_trip
None_n_pers_trip
None_n_pers_trip
None_n_pers_trip
None_n_pers_trip
None_n_pers_trip
None_n_pers_trip
None_n_pers_trip
None_n_pers_trip
None_n_pers_trip
None_n_pers_trip
None_n_pers_trip
None_n_pers_trip
None_n_pers_trip
None_n_pers_trip
None_n_pers_trip
None_n_pers_trip
None_n_pers_trip
None_n_pers_trip
None_n_pers_trip
None_n_pers_trip
None_n_pers_trip
None_n_pers_trip
None_n_pers_trip
None_n_pers_trip
None_n_pers_trip
None_n_pers_trip
None_n_pers_trip
None_n_pers_trip
None_n_pers_trip
None_n_pers_trip
None_n_pers_trip
None_n_pers_trip
None_n_pers_trip
None_n_pers_trip
None_n_pers_trip
None_n_pers_trip
None_n_pers_trip
None_n_pers_trip
None_n_pers_trip
None_n_pers_tr

None_hh_n_vehs
None_hh_n_vehs
None_hh_n_vehs
None_hh_n_vehs
None_hh_n_vehs
None_hh_n_vehs
None_hh_n_vehs
None_hh_n_vehs
None_hh_n_vehs
None_hh_n_vehs
None_hh_n_vehs
None_hh_n_vehs
None_hh_n_vehs
None_hh_n_vehs
None_hh_n_vehs
None_hh_n_vehs
None_hh_n_vehs
None_hh_n_vehs
None_hh_n_vehs
None_hh_n_vehs
None_hh_n_vehs
None_hh_n_vehs
None_hh_n_vehs
None_hh_n_vehs
None_hh_n_vehs
None_hh_n_vehs
None_hh_n_vehs
None_hh_n_vehs
None_hh_n_vehs
None_hh_n_vehs
None_hh_n_vehs
None_hh_n_vehs
None_hh_n_vehs
None_hh_n_vehs
None_hh_n_vehs
None_hh_n_vehs
None_hh_n_vehs
None_hh_n_vehs
None_hh_n_vehs
None_hh_n_vehs
None_hh_n_vehs
None_hh_n_vehs
None_hh_n_vehs
None_hh_n_vehs
None_hh_n_vehs
None_hh_n_vehs
None_hh_n_vehs
None_hh_n_vehs
None_hh_n_vehs
None_hh_n_vehs
None_hh_n_vehs
None_hh_n_vehs
None_hh_n_vehs
None_hh_n_vehs
None_hh_n_vehs
None_hh_n_vehs
None_hh_n_vehs
None_hh_n_vehs
None_hh_n_vehs
None_hh_n_vehs
None_hh_n_vehs
None_hh_n_vehs
None_hh_n_vehs
None_hh_n_vehs
None_hh_n_vehs
None_hh_n_vehs
None_hh_n_

None_sex
None_sex
None_sex
None_sex
None_sex
None_sex
None_sex
None_sex
None_sex
None_sex
None_sex
None_sex
None_sex
None_sex
None_sex
None_sex
None_sex
None_sex
None_sex
None_sex
None_sex
None_sex
None_sex
None_sex
None_sex
None_sex
None_sex
None_sex
None_sex
None_sex
None_sex
None_sex
None_sex
None_sex
None_sex
None_sex
None_sex
None_sex
None_sex
None_sex
None_sex
None_sex
None_sex
None_sex
None_sex
None_sex
None_sex
None_sex
None_sex
None_sex
None_sex
None_sex
None_sex
None_sex
None_sex
None_sex
None_sex
None_sex
None_sex
None_sex
None_sex
None_sex
None_sex
None_sex
None_sex
None_sex
None_sex
None_sex
None_sex
None_sex
None_sex
None_sex
None_sex
None_sex
None_sex
None_sex
None_sex
None_sex
None_sex
None_sex
None_sex
None_sex
None_sex
None_sex
None_sex
None_sex
None_sex
None_sex
None_sex
None_sex
None_sex
None_sex
None_sex
None_sex
None_sex
None_sex
None_sex
None_sex
None_sex
None_sex
None_sex
None_sex
None_sex
None_sex
None_sex
None_sex
None_sex
None_sex
None_sex
None_sex
None_sex
N

None_driver_lic
None_driver_lic
None_driver_lic
None_driver_lic
None_driver_lic
None_driver_lic
None_driver_lic
None_driver_lic
None_driver_lic
None_driver_lic
None_driver_lic
None_driver_lic
None_driver_lic
None_driver_lic
None_driver_lic
None_driver_lic
None_driver_lic
None_driver_lic
None_driver_lic
None_driver_lic
None_driver_lic
None_driver_lic
None_driver_lic
None_driver_lic
None_driver_lic
None_driver_lic
None_driver_lic
None_driver_lic
None_driver_lic
None_driver_lic
None_driver_lic
None_driver_lic
None_driver_lic
None_driver_lic
None_driver_lic
None_driver_lic
None_driver_lic
None_driver_lic
None_driver_lic
None_driver_lic
None_driver_lic
None_driver_lic
None_driver_lic
None_driver_lic
None_driver_lic
None_driver_lic
None_driver_lic
None_driver_lic
None_driver_lic
None_driver_lic
None_driver_lic
None_driver_lic
None_driver_lic
None_driver_lic
None_driver_lic
None_driver_lic
None_driver_lic
None_driver_lic
None_driver_lic
None_driver_lic
None_driver_lic
None_driver_lic
None_dri

None_tran_pass
None_tran_pass
None_tran_pass
None_tran_pass
None_tran_pass
None_tran_pass
None_tran_pass
None_tran_pass
None_tran_pass
None_tran_pass
None_tran_pass
None_tran_pass
None_tran_pass
None_tran_pass
None_tran_pass
None_tran_pass
None_tran_pass
None_tran_pass
None_tran_pass
None_tran_pass
None_tran_pass
None_tran_pass
None_tran_pass
None_tran_pass
None_tran_pass
None_tran_pass
None_tran_pass
None_tran_pass
None_tran_pass
None_tran_pass
None_tran_pass
None_tran_pass
None_tran_pass
None_tran_pass
None_tran_pass
None_tran_pass
None_tran_pass
None_tran_pass
None_tran_pass
None_tran_pass
None_tran_pass
None_tran_pass
None_tran_pass
None_tran_pass
None_tran_pass
None_tran_pass
None_tran_pass
None_tran_pass
None_tran_pass
None_tran_pass
None_tran_pass
None_tran_pass
None_tran_pass
None_tran_pass
None_tran_pass
None_tran_pass
None_tran_pass
None_tran_pass
None_tran_pass
None_tran_pass
None_tran_pass
None_tran_pass
None_tran_pass
None_tran_pass
None_tran_pass
None_tran_pass
None_tran_

None_emp_stat
None_emp_stat
None_emp_stat
None_emp_stat
None_emp_stat
None_emp_stat
None_emp_stat
None_emp_stat
None_emp_stat
None_emp_stat
None_emp_stat
None_emp_stat
None_emp_stat
None_emp_stat
None_emp_stat
None_emp_stat
None_emp_stat
None_emp_stat
None_emp_stat
None_emp_stat
None_emp_stat
None_emp_stat
None_emp_stat
None_emp_stat
None_emp_stat
None_emp_stat
None_emp_stat
None_emp_stat
None_emp_stat
None_emp_stat
None_emp_stat
None_emp_stat
None_emp_stat
None_emp_stat
None_emp_stat
None_emp_stat
None_emp_stat
None_emp_stat
None_emp_stat
None_emp_stat
None_emp_stat
None_emp_stat
None_emp_stat
None_emp_stat
None_emp_stat
None_emp_stat
None_emp_stat
None_emp_stat
None_emp_stat
None_emp_stat
None_emp_stat
None_emp_stat
None_emp_stat
None_emp_stat
None_emp_stat
None_emp_stat
None_emp_stat
None_emp_stat
None_emp_stat
None_emp_stat
None_emp_stat
None_emp_stat
None_emp_stat
None_emp_stat
None_emp_stat
None_emp_stat
None_emp_stat
None_emp_stat
None_emp_stat
None_emp_stat
None_emp_stat
None_e

None_occupation
None_occupation
None_occupation
None_occupation
None_occupation
None_occupation
None_occupation
None_occupation
None_occupation
None_occupation
None_occupation
None_occupation
None_occupation
None_occupation
None_occupation
None_occupation
None_occupation
None_occupation
None_occupation
None_occupation
None_occupation
None_occupation
None_occupation
None_occupation
None_occupation
None_occupation
None_occupation
None_occupation
None_occupation
None_occupation
None_occupation
None_occupation
None_occupation
None_occupation
None_occupation
None_occupation
None_occupation
None_occupation
None_occupation
None_occupation
None_occupation
None_occupation
None_occupation
None_occupation
None_occupation
None_occupation
None_occupation
None_occupation
None_occupation
None_occupation
None_occupation
None_occupation
None_occupation
None_occupation
None_occupation
None_occupation
None_occupation
None_occupation
None_occupation
None_occupation
None_occupation
None_occupation
None_occ

None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_

None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_

None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_

None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_

None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_park
None_free_

None_stu_stat
None_stu_stat
None_stu_stat
None_stu_stat
None_stu_stat
None_stu_stat
None_stu_stat
None_stu_stat
None_stu_stat
None_stu_stat
None_stu_stat
None_stu_stat
None_stu_stat
None_stu_stat
None_stu_stat
None_stu_stat
None_stu_stat
None_stu_stat
None_stu_stat
None_stu_stat
None_stu_stat
None_stu_stat
None_stu_stat
None_stu_stat
None_stu_stat
None_stu_stat
None_stu_stat
None_stu_stat
None_stu_stat
None_stu_stat
None_stu_stat
None_stu_stat
None_stu_stat
None_stu_stat
None_stu_stat
None_stu_stat
None_stu_stat
None_stu_stat
None_stu_stat
None_stu_stat
None_stu_stat
None_stu_stat
None_stu_stat
None_stu_stat
None_stu_stat
None_stu_stat
None_stu_stat
None_stu_stat
None_stu_stat
None_stu_stat
None_stu_stat
None_stu_stat
None_stu_stat
None_stu_stat
None_stu_stat
None_stu_stat
None_stu_stat
None_stu_stat
None_stu_stat
None_stu_stat
None_stu_stat
None_stu_stat
None_stu_stat
None_stu_stat
None_stu_stat
None_stu_stat
None_stu_stat
None_stu_stat
None_stu_stat
None_stu_stat
None_stu_stat
None_s

None_hh_dwell_type
None_hh_dwell_type
None_hh_dwell_type
None_hh_dwell_type
None_hh_dwell_type
None_hh_dwell_type
None_hh_dwell_type
None_hh_dwell_type
None_hh_dwell_type
None_hh_dwell_type
None_hh_dwell_type
None_hh_dwell_type
None_hh_dwell_type
None_hh_dwell_type
None_hh_dwell_type
None_hh_dwell_type
None_hh_dwell_type
None_hh_dwell_type
None_hh_dwell_type
None_hh_dwell_type
None_hh_dwell_type
None_hh_dwell_type
None_hh_dwell_type
None_hh_dwell_type
None_hh_dwell_type
None_hh_dwell_type
None_hh_dwell_type
None_hh_dwell_type
None_hh_dwell_type
None_hh_dwell_type
None_hh_dwell_type
None_hh_dwell_type
None_hh_dwell_type
None_hh_dwell_type
None_hh_dwell_type
None_hh_dwell_type
None_hh_dwell_type
None_hh_dwell_type
None_hh_dwell_type
None_hh_dwell_type
None_hh_dwell_type
None_hh_dwell_type
None_hh_dwell_type
None_hh_dwell_type
None_hh_dwell_type
None_hh_dwell_type
None_hh_dwell_type
None_hh_dwell_type
None_hh_dwell_type
None_hh_dwell_type
None_hh_dwell_type
None_hh_dwell_type
None_hh_dwel

In [87]:
#Join all the nominal, ordinal and other relevant features in a single dataframe
df_filled = df_filled.join(df_to_encode[['trip_man_km', 'trip_purp', 'start_time', 'mode_prime']])

In [88]:
df_filled.head()

Unnamed: 0,age,n_pers_trip,hh_size,hh_n_vehs,sex,driver_lic,tran_pass,emp_stat,occupation,free_park,stu_stat,hh_dwell_type,trip_man_km,trip_purp,start_time,mode_prime
0,"(41.0, 47.0]","(0.999, 2.0]","(4.0, 9.0]","(2.0, 99.0]",F,Y,N,Employed,Retail&Service,Y,Not_student,House,"(2.0, 3.0]",HBW,Peak,Drive
1,"(41.0, 47.0]","(0.999, 2.0]","(4.0, 9.0]","(2.0, 99.0]",F,Y,N,Employed,Retail&Service,Y,Not_student,House,"(2.0, 3.0]",HBW,Peak,Drive
2,"(10.999, 19.0]","(3.0, 4.0]","(4.0, 9.0]","(2.0, 99.0]",F,Y,Other_agency,Employed,Retail&Service,Y,Student,House,"(11.0, 16.0]",HBW,Off_peak,Drive
3,"(10.999, 19.0]","(3.0, 4.0]","(4.0, 9.0]","(2.0, 99.0]",F,Y,Other_agency,Employed,Retail&Service,Y,Student,House,"(11.0, 16.0]",HBW,Peak,Drive
4,"(10.999, 19.0]","(3.0, 4.0]","(4.0, 9.0]","(2.0, 99.0]",F,Y,Other_agency,Employed,Retail&Service,Y,Student,House,"(2.0, 3.0]",HBW,Peak,Drive


In [89]:
# Import the OneHotEncoder Class to encode the nominal encoder using the dummy variable method
from sklearn.preprocessing import OneHotEncoder

In [90]:
#dummy encode all the categorical variabels
nominal_encoder = OneHotEncoder(sparse=False)
# transform data
#Iterate through the nominal feature dataframe
for i in range(df_filled.shape[1]):
    #Encode the feature in the current iteration cycle 
    onehot = nominal_encoder.fit_transform(df_filled.iloc[:, i].to_numpy().reshape(-1, 1))
    #Save the current feature column
    nominal_col = df_filled.iloc[:, i]
    #If first feature create a variable which will be added on when i != 0
    if i == 0:
        onehot_df = pd.DataFrame(data = onehot, columns = [i for i in nominal_col.unique()[:]] )
    else:
        onehot_df2 = pd.DataFrame(data = onehot, columns = [i for i in nominal_col.unique()[:]] )
        #Merge the df in this iteration cycle with the one formed in the zeroeth cycle (and concacted in the subsequent cycles)
        onehot_df = pd.merge(onehot_df, onehot_df2, how='inner', left_index=True, right_index = True)
#Display the final encoded dataframe of the nominal dataframe
onehot_df

Unnamed: 0,"(41.0, 47.0]","(10.999, 19.0]","(72.0, 99.0]","(58.0, 65.0]","(65.0, 72.0]","(53.0, 58.0]","(19.0, 28.0]","(47.0, 53.0]",None_age,"(35.0, 41.0]","(28.0, 35.0]","(0.999, 2.0]","(3.0, 4.0]_x","(2.0, 3.0]_x","(6.0, 18.0]","(4.0, 6.0]",None_n_pers_trip,"(4.0, 9.0]","(1.0, 2.0]_x","(2.0, 3.0]_y","(0.99, 1.0]",None_hh_size,"(3.0, 4.0]_y","(2.0, 99.0]","(0.0, 1.0]","(1.0, 2.0]_y",None_hh_n_vehs,"(-0.01, 0.0]",F,M,None_sex,Y_x,N_x,None_driver_lic,N_y,Other_agency,None_tran_pass,Y_y,Employed,Not_employed_x,None_emp_stat,Work_at_Home,Retail&Service,Not_employed_y,General_Office,Manufacturing,None_occupation,Y,Not Applicable,N,None_free_park,Not_student,Student,None_stu_stat,House,Townhouse,Apartment,None_hh_dwell_type,"(2.0, 3.0]","(11.0, 16.0]","(5.0, 6.0]","(4.0, 5.0]","(3.0, 4.0]","(6.0, 8.0]","(-0.001, 1.0]","(1.0, 2.0]","(16.0, 56.0]","(8.0, 11.0]",HBW,HBD,Non-HB,Peak,Off_peak,Drive,Other,Passenger,Transit,Walk,Bicycle
0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45203,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
45204,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
45205,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
45206,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0


In [92]:
"""X = onehot_df
y = income_missing['NotMissing']

from imblearn.over_sampling import SMOTE
os = SMOTE(random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
y_train = y_train.astype('int')

columns = X_train.columns
os_data_X, os_data_y = os.fit_sample(X_train, y_train)
os_data_X = pd.DataFrame(data=os_data_X,columns=columns )
os_data_y= pd.DataFrame(data=os_data_y,columns=['NotMissing'])
# we can Check the numbers of our data
print("length of oversampled data is ",len(os_data_X))
print("Number of missing income in oversampled data",len(os_data_y[os_data_y['NotMissing']==0]))
print("Number of income",len(os_data_y[os_data_y['NotMissing']==1]))
print("Proportion of missing income data in oversampled data is ",len(os_data_y[os_data_y['NotMissing']==0])/len(os_data_X))
print("Proportion of income data in oversampled data is ",len(os_data_y[os_data_y['NotMissing']==1])/len(os_data_X))"""

'X = onehot_df\ny = income_missing[\'NotMissing\']\n\nfrom imblearn.over_sampling import SMOTE\nos = SMOTE(random_state=0)\nX_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)\ny_train = y_train.astype(\'int\')\n\ncolumns = X_train.columns\nos_data_X, os_data_y = os.fit_sample(X_train, y_train)\nos_data_X = pd.DataFrame(data=os_data_X,columns=columns )\nos_data_y= pd.DataFrame(data=os_data_y,columns=[\'NotMissing\'])\n# we can Check the numbers of our data\nprint("length of oversampled data is ",len(os_data_X))\nprint("Number of missing income in oversampled data",len(os_data_y[os_data_y[\'NotMissing\']==0]))\nprint("Number of income",len(os_data_y[os_data_y[\'NotMissing\']==1]))\nprint("Proportion of missing income data in oversampled data is ",len(os_data_y[os_data_y[\'NotMissing\']==0])/len(os_data_X))\nprint("Proportion of income data in oversampled data is ",len(os_data_y[os_data_y[\'NotMissing\']==1])/len(os_data_X))'

In [93]:
#assingn the independent variables(X) and dependent variables(Y) dataset for machine learning
os_data_X = onehot_df
os_data_y = income_missing
os_data_y = os_data_y.astype("int")

In [55]:
#store all the column names from the onehot_df dataframe
onehot_df_vars=onehot_df.columns.values.tolist()
#y=['NotMissing']
#X=[i for i in onehot_df_vars if i not in y]
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
#setup the logistic regression model
logreg = LogisticRegression()
rfe = RFE(logreg, 20)
#fit it to the dataset
rfe = rfe.fit(os_data_X, os_data_y.values.ravel())
print(rfe.support_)
#see the ranking of the coeffiecents
print(rfe.ranking_)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

[ True  True  True  True  True  True False  True False  True  True  True
 False  True False False  True False False False  True False False  True
  True  True  True False  True  True False False False False False  True
 False False False False False False False False False False]
[ 1  1  1  1  1  1  3  1  4  1  1  1  5  1 12  2  1  7 13  8  1  9 21  1
  1  1  1 11  1  1 22  6 20 16 19  1 26 10 25 14 27 23 24 17 15 18]


In [57]:
cols = []
for i in range(onehot_df.shape[1]):
    if rfe.ranking_[i] == 1:
        print("{}".format(onehot_df.columns[i]))
        cols.append(onehot_df.columns[i])
cols

F
M
None_sex
Y_x
N_x
None_driver_lic
Other_agency
Y_y
Employed
Not_employed_x
Work_at_Home
General_Office
Not Applicable
Not_student
Student
None_stu_stat
House
Apartment
None_hh_dwell_type
Drive


['F',
 'M',
 'None_sex',
 'Y_x',
 'N_x',
 'None_driver_lic',
 'Other_agency',
 'Y_y',
 'Employed',
 'Not_employed_x',
 'Work_at_Home',
 'General_Office',
 'Not Applicable',
 'Not_student',
 'Student',
 'None_stu_stat',
 'House',
 'Apartment',
 'None_hh_dwell_type',
 'Drive']

In [95]:
#cols = [i for i in onehot_df]
X = os_data_X[:]
y = os_data_y['NotMissing']

In [96]:
from sklearn.feature_selection import chi2
scores, pvalues = chi2(X, y)
pvalues

array([1.30423493e-01, 1.35481537e-10, 3.17332411e-40, 8.38677825e-13,
       1.98670114e-06, 7.79783612e-01, 7.99733568e-01, 5.70702565e-02,
       5.15252158e-04, 6.80072368e-14, 0.00000000e+00, 3.58696906e-04,
       1.17938010e-02, 4.06573894e-13, 1.54461213e-02, 1.12870091e-08,
       0.00000000e+00, 9.71983164e-08, 8.47673213e-03, 8.25738264e-03,
       9.98782991e-06, 2.22850601e-04, 0.00000000e+00, 9.78825722e-01,
       9.67979770e-10, 4.62810186e-14, 9.16037888e-14, 0.00000000e+00,
       3.37906742e-01, 1.06506673e-11, 0.00000000e+00, 3.70405244e-01,
       0.00000000e+00, 1.09553484e-10, 6.60556717e-09, 0.00000000e+00,
       4.98965544e-02, 7.55542089e-05, 1.32106065e-40, 0.00000000e+00,
       1.53290296e-16, 2.17604837e-01, 7.53522054e-37, 7.56548949e-01,
       0.00000000e+00, 1.30517938e-16, 2.10858720e-11, 1.12032429e-20,
       7.97210325e-82, 1.30517938e-16, 1.90821026e-30, 0.00000000e+00,
       2.11872107e-07, 1.32178210e-01, 9.91674807e-13, 8.69169088e-01,
      

In [97]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3, random_state=0)
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

LogisticRegression()

In [98]:
y_pred = logreg.predict(X_test)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logreg.score(X_test, y_test)))

Accuracy of logistic regression classifier on test set: 0.84


In [99]:
from sklearn.metrics import confusion_matrix
confusion_matrix = confusion_matrix(y_test, y_pred)
print(confusion_matrix)

[[  170  2161]
 [    6 11226]]


In [100]:
coeff = logreg.coef_[0]
print(coeff)
for i in range(len(coeff)-1):
    print("{} : {}".format(os_data_X.columns[i], coeff[i]))
    
#print(logreg.coef_, logreg.intercept_)

[ 0.01338527  0.31872917  0.54638177  0.20268869  0.11532085 -0.09466694
 -0.05569512 -0.13792641 -0.16575397 -0.24942434 -0.54306301 -0.07666695
  0.11092133  0.19362499  0.03685283  0.22830676 -0.54306301  0.22460552
  0.15534592  0.10737609  0.08134687 -0.07563543 -0.54306301  0.0865348
  0.24207071  0.24740229 -0.08296883 -0.54306301  0.17833611  0.31470287
 -0.54306301  0.4696227  -0.88884706  0.36920032  0.36684511 -0.36834326
  0.20819739 -0.25672328  0.22767745 -0.65158065  0.09922008  0.27465908
  0.15136834 -0.01024143 -0.39685    -0.00929756  0.2149966   0.19588773
 -0.25438182 -0.00929756  0.01776762 -0.54306301  0.15020894  0.34283004
  0.22077952 -0.02740375 -0.54306301  0.29966319 -0.00325017 -0.03244052
  0.03766692 -0.04189898  0.06258203 -0.02953911  0.03407289  0.01481397
 -0.05089557 -0.04113548 -0.00628413 -0.02861029 -0.01512961 -0.03319507
 -0.01682896  0.4692104  -0.01670691 -0.25300164 -0.03129406 -0.16111468
 -0.05711714]
(41.0, 47.0] : 0.01338527205208861
(10

# Continuous Results:

F : 0.2789963176521173
M : 0.44555091444511286
None_sex : -0.5025844113602141
Y_x : 0.3504896321170755
N_x : -0.6080440664814205
None_driver_lic : 0.479517255101379
N_y : 0.5454766528807342
Other_agency : -0.5353101666572936
None_tran_pass : 0.2983037219150178
Y_y : -0.08650738740142969
Employed : 0.3214096906472166
Not_employed_x : -0.5104551260336191
None_emp_stat : 0.10557773299971222
Work_at_Home : 0.3054305231236449
Retail&Service : 0.24666572849822002
Not_employed_y : 0.036628510902472315
General_Office : -0.496364034096602
Manufacturing : 0.09770701832633422
None_occupation : 0.3373255971065975
Y : 0.3231303744156547
Not Applicable : -0.23428840788879887
N : 0.09770701832633422
None_free_park : 0.0354138358838324
Not_student : -0.5025844113602141
Student : 0.41499455009804775
None_stu_stat : 0.30955268199917424
House : 0.2779305039818798
Townhouse : 0.05251256000111766
Apartment : -0.5025844113602141
None_hh_dwell_type : 0.3941041681142273
HBW : 0.10426692032068303
HBD : -0.040024594822206244
Non-HB : 0.1577204952385441
Peak : 0.08809661068680705
Off_peak : 0.1338662100502224
Drive : 0.17670831301523843
Other : 0.06088650526960322
Passenger : -0.11559754413535446
Transit : 0.09515277142467209
Walk : -0.006740862854613231
Bicycle : 0.011553638017458503
age : -0.01589281895222505
trip_man_km : -0.0017979047926717532
hh_n_vehs : -0.06068045829958429
hh_size : -0.10165372758683203

In [101]:
from sklearn.metrics import recall_score
recall_score(y_test, y_pred)

0.999465811965812

In [102]:
from sklearn.metrics import f1_score

f1_score(y_test, y_pred)

0.9119785531500061

In [103]:
y_test.value_counts()

1    11232
0     2331
Name: NotMissing, dtype: int64