In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, classification_report, recall_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from sklearn.feature_selection import SelectKBest, chi2, f_regression, f_classif
from sklearn.preprocessing import RobustScaler, OneHotEncoder
import csv
import warnings
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from scipy.cluster import hierarchy as hc
warnings.simplefilter(action='ignore', category=FutureWarning)
pd.options.display.max_rows = 9999


amount_tsh : Total static head (amount water available to waterpoint)
date_recorded : The date the row was entered
funder : Who funded the well
gps_height : Altitude of the well
installer : Organization that installed the well
longitude : GPS coordinate
latitude : GPS coordinate
wpt_name : Name of the waterpoint if there is one
num_private :
basin : Geographic water basin
subvillage : Geographic location
region : Geographic location
region_code : Geographic location (coded)
district_code : Geographic location (coded)
lga : Geographic location
ward : Geographic location
population : Population around the well
public_meeting : True/False
recorded_by : Group entering this row of data
scheme_management : Who operates the waterpoint
scheme_name : Who operates the waterpoint
permit : If the waterpoint is permitted
construction_year : Year the waterpoint was constructed
extraction_type : The kind of extraction the waterpoint uses
extraction_type_group : The kind of extraction the waterpoint uses
extraction_type_class : The kind of extraction the waterpoint uses
management : How the waterpoint is managed
management_group : How the waterpoint is managed
payment : What the water costs
payment_type : What the water costs
water_quality : The quality of the water
quality_group : The quality of the water
quantity : The quantity of water
quantity_group : The quantity of water
source : The source of the water
source_type : The source of the water
source_class : The source of the water
waterpoint_type : The kind of waterpoint
waterpoint_type_group : The kind of waterpoint

In [2]:
df_X = pd.read_csv('tanz_train_features.csv')
df_y = pd.read_csv('tanz_train_labels.csv')
Test_X = pd.read_csv('tanz_test_features.csv')
submit = pd.read_csv('tanz_sample_submission.csv')
print(df_X.shape, df_y.shape)
print(df_X.dtypes)
print(df_y.dtypes)

(59400, 40) (59400, 2)
id                         int64
amount_tsh               float64
date_recorded             object
funder                    object
gps_height                 int64
installer                 object
longitude                float64
latitude                 float64
wpt_name                  object
num_private                int64
basin                     object
subvillage                object
region                    object
region_code                int64
district_code              int64
lga                       object
ward                      object
population                 int64
public_meeting            object
recorded_by               object
scheme_management         object
scheme_name               object
permit                    object
construction_year          int64
extraction_type           object
extraction_type_group     object
extraction_type_class     object
management                object
management_group          object
payment             

In [3]:
pd.value_counts(df_y['status_group'])

functional                 32259
non functional             22824
functional needs repair     4317
Name: status_group, dtype: int64

In [4]:
# Not going to use... lowered score by .025

cleaning_dict =         {
                           'ADP Busangi':'ADP',
                           'ADRA':'ADRA /Government',
                           'AIC':'AICT',
                           'AMP Contracts':'AMP Contract', 
                           'AMP contractor':'AMP Contract',
                           'Active KMK':'Active MKM',
                           'Adra /Community':'Adra',
                           'Adra/ Community':'Adra',
                          'Adra/Community':'Adra',
                           'Angli':'Anglikana',
                           'Atisan': 'Artisan',
                           'COMMU':'COMMUNITY', 
                           'Cebtral Government':'CENTRAL GOVERNMENT',
                          'Centr':'CENTRAL GOVERNMENT',
                          'Central Government':'CENTRAL GOVERNMENT',
                          'Central government':'CENTRAL GOVERNMENT',
                          'Central govt':'CENTRAL GOVERNMENT',
                          'Commu':'Community',
                          'Communit':'Community',
                          'Conce': 'Concern',
                          'Cons':'Consultant',
                          'Consultant Engineer':'Consultant',
                          'Consultant and DWE':'Consultant',
                          'Consulting Engineer':'Consultant',
                          'Consulting engineer':'Consultant',
                          'Consuting Engineer':'Consultant',
                          'Cosmo':'Cosmos Engineering',
                           'Counc':'Council',
                           'DANID':'DANIDA',
                           'DISTRICT COUNCIL':'District Council',
                           'DMDD/SOLIDER':'DMDD',
                           'DWE/':'DWE',
                           'DWE}':'DWE',
                           'Distri':'District  Council',
                           'District COUNCIL':'District  Council',
                           'District Counci':'District  Council',
                           'District Council':'District  Council',
                           'District council':'District  Council',
                           'District water depar':'District Water Department',
                           'District water department':'District Water Department',
                           'Dr. Matomola':'Dr. Matobola',
                           'Dr.Matobola':'Dr. Matobola',
                           'Dr.Matomola':'Dr. Matobola',
                           'FINI WATER':'FIN WATER',
                           'FINLAND':'FIN WATER',
                           'FINN WATER':'FIN WATER',
                           'FiNI WATER':'FIN WATER',
                           'FinW':'FIN WATER',
                           'FinWater':'FIN WATER',
                           'Fini Water':'FIN WATER',
                           'Fini water':'FIN WATER',
                           'Finland Government':'FIN WATER',
                           'Finwater':'FIN WATER',
                           'GOVER':'GOVERNMENT',
                           'GOVERN':'GOVERNMENT',
                           'Gove':'GOVERNMENT',
                           'Gover':'GOVERNMENT',
                           'central government':'GOVERNMENT',
                           'Government':'GOVERNMENT',
                           'Tanz':'Tanzania Government',    
                           'Tanza':'Tanzania Government',    
                           'Tanzania':'Tanzania Government',    
                           'Goldstar':'Gold star',
                           'Gwasco L':'Gwasco',
                           'Gwaseco':'Gwasco',
                           'Halmashauri':'Halmashauli',
                           'Halmashauri ya manispa tabora':'Halmashauli',
                           'Halmashauri ya wilaya':'Halmashauli',
                           'Halmashauri ya wilaya sikonge':'Halmashauli',
                           'Handeni Trunk Main(':'Handeni Trunk Main',
                           'Hesewa':'Hesawa',    
                           'ISF/Government':'ISF',    
                           'ISF/TACARE':'ISF',    
                           'Individuals':'Individual',    
                           'JAICA CO':'JAICA',    
                           'JANDU PLUMBER CO':'JANDU PLUMBER  CO',    
                           'Japan Government':'JAPAN EMBASSY',    
                           'Jeica':'Jaica',     
                           'KKKT CHURCH':'KKKT',     
                           'KKKT _ Konde and DWE':'KKKT', 
                           'KKKT-Dioces ya Pare':'KKKT', 
                           'KKT':'KKKT',     
                           'KKT C':'KKKT', 
                           'KkKT':'KKKT', 
                           'Kkkt':'KKKT',    
                           'Karugendo':'Kalugendo',    
                           'Kuwait':'KUWAIT',
                           'kuwait':'KUWAIT',
                           'Kilwa company':'Kiliwater',
                           'Kilwater':'Kiliwater',   
                           'Kiliwater r':'Kiliwater',    
                           'Killflora/ Community':'Killflora /Community',    
                           'LOCAL CONTRACT':'Local',    
                           'Local  technician':'Local',    
                           'Local l technician':'Local',    
                           'Local te':'Local',
                           'Local technical':'Local',    
                           'Local technical tec':'Local',    
                           'Local technician':'Local',     
                           'Lawatefuka water sup':'Lawate fuka water su',     
                           'MASWI DRILLING':'MASWI',    
                           'MDRD':'MDRDP', 
                           'MINISTRYOF WATER':'MINISTRY OF WATER',
                           'Ministry of water':'MINISTRY OF WATER',
                           'Ministry of water engineer':'MINISTRY OF WATER',    
                           'MLAKI CO':'MLAKI  CO',    
                           'MTUWASA and Community':'MTUWASA',    
                           'MUWSA':'MUWASA',    
                           'MWE &':'MWE',    
                           'Mackd':'MACK DONALD CONTRACTOR',     
                           'Maji block':'Maji Tech',    
                           'Makonde water population':'Makonde',     
                           'Maswi company':'Maswi',    
                           'Mileniam':'Milenia',    
                           'Mileniam project':'Milenia',    
                           'Milenium':'Milenia',    
                           'Mission':'Missi',    
                           'Morrovian':'Morovian', 
                           'Mosque':'MOSQUE',
                           'Norad':'NORAD',    
                           'NORAD/':'NORAD',    
                           'NYAKILANGANI CO':'NYAKILANGANI  CO',    
                           'NYAKILANGANI CONSTRUCTION':'NYAKILANGANI  CO',    
                           'Naishu construction co. ltd':'Naishu Construction Co. ltd',    
                           'not known':'Unknown',
                           'Not kno':'Unknown',    
                           'Not known':'Unknown',
                           'not known':'Unknown',
                           'Unknown':'Unknown',
                           'OXFARM':'OXFAM',    
                           'Oikos E. Africa':'Oikos E .Africa',    
                           'Oikos E.Afrika':'Oikos E .Africa',    
                           'Olgilai village community':'Oldadai village community',    
                           'Padep':'PADEP',
                           'Plan Internationa':'Plan Int',    
                           'Pr':'Private',    
                           'Priva':'Private',    
                           'Privat':'Private',    
                           'Private Technician':'Private',    
                           'QWICKWIN':'QUWKWIN',
                           'RC':'Roman Catholic',   
                           'RC C':'Roman Catholic',    
                           'RC CATHORIC':'Roman Catholic',    
                           'RC CHURCH':'Roman Catholic',     
                           'RC Ch':'Roman Catholic',    
                           'RC Churc':'Roman Catholic',    
                           'RC Church':'Roman Catholic',    
                           'RC church':'Roman Catholic',
                           'Romam':'Roman Catholic',    
                           'Roman':'Roman Catholic',    
                           'Roman Ca':'Roman Catholic',    
                           'Roman Cathoric -Same':'Roman Catholic',    
                           'Roman Cathoric Same':'Roman Catholic',    
                           'Roman Church':'Roman Catholic',    
                           'Roman catholic':'Roman Catholic',
                           'rc ch':'Roman Catholic',    
                           'RUDEP/':'RUDE',      
                           'RWE /Community':'RWE',    
                           'RWE Community':'RWE',    
                           'RWE/ Community':'RWE',       
                           'RWSSP':'RWSP',    
                           'Regional Water':'Region water Department',       
                           'Rotery c':'Rotary club',    
                           'SAXON BUILDING CONTRACTOR':'SAXON',    
                           'SEMA':'SEMA CO LTD',    
                           'SERENS':'SERENA',    
                           'Shipo':'SHIPO',    
                           'Save the rain USA':'Save the rain',      
                           'School Adminstrarion':'School',    
                           'Serikali ya kijiji':'Serikali',       
                           'TASAF/DMDD':'TASAF',    
                           'TASSAF /TCRS':'TASSAF',    
                           'TASSAF/ TCRS':'TASSAF',     
                           'TCRS /CARE':'TCRS',    
                           'TCRS /Government':'TCRS',    
                           'TCRS /TWESA':'TCRS',   
                           'TCRS a':'TCRS',    
                           'TCRS/village community':'TCRS',    
                           'TLC/community':'TLC',    
                           'TRIDEP':'TREDEP',     
                           'TWESA/ Community':'TWESA',    
                           'The I':'The Isla',
                           'Total landcare':'Total land care',     
                           'Totoland':'Total land care',     
                           'US Embassy':'U.S.A',    
                           'USA EMBASSY':'U.S.A',    
                           'UDC/Sema':'U.S.A',    
                           'UDC/sema':'U.S.A',    
                           'UMOJA DRILLING CONTRUCTO':'UMOJA DRILLING CONSTRUCTION',    
                           'UN ONE':'UN',    
                           'Unicef':'UN',    
                           'Unisef':'UN',
                           'VILLAG':'VILLAGERS',    
                           'VILLAGE COUNCIL':'VILLAGERS',    
                           'villagers':'VILLAGERS',    
                           'VTECOS':'VITECOS',    
                           'Vill':'VILLAGERS',    
                           'Villa':'VILLAGERS',    
                           'Village':'VILLAGERS',   
                           'Village  Council':'VILLAGERS',    
                           'Village Community':'VILLAGERS',    
                           'Village Council':'VILLAGERS',    
                           'Village Counil':'VILLAGERS',    
                           'Village Government':'VILLAGERS',    
                           'Village community':'VILLAGERS',     
                           'Village council':'VILLAGERS',   
                           'Village govt':'VILLAGERS',    
                           'Villagers':'VILLAGERS',    
                           'W/':'W',     
                           'WATER AID':'WATER  AID',    
                           'WATERAID':'WATER  AID', 
                           'WD and ID':'W.D &',     
                           'WEDECO/WESSONS':'WEDECO',
                           'Wedeco':'WEDECO',    
                           'WOULD BANK':'WORLD BANK',
                           'World Bank':'WORLD BANK',    
                           'World bank':'WORLD BANK',     
                           'WWF/':'WWF',    
                           'Water /sema':'Water  Aid/Sema',    
                           'Water Aid/Sema':'Water  Aid/Sema',   
                           'Water Aid /sema':'Water  Aid/Sema',    
                           'Water Aid/Sema':'Water  Aid/Sema',    
                           'Water aid /sema':'Water  Aid/Sema',    
                           'Water aid/sema':'Water  Aid/Sema',    
                           'World Vission':'World Vision',    
                           'World vision':'World Vision',    
                           'wanan':'wananchi',
                           'Bagamoyo wate':'Bagamoyo Wate',    
                           'Borehole':'Bore hole',
                           'Busunzu /Nyankwi gravity water supply':'Busunzu /Nyankwi gravity water',    
                           'Busunzu/ Nyankwi gravity water':'Busunzu /Nyankwi gravity water',    
                           'Busunzu/ Nyankwi gravity water supply':'Busunzu /Nyankwi gravity water',    
                           'Chanjare water supply':'Chanjare  water supply',     
                           "Gwang'a chome water supply":"Gwang'a Chome water supply",    
                           'Handeni Trunk Main(H':'Handeni Trunk Main (',    
                           'Handeni water suply':'Handeni Trunk Main (',    
                           'Handeni water supply':'Handeni Trunk Main (',    
                           'Igoma Water Supply':'Igoma',
                           'Ilolangulu water supply':'Ilolangulu water  supply',                   
                           'Imbaseny pumping wate supply':'Imbaseny pumping wate Supply',
                           'Imbaseny pumping water supply':'Imbaseny pumping wate Supply',                   
                           'Jumuhiya ya watumia maji':'Jumuhiya ya watumia  maji',                                  
                           'Kakonko gravity water supply':'Kakonko /Mbizi gravity water supply',                                    
                           'Kalemela ziwani':'Kalemela Ziwani',                                 
                           'Kalenge Water Supply':'Kalenge',                   
                           'Kanga Dam':'Kanga  Dam',                                    
                           'Kidia kilemapunda':'Kidia kIlemapunda',                                  
                           'Kigonigoni Water supply':'Kigonigoni Water Supply',                               
                           'Kilimi and uchama dam':'Kilimi and Uchama dam',                                   
                           'Kimuli water supp':'Kimuli  water sup',                                
                           'Kirua kahe pumping water trust':'Kirua kahe gravity water supply trust',                          
                           'Kisangilo water scheme':'Kisangilo pipelines',                                
                           'Kumsasa spring source':'Kumsasa  spring source',                
                           'Kyamakata gravity water supply':'Kyamakata Pumping water supply',                                  
                           'LAMP water supplying':'LAMP',                                  
                           'Lake victoria pipe scheme':'Lake Victoria pipe scheme',                                     
                           'Libango water use group scheme':'Libango water scheme',                   
                           'Losaa Kia water supply':'Losaa-Kia water supply',     
                           'Lutende':'Lutende Scheme',    
                           'Lyamungo umbwe water supply':'Lyamungo-Umbwe water supply',     
                           'MAKOGA':'MAKOGA WATER SUPPLY',   
                           'Magati water schem':'Magati gravity water',
                           'Maha':'Maha water supply',                 
                           'Mahida-mawanda pipeline':'Mahida mawanda water supply',                   
                           'Maji Coast(Ruvu)':'Maji Coast(Ruvu',                              
                           'Makanya-Tae water supply':'Makanya Tae water supply',                              
                           'Makiidi water supply':'Makidi water supply',                                 
                           'Mashati dispensary water supply':'Mashati Dispensary water supply',                   
                           'Mashati water supply':'Mashati Dispensary water supply',                             
                           'Maswa Water  supply program':'Maswa Water',                   
                           'Maswa Water supply program':'Maswa Water',                   
                           'Maswa water supply program':'Maswa Water',                                   
                           'Matai group Water Supp':'Matai group  Water Sup',                                  
                           'Mazinde ngua water s':'Mazinde ngua water',                   
                           'Mgaraganza water project':'Mgaraganza  water project',                
                           'Mkongoro one':'Mkongoro One',           
                           'Mkongoro one water project':'Mkongoro One',                   
                           'Mtiro pipeline':'Mtiro  pipeline',               
                           'Mwamagembe water supply':'Mwamagembe village water pipe scheme',                   
                           'NYA/MAK/BUK piped scheme':'NYA/ MAK/ BUK piped scheme',                 
                           'Namahimba water gravity scheme':'Namahimba Water gravity scheme',                     
                           'Ngiresi pipe line':'Ngiresi  pipe line',               
                           'Nyakarundi spring source':'Nyakarundi  spring source',            
                           'Nyanzari water spring':'Nyanzari spring source',                 
                           'Nyaruyoba/ Kasaka gravity water':'Nyaruyoba /Kasaka gravity water',   
                           'Nyaruyoba/Kasaka gravity water':'Nyaruyoba /Kasaka gravity water',             
                           'Nyumba ya mungu pipe scheme':'Nyumba ya Mungu water supply',           
                           'Olkokola-mwandet pipe line':'Olkokola mwandet pipe line',                  
                           'QWICKWIN':'QUICK WINGS',           
                           'Quick wins Program':'QUICK WINGS',                 
                           'Rain water':'Rain Water Harvesting',           
                           'Rain water harvesting':'Rain Water Harvesting',                   
                           'Riftvalley project water supply':'Riftvalley Project water supply',                    
                           'water supply &sanitation program':'water supply &sanitation  program',           
                           'water supply&sanitation program':'water supply &sanitation  program',           
                           'Shallow well':'Shallow Well',
                           'shallow well':'Shallow Well',                   
                           'Shirimatunda water Supply':'Shirimatunda Water Supply',                 
                           'Sinyanga water supplied sch':'Sinyanga  water supplied sc',                  
                           'TM lawate water supply':'TM lawate  water supply',                   
                           'TM part Four':'TM part  Four',                 
                           'TM part one water supply':'TM part One',                  
                           'Toloha water supply':'Toloha Water Supply',                   
                           'Tove mtwango':'Tove Mtwango gravity Scheme',           
                           'Tove-mtwango':'Tove Mtwango gravity Scheme',                
                           'Ruvu Ba':'Upper Ruvu Ba',           
                           'upper Ruvu''Upper ruvu':'Upper Ruvu Ba',           
                           'upper ruvu''Upper ruvu':'Upper Ruvu Ba',       
                           "Uroki-Bomang'ombe water sup":"Uroki Bomang'ombe water sup",           
                           'Water Aid':'Water AID',           
                           'World banks':'World Bank',           
                           'World banks Water supplying':'World Bank',           
                           'World banks water supplying':'World Bank',           
                            'imalinyi water supply schem':'imalinyi supply scheme',           
                           "wanging'ombe water supply s":"wanging'ombe supply scheme"           
                          }

In [5]:
df_X = df_X.replace(cleaning_dict)
Test_X = Test_X.replace(cleaning_dict)

In [6]:
df_y.replace(('functional', 'non functional', 'functional needs repair'), (0,1,2), inplace=True)
df_y.drop(columns = 'id', inplace=True)
df_y.head()


Unnamed: 0,status_group
0,0
1,0
2,0
3,1
4,0


In [7]:

X_train, X_test, y_train, y_test = train_test_split(df_X,df_y)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(44550, 40) (14850, 40) (44550, 1) (14850, 1)


In [8]:
majority_class = df_y.mode()['status_group']
y_act = df_y['status_group']
y_pred = np.full(shape = y_act.shape, fill_value = majority_class)
y_act.shape, y_pred.shape



((59400,), (59400,))

In [9]:
accuracy_score(y_act, y_pred)

0.543080808080808

In [10]:
df_X.nunique()

id                       59400
amount_tsh                  98
date_recorded              356
funder                    1875
gps_height                2428
installer                 1922
longitude                57516
latitude                 57517
wpt_name                 37400
num_private                 65
basin                        9
subvillage               19286
region                      21
region_code                 27
district_code               20
lga                        125
ward                      2092
population                1049
public_meeting               2
recorded_by                  1
scheme_management           12
scheme_name               2608
permit                       2
construction_year           55
extraction_type             18
extraction_type_group       13
extraction_type_class        7
management                  12
management_group             5
payment                      7
payment_type                 7
water_quality                8
quality_

In [11]:
df_X_cat = df_X.drop(columns = ['funder', 'id', 'wpt_name', 'date_recorded', 'longitude', 'latitude', 'recorded_by'])
Test_X_cat = Test_X.drop(columns = ['funder', 'id', 'wpt_name', 'date_recorded', 'longitude', 'latitude', 'recorded_by'])
print(df_X_cat.isna().sum())
df_X_cat = df_X_cat.fillna('Unknown')
Test_X_cat = Test_X_cat.fillna('Unknown')
print(df_X_cat.isna().sum())

amount_tsh                   0
gps_height                   0
installer                 3655
num_private                  0
basin                        0
subvillage                 371
region                       0
region_code                  0
district_code                0
lga                          0
ward                         0
population                   0
public_meeting            3334
scheme_management         3877
scheme_name              28166
permit                    3056
construction_year            0
extraction_type              0
extraction_type_group        0
extraction_type_class        0
management                   0
management_group             0
payment                      0
payment_type                 0
water_quality                0
quality_group                0
quantity                     0
quantity_group               0
source                       0
source_type                  0
source_class                 0
waterpoint_type              0
waterpoi

In [12]:
bins =[0, 1965, 1970, 1975, 1980, 1985, 1990, 1995, 2000, 2005, 2010,2015 ]
df_X_cat['construction_year'] = pd.cut(df_X_cat['construction_year'], bins,include_lowest =True)
Test_X_cat['construction_year'] = pd.cut(Test_X_cat['construction_year'], bins,include_lowest =True)
Test_X_cat['construction_year'].value_counts()

(-0.001, 1965.0]    5090
(2005.0, 2010.0]    2680
(1995.0, 2000.0]    1303
(2000.0, 2005.0]    1224
(1990.0, 1995.0]     843
(1980.0, 1985.0]     743
(1975.0, 1980.0]     622
(1985.0, 1990.0]     617
(2010.0, 2015.0]     595
(1970.0, 1975.0]     507
(1965.0, 1970.0]     134
Name: construction_year, dtype: int64

In [13]:
df_X_cat.nunique()

amount_tsh                  98
gps_height                2428
installer                 1922
num_private                 65
basin                        9
subvillage               19287
region                      21
region_code                 27
district_code               20
lga                        125
ward                      2092
population                1049
public_meeting               3
scheme_management           13
scheme_name               2608
permit                       3
construction_year           11
extraction_type             18
extraction_type_group       13
extraction_type_class        7
management                  12
management_group             5
payment                      7
payment_type                 7
water_quality                8
quality_group                6
quantity                     5
quantity_group               5
source                      10
source_type                  7
source_class                 3
waterpoint_type              7
waterpoi

In [14]:
df_X_gd = pd.get_dummies(data=df_X_cat)
Test_X_gd = pd.get_dummies(data=Test_X_cat)
print(df_X_gd.shape)
print(Test_X_gd.shape)


(59400, 26226)
(14358, 13044)


In [15]:
drop_col1 = df_X_gd.columns.difference(Test_X_gd.columns).tolist()
drop_col2 = Test_X_gd.columns.difference(df_X_gd.columns).tolist()

df_X_gd = df_X_gd.drop(columns = drop_col1)
Test_X_gd = Test_X_gd.drop(columns = drop_col2)

print(df_X_gd.shape)
print(Test_X_gd.shape)


(59400, 10503)
(14358, 10503)


In [16]:
X1_train, X1_test, y1_train, y1_test = train_test_split(df_X_gd,df_y)
print(X1_train.shape, Test_X_gd.shape, X1_test.shape, y1_train.shape, y1_test.shape)

(44550, 10503) (14358, 10503) (14850, 10503) (44550, 1) (14850, 1)


In [17]:
log_reg = LogisticRegression().fit(X1_train, y1_train.values.ravel())
log_reg.score(X1_train, y1_train)



0.792368125701459

In [18]:
y1_pred = log_reg.predict(X1_test)
accuracy_score(y1_test, y1_pred)

0.7709090909090909

In [19]:
submit['status_group']=log_reg.predict(Test_X_gd)
submit.head()

Unnamed: 0,id,status_group
0,50785,1
1,51630,0
2,17168,1
3,45559,1
4,49871,0


In [20]:
submit.replace((0,1,2),('functional', 'non functional', 'functional needs repair'), inplace=True)
submit.sample(10)

Unnamed: 0,id,status_group
1604,20847,functional
7727,17981,functional
7082,29511,non functional
2247,23414,functional
6690,30607,functional
11857,47768,functional
8494,72898,non functional
3302,50360,functional
12096,40701,functional
2720,57433,functional


In [21]:
#submit.to_csv (r'C:\Users\rick1\Google Drive\Jupyter Notebooks\Project2\logRegSubmission_6.csv', index = False, header=True)

In [26]:
m = RandomForestClassifier(n_estimators=100,min_samples_leaf=3 ,n_jobs=-1,max_features=0.25)
%time m.fit(X1_train, y1_train.values.ravel())
y_pred= m.predict(X1_test)
accuracy_score(y1_test, y_pred)

Wall time: 41min 49s


0.8049158249158249

In [23]:
submit['status_group'] = m.predict(Test_X_gd)
submit.head()

Unnamed: 0,id,status_group
0,50785,1
1,51630,0
2,17168,0
3,45559,1
4,49871,0


In [24]:
submit.replace((0,1,2),('functional', 'non functional', 'functional needs repair'), inplace=True)
submit.sample(10)

Unnamed: 0,id,status_group
12633,46466,non functional
13065,66741,functional
2503,8821,functional
12681,72868,functional
13257,4434,functional
1113,49680,functional
13568,67575,non functional
5677,47582,functional
2850,71768,functional
8137,33760,non functional


In [27]:
submit.to_csv (r'C:\Users\rick1\Google Drive\Jupyter Notebooks\Project2\RandForestSubmission_9.csv', index = False, header=True)