In [1]:
import json
import os
import pickle
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xgboost
from pandas_profiling import ProfileReport

from sklearn.compose import ColumnTransformer
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import plot_confusion_matrix, recall_score,\
    accuracy_score, precision_score, f1_score, classification_report, ConfusionMatrixDisplay, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer, OrdinalEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, FunctionTransformer

In [2]:
from src.pandas_helper import colInfo
from src.pandas_helper import modelReport
from src.pandas_helper import getAllCrossValScores
from src.pandas_helper import prettyPrintGridCVResults


from src.sklearn_helper import CoordinatesImputer

# Helper Functions

# Load Data

In [3]:
# Training features, their labels, and the unlabeled competition set
X = pd.read_csv('./data/Training_set.csv')
y = pd.read_csv('./data/Training_labels.csv')['status_group']
Test_Values = pd.read_csv('./data/Test_set.csv')

# Hold-out split for local evaluation; the fixed seed keeps it reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [None]:
# Show which serialized models are available on disk
for model_filename in os.listdir('./models/'):
    print(model_filename)

In [None]:
# Human-readable chart titles, keyed by the pickle file name found in ./models/
titles = {
    'baseline_00.sav': 'Dummy Classifier',
    'knn_gs1.sav': 'KNN Classifier - Hypertune',
    'knn_model_pipe.sav': 'KNN Classifier - Simple',
    'log_reg_gs1.sav': 'Logistic Regression Classifier - Hypertuned',
    'log_reg_pipe.sav': 'Logistic Regression Classifier - Simple',
    'rfc_gs1.sav': 'Random Forest Classifier - Hypertuned',
    'rfc_imb_gs1.sav': 'Random Forest Classifier - Hypertuned + SMOTE',
    'rfc_model_pipe.sav': 'Random Forest Classifier - Simple',
    'xgbrf_gs1.sav': 'XGBoost - Hypertune 1',
    'xgbrf_gs2.sav': 'XGBoost - Hypertune 2',
    'xgbrf_gs3.sav': 'XGBoost - Hypertune 3',
    'xgbrf_gs4.sav': 'XGBoost - Hypertune 4',
    'xgbrf_imb_gs1.sav': 'XGBRF Classifier - Hypertune + SMOTE 1',
    'xgbrf_imb_gs2.sav': 'XGBRF Classifier - Hypertune + SMOTE 2',
    'XGBRF_model_pipe.sav': 'XGBRF Classifier - Simple',
}

# Loaded estimators, keyed by their chart title
models = {}

# NOTE(security): pickle.load can execute arbitrary code embedded in the file;
# only load model files that were produced by this project.
for pickle_file in os.listdir('./models/'):
    title = titles.get(pickle_file)
    if title is None:
        # Stray entries (hidden files, checkpoint folders, ...) have no title;
        # skip them instead of aborting the whole cell with a KeyError.
        print(f'Skipping {pickle_file}: no title registered')
        continue

    print(pickle_file)
    # The context manager closes the handle even if unpickling fails
    with open(f'./models/{pickle_file}', 'rb') as model_file:
        models[title] = pickle.load(model_file)

In [None]:
# Save one row-normalized confusion matrix per loaded model, evaluated on the
# hold-out split (X_test, y_test), as a JPEG named after the model title.
normalized_cm_dir = 'images/normalized_confusion_matrices'
os.makedirs(normalized_cm_dir, exist_ok=True)  # savefig fails if the folder is missing

for title, estimator in models.items():
    fig, ax = plt.subplots(figsize=(10, 10))

    # Search objects expose the refit model as best_estimator_; anything
    # without that attribute is used directly.
    mod = getattr(estimator, 'best_estimator_', estimator)

    ConfusionMatrixDisplay.from_estimator(
        mod,
        X_test,
        y_test,
        display_labels=mod.classes_,
        cmap='OrRd',
        normalize='true',
        ax=ax,
    )
    ax.set_title(title)

    fig.savefig(f'{normalized_cm_dir}/{title}.jpg',
                bbox_inches='tight',
                transparent=False)

    # Close (not just clear) the figure so open figures do not pile up in memory
    plt.close(fig)
    


In [None]:
# Save one raw-count confusion matrix per loaded model, evaluated on the
# hold-out split (X_test, y_test), as a JPEG named after the model title.
cm_dir = 'images/confusion_matrices'
os.makedirs(cm_dir, exist_ok=True)  # savefig fails if the folder is missing

for title, estimator in models.items():
    fig, ax = plt.subplots(figsize=(10, 10))

    # Search objects expose the refit model as best_estimator_; anything
    # without that attribute is used directly.
    mod = getattr(estimator, 'best_estimator_', estimator)

    ConfusionMatrixDisplay.from_estimator(
        mod,
        X_test,
        y_test,
        display_labels=mod.classes_,
        cmap='OrRd',
        ax=ax,
    )
    ax.set_title(title)

    fig.savefig(f'{cm_dir}/{title}.jpg',
                bbox_inches='tight',
                transparent=False)

    # Close (not just clear) the figure so open figures do not pile up in memory
    plt.close(fig)

In [None]:
#Impute nans as well as 'unknown's
subpipe_cat1 = Pipeline(steps=[
    ('cat1_imp1', SimpleImputer(missing_values=np.nan,strategy='most_frequent')),
    ('cat1_imp2', SimpleImputer(missing_values='unknown',strategy='most_frequent')),
    ('cat1_ohe', OneHotEncoder(sparse=False, handle_unknown='ignore'))
])

#Only OneHotEncode
subpipe_cat2 = Pipeline(steps=[
    ('cat2_ohe', OneHotEncoder(sparse=False, handle_unknown='ignore'))
])

#Impute median and scale (0,1)
subpipe_num = Pipeline(steps=[
    ('num_imp', SimpleImputer(missing_values=np.nan,strategy='median')),
    ('num_sc', MinMaxScaler()),    
])


#Specifically designed for imputing erroneous coordinates
#As part of your column transformations,an additional column (default='region') will be needed to be passed
#Returning dataframe excludes this additional column
#Impute (0,0) coordinates from the corresponding regions mean/median  and scale (0,1)
subpipe_coord = Pipeline(steps=[
    ('coord_imp', CoordinatesImputer(groupByTarget='region',metric='median')), 
    ('coord_sc', MinMaxScaler()),    
])


ct = ColumnTransformer(transformers=[

    ('ct_cyr', subpipe_num, ['construction_year']),
    
    # Note: region is passed to allow groupby. Not returned.
    ('ct_coord', subpipe_coord, ['latitude', 'longitude', 'region']),
    
    ('ct_cat1', subpipe_cat1, ['public_meeting','scheme_management', 'permit', 'source_class']),
    
    ('ct_cat2', subpipe_cat2, ['basin', 'extraction_type_class', 'management', 'payment', 'quality_group',
                               'quantity', 'source_type', 'waterpoint_type_group']),

])

In [4]:
# Load the pickled 'xgbrf_gs4' search object and keep only its best estimator.
# NOTE(security): pickle.load can execute arbitrary code embedded in the file;
# only load model files that were produced by this project.
with open('models/xgbrf_gs4.sav', 'rb') as model_file:  # closes the handle after loading
    final_model = pickle.load(model_file).best_estimator_

In [5]:
# Display the selected estimator's repr to inspect its steps
final_model

Pipeline(steps=[('ct',
                 ColumnTransformer(transformers=[('ct_cyr',
                                                  Pipeline(steps=[('num_imp',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('num_sc',
                                                                   MinMaxScaler())]),
                                                  ['construction_year']),
                                                 ('ct_coord',
                                                  Pipeline(steps=[('coord_imp',
                                                                   CoordinatesImputer()),
                                                                  ('coord_sc',
                                                                   MinMaxScaler())]),
                                                  ['latitude', 'longitude',
                           

In [6]:
# Fit the selected pipeline on the complete labeled data (X, y).
# NOTE(review): the next fit cell refits final_model on X_train only; whichever
# fit ran last determines what predict() returns - confirm before building a submission.
final_model.fit(X,y)

Parameters: { criterion } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




Pipeline(steps=[('ct',
                 ColumnTransformer(transformers=[('ct_cyr',
                                                  Pipeline(steps=[('num_imp',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('num_sc',
                                                                   MinMaxScaler())]),
                                                  ['construction_year']),
                                                 ('ct_coord',
                                                  Pipeline(steps=[('coord_imp',
                                                                   CoordinatesImputer()),
                                                                  ('coord_sc',
                                                                   MinMaxScaler())]),
                                                  ['latitude', 'longitude',
                           

In [25]:
# Refit on the training split only - presumably so X_test is unseen when the
# hold-out predictions below are scored (TODO confirm intent).
# NOTE(review): this replaces the earlier fit on the full data (X, y), and the
# execution counts show these cells were not run in top-to-bottom order.
final_model.fit(X_train,y_train)

Parameters: { criterion } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




Pipeline(steps=[('ct',
                 ColumnTransformer(transformers=[('ct_cyr',
                                                  Pipeline(steps=[('num_imp',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('num_sc',
                                                                   MinMaxScaler())]),
                                                  ['construction_year']),
                                                 ('ct_coord',
                                                  Pipeline(steps=[('coord_imp',
                                                                   CoordinatesImputer()),
                                                                  ('coord_sc',
                                                                   MinMaxScaler())]),
                                                  ['latitude', 'longitude',
                           

In [39]:
# Predict labels for the hold-out split with the currently fitted final_model
y_pred = final_model.predict(X_test)

In [40]:
# Display the predicted labels
y_pred

array(['non functional', 'functional', 'functional', ...,
       'functional needs repair', 'functional', 'non functional'],
      dtype=object)

In [95]:
# Display the true hold-out labels for comparison with y_pred
y_test

2980              non functional
5246                  functional
22659                 functional
39888             non functional
13361                 functional
                  ...           
20338                 functional
34426             non functional
23976    functional needs repair
39317             non functional
1888              non functional
Name: status_group, Length: 14850, dtype: object

In [135]:
# Per-row correctness flag, explicitly indexed like y_test so it joins back onto
# X_test row-for-row (y_pred is a plain array in the same row order as X_test).
is_correct = pd.Series(
    y_pred == y_test.to_numpy(),
    index=y_test.index,
    name='regional_accuracy',
)

# The mean of a boolean column is the share of correct predictions, i.e. accuracy per region
df_viz = (
    X_test[['region']]
    .join(is_correct)
    .groupby('region')['regional_accuracy']
    .mean()
    .reset_index()
)

# Rename the region that is spelled differently elsewhere ('Dar-Es-Salaam')
df_viz['region'] = df_viz['region'].replace('Dar es Salaam', 'Dar-Es-Salaam')
df_viz

Unnamed: 0,region,regional_accuracy
0,Arusha,0.764777
1,Dar-Es-Salaam,0.878947
2,Dodoma,0.808394
3,Iringa,0.9033
4,Kagera,0.773221
5,Kigoma,0.67688
6,Kilimanjaro,0.769772
7,Lindi,0.8
8,Manyara,0.784416
9,Mara,0.791322


In [141]:
import folium

# Map centre: the coordinates of Tanzania per Google Maps
Tanzania_coord = (-6.3690, 34.8888)

m = folium.Map(location=Tanzania_coord, zoom_start=6, tiles='CartoDB positron')

# Boundary polygons used both as a plain outline layer and as the choropleth geometry
with open('./data/stanford-tn398yw9512-geojson_districtBoundary.json', 'r', encoding='utf8') as f:
    geoJSON_raw = json.load(f)

geo_j = folium.GeoJson(data=geoJSON_raw)
geo_j.add_to(m)

# Shade each polygon by the hold-out accuracy of the region named in its 'adm1' property
folium.Choropleth(
    geo_data=geoJSON_raw,
    name="choropleth",
    data=df_viz,
    columns=["region", "regional_accuracy"],
    key_on="feature.properties.adm1",
    fill_opacity=0.8,
    line_opacity=0.33,
    # The legend previously read "House Sales", left over from another project
    legend_name="Prediction accuracy by region",
    highlight=True,
).add_to(m)

# Save the map to an HTML file (create the output folder on first run)
os.makedirs('./maps', exist_ok=True)
m.save('./maps/Accuracy Choropleth.html')

m

In [151]:
# Collect the 'adm2' property of every feature in the loaded GeoJSON
reg_list_geo = [feature['properties']['adm2'] for feature in geoJSON_raw['features']]

In [152]:
# Number of distinct 'adm2' names found in the GeoJSON
len(set(reg_list_geo))

111

# For submissions

In [None]:
# Predict labels for the unlabeled competition set.
# NOTE(review): final_model is fitted twice above (on X, y and on X_train, y_train);
# confirm which fit is in effect before submitting.
Test_predictions = final_model.predict(Test_Values)

In [None]:
# Display the predicted labels for the competition set
Test_predictions

In [None]:
# Preview the predictions as a one-column 'status_group' frame
pd.DataFrame(Test_predictions, columns =['status_group'])

In [None]:
# Pair every test-set id with its predicted label. Building the frame from a
# dict attaches the prediction array by position, so the result no longer
# depends on Test_Values having a default 0..n-1 index (an index-based join
# would silently produce NaN labels otherwise).
submission = pd.DataFrame({'id': Test_Values.id, 'status_group': Test_predictions})


In [None]:
# Timestamp in seconds
ts = int(time.time())

submission.to_csv(f'data/SaadSaeed_{ts}.csv',index=False)

In [None]:
# Map centre passed as `location` to folium.Map in the next cell.
# NOTE(review): repeats the constant and the folium import from the choropleth cell above.
Tanzania_coord = (-6.3690,34.8888)
import folium

In [None]:
# Base map for a riverways overlay. With the layer code below commented out,
# only the tiles are rendered and saved.
m=folium.Map(location=Tanzania_coord, zoom_start=4, tiles='CartoDB dark_matter')

# NOTE(review): this cell uses '../data' and '../maps', while every other cell
# uses './data' and './maps' - confirm which working directory is intended.
# NOTE(review): the loaded GeoJSON is not used while the layer is commented out,
# and it overwrites the `geoJSON_raw` loaded for the choropleth.
with open('../data/Tanzania_riverways.json', 'r',encoding='utf8') as f:
    geoJSON_raw = json.load(f)

# geo_j = folium.GeoJson(data=geoJSON_raw,
#                            style_function=lambda x: {'fillColor': 'orange'})
# geo_j.add_to(m)


# folium.Marker(
#     location=(0,-2e-08),
#     icon=folium.Icon(color="blue",icon="tint", prefix='fa')
# ).add_to(m)


#Save the map to an HTML file
m.save('../maps/Riverways.html')

m

In [None]:
# Reload the raw training features and labels from disk
X = pd.read_csv('data/Training_set.csv')
y = pd.read_csv('data/Training_labels.csv')['status_group']