In [262]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import xgboost as xgb
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier

from sklearn.impute import SimpleImputer, KNNImputer

from keras.utils import np_utils
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, LabelBinarizer

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from numpy import absolute

from sklearn.feature_selection import RFE

from keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras import Sequential
import keras

from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from numpy import mean
from numpy import std

from sklearn.model_selection import GridSearchCV

%matplotlib inline

In [None]:
values = pd.read_csv('Training_values.csv')
labels = pd.read_csv('Training_labels.csv')

In [None]:
df = pd.concat([labels,values], axis=1)

In [None]:
df = df.loc[:,~df.columns.duplicated()]

In [None]:
df.head(10)

In [None]:
df.shape

33 categorical 4 continuous

date_recorded  -  object
funder  -  object
installer  -  object
wpt_name  -  object
num_private  -  int64
basin  -  object
subvillage  -  object
region  -  object
region_code  -  int64
district_code  -  int64
lga  -  object
ward  -  object
public_meeting  -  object
recorded_by  -  object
scheme_management  -  object
scheme_name  -  object
permit  -  object
construction_year  -  int64
extraction_type  -  object
extraction_type_group  -  object
extraction_type_class  -  object
management  -  object
management_group  -  object
payment  -  object
payment_type  -  object
water_quality  -  object
quality_group  -  object
quantity  -  object
quantity_group  -  object
source  -  object
source_type  -  object
source_class  -  object
waterpoint_type  -  object
waterpoint_type_group  -  object
status_group  -  object

id  -  int64
amount_tsh  -  float64
longitude  -  float64
latitude  -  float64
population  -  int64


In [None]:
def preprocessing(df):
    print("Preprocessing Data")
    df = df.drop_duplicates()
    #for column in df.columns:
    #    print(column," - ", df[column].dtype)
    df.isna().sum()
    #df.dropna(inplace=True)
    df.isna().sum()

    drop_columns = ["id", "date_recorded", "funder", "installer", "longitude",
                    "latitude", "wpt_name", "region", "region_code",
                    "district_code", "lga", "ward", "recorded_by",
                    "scheme_management", "scheme_name", "construction_year", "extraction_type",
                    "extraction_type_group", "extraction_type_class", "management", "management_group",
                    "payment", "payment_type", "waterpoint_type", "waterpoint_type_group", "subvillage"]


    df.drop(drop_columns, inplace=True, axis=1)
    df.drop('num_private', inplace=True, axis=1)
    
    #for columns in df.columns:
    #    print(columns, " - ", df[columns].dtype)

    return df

In [None]:
def forward_fill(df):
    print("Using Forward Fill to handle missing data")
    return df.ffill(axis = 0)

In [None]:
def backward_fill(df):
    print("Using Backward Fill to handle missing data")
    return df.bfill(axis=0)

In [None]:
def simpleImputer(df):
    print("Using Simple Imputer to handle missing data")

    imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent', add_indicator=True)

    for column in df.columns:
        if df[column].dtype == 'object':
            df[column] = imputer.fit_transform(df[column].values.reshape(-1, 1))
    return df


In [None]:
def KnnImputer(df):
    print("Using KNN Imputer to handle missing data")
    imputer = KNNImputer(missing_values=np.nan,n_neighbors=5, add_indicator=True)
    for column in df.columns:
        df[column] = imputer.fit_transform(df[column].values.reshape(-1, 1))
    return df



In [None]:
def visualisation(df):
    print("Visualising data")
    df.plot(kind='box', subplots = True, layout = (24,5), figsize = (30,40))
    
    df.plot(kind='scatter', subplots=True, layout=(24, 5), figsize=(30, 40))
    
    df = df.drop(df[(np.absolute(df['latitude']) <= 1) &
                 (np.absolute(df['longitude']) <= 1)].index)
    
    map_img = plt.imread('map.jpg')


    plot = sns.scatterplot(data=df, y='longitude', x='latitude',
                        hue='status_group', palette='colorblind', alpha=0.6, zorder=2)

    plot.imshow(map_img,
                aspect=plot.get_aspect(),
                extent=plot.get_xlim() + plot.get_ylim(),
                zorder=1)

    sns.countplot(data = df, x = 'status_group')
    

In [None]:
def oneHotEncoding(df):
    print("Using Binary Encoding to handle string Data")
    
    df_new = df.copy()

    encoder = LabelEncoder()
    df_new['status_group'] = encoder.fit_transform(df['status_group'].values)
    y = np_utils.to_categorical(df_new['status_group'])
    
    oneHot = OneHotEncoder(categories='auto', sparse=False, dtype=int, handle_unknown='ignore')
    columns = []
    for column in df.columns:
        if str(df_new[column].dtype) == "object":
            columns.append(column)

    print(columns)
        
    oneHot_encoded = pd.DataFrame(oneHot.fit_transform(df_new[columns]), index=df_new.index, columns=oneHot.get_feature_names(df_new[columns].columns))
    df_new.drop(columns, inplace = True, axis = 1)
    df_new = pd.concat([df_new,oneHot_encoded], axis = 1)
    
    return df_new, y



In [None]:
def labelEncoding(df):
    print("Using Label Encoder to handle string data")

    encoder_1 = LabelEncoder()
    label_columns = ['quantity_group', 'quality_group']

    df_new = df.copy()

    labels = {}

    for column in df.columns:
        if column in label_columns:
            df_new[column] = encoder_1.fit_transform(df[column].values)
            labels[column] = encoder_1.classes_

    graph = sns.barplot(data=df_new, x='quantity_group',
                        y='quality_group', hue='status_group')
    graph.set_xticklabels(labels['quantity_group'])
    graph.set_yticklabels(labels['quality_group'])

    encoder = LabelEncoder()
    #label_columns = ['basin','public_health','permit','water_quality','quantity','quantity_group','quality','quality_group','source','source_type','source_class']

    df['status_group'] = encoder.fit_transform(df['status_group'].values)
    y = np_utils.to_categorical(df['status_group'])

    for column in df.columns:
        if str(df[column].dtype) == "object":
            df[column] = encoder.fit_transform(df[column].values)
    
    df.head(10)
    
    return df, y

In [None]:
def heatmap_pearson(df):
    sns.heatmap(df.corr('pearson'))

In [None]:
def heatmap_spearman(df):
    sns.heatmap(df.corr('spearman'), cmap='crest')


In [None]:
#sns.heatmap(data=df_new, x='quantity_group', y='quality_group', hue='status_group')

In [None]:
def minMaxScaler(df):
    print("Using Min Max Scaler to Normalise the continuous data")

    normalize_columns = ["amount_tsh", "gps_height", "population"]

    for column in normalize_columns:
        df[column] = (df[column] - min(df[column])) / (max(df[column]) - min(df[column]))
    
    return df


In [None]:
def standardScaler(df):
    print("Using Standard Scaler to standardise the data")
    
    scaler = StandardScaler()
    
    normalize_columns = ["amount_tsh", "gps_height", "population"]
    
    scaled_df = scaler.fit_transform(df[normalize_columns].to_numpy())
    scaled_df = pd.DataFrame(scaled_df, columns=normalize_columns, index= df.index)
    df = df.drop(normalize_columns, axis = 1)
    df = pd.concat([df, scaled_df], axis=1)  
    return df    

In [None]:
def feature_selection_correlation(df):
    print("Using Pearson Correlation for feature selection")
    
    sns.heatmap(df.corr('pearson'))

    correlations = df.corr('pearson')

    corr_df = correlations[-1:]

    print(corr_df)

    fig, ax = plt.subplots(figsize=(120, 1))
    sns.heatmap(corr_df, annot=True, ax=ax)
    
    threshold = 0.01

    columns = list(corr_df.columns.values)
    row = corr_df.iloc[0]

    #test_columns = []

    #for column in columns:
    #    if absolute(corr_df.iloc[0][column]) < threshold:
    #        df.drop([column], axis=1, inplace=True)
    #    else:
    #        test_columns.append(column)
    #print(test_columns)
    
    return df


In [None]:
def feature_selection_importance(df):
    print("Using Random Forest for feature importance")

    X = df.drop('status_group', axis=1)
    y = df['status_group']

    feature_names = [f"feature {i}" for i in range(X.shape[1])]
    forest = RandomForestClassifier(random_state=0)
    forest.fit(X, y)
    
    importances = forest.feature_importances_
    print(importances)
    print(feature_names)    
    threshold = 0.01

    columns = list(corr_df.columns.values)
    row = corr_df.iloc[0]

    #test_columns = []

    #for column in columns:
    #    if absolute(corr_df.iloc[0][column]) < threshold:
    #        df.drop([column], axis=1, inplace=True)
    #    else:
    #        test_columns.append(column)
    #print(test_columns)

    return df

#feature_selection_importance(df_test)


In [272]:
def recursive_feature_selection(df):
    X = df.drop('status_group', axis=1)
    y = df['status_group']
    for i in range(1,len(X.columns)+1):
        rfe = RFE(DecisionTreeClassifier(), n_features_to_select = i)
        model = xgb.XGBClassifier(objective='multi:softprob')
        pipeline = Pipeline(steps = [('s', rfe),('m', model)])
        cv = RepeatedStratifiedKFold(n_splits = 10, n_repeats=3 , random_state= 2)
        n_scores = cross_val_score(pipeline, X, y, scoring='accuracy', cv = cv, n_jobs = -1, error_score='raise')
        print('Accuracy: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))
    

In [266]:
df_new = preprocessing(df)
df_new.head(10)

Preprocessing Data


Unnamed: 0,amount_tsh,gps_height,basin,population,public_meeting,permit,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,status_group
0,6000.0,1390,Lake Nyasa,109,True,False,soft,good,enough,enough,spring,spring,groundwater,functional
1,0.0,1399,Lake Victoria,280,,True,soft,good,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,functional
2,25.0,686,Pangani,250,True,True,soft,good,enough,enough,dam,dam,surface,functional
3,0.0,263,Ruvuma / Southern Coast,58,True,True,soft,good,dry,dry,machine dbh,borehole,groundwater,non functional
4,0.0,0,Lake Victoria,0,True,True,soft,good,seasonal,seasonal,rainwater harvesting,rainwater harvesting,surface,functional
5,20.0,0,Pangani,1,True,True,salty,salty,enough,enough,other,other,unknown,functional
6,0.0,0,Internal,0,True,True,soft,good,enough,enough,machine dbh,borehole,groundwater,non functional
7,0.0,0,Lake Tanganyika,0,True,True,milky,milky,enough,enough,shallow well,shallow well,groundwater,non functional
8,0.0,0,Lake Tanganyika,0,True,True,salty,salty,seasonal,seasonal,machine dbh,borehole,groundwater,non functional
9,0.0,0,Lake Victoria,0,True,True,soft,good,enough,enough,shallow well,shallow well,groundwater,functional


In [267]:
df_test,y = oneHotEncoding(df_new)

Using Binary Encoding to handle string Data
['basin', 'public_meeting', 'permit', 'water_quality', 'quality_group', 'quantity', 'quantity_group', 'source', 'source_type', 'source_class']




In [268]:
df_test.head(5)

Unnamed: 0,amount_tsh,gps_height,population,status_group,basin_Internal,basin_Lake Nyasa,basin_Lake Rukwa,basin_Lake Tanganyika,basin_Lake Victoria,basin_Pangani,...,source_type_borehole,source_type_dam,source_type_other,source_type_rainwater harvesting,source_type_river/lake,source_type_shallow well,source_type_spring,source_class_groundwater,source_class_surface,source_class_unknown
0,6000.0,1390,109,0,0,1,0,0,0,0,...,0,0,0,0,0,0,1,1,0,0
1,0.0,1399,280,0,0,0,0,0,1,0,...,0,0,0,1,0,0,0,0,1,0
2,25.0,686,250,0,0,0,0,0,0,1,...,0,1,0,0,0,0,0,0,1,0
3,0.0,263,58,2,0,0,0,0,0,0,...,1,0,0,0,0,0,0,1,0,0
4,0.0,0,0,0,0,0,0,0,1,0,...,0,0,0,1,0,0,0,0,1,0


In [269]:
df_test.columns

Index(['amount_tsh', 'gps_height', 'population', 'status_group',
       'basin_Internal', 'basin_Lake Nyasa', 'basin_Lake Rukwa',
       'basin_Lake Tanganyika', 'basin_Lake Victoria', 'basin_Pangani',
       'basin_Rufiji', 'basin_Ruvuma / Southern Coast', 'basin_Wami / Ruvu',
       'public_meeting_False', 'public_meeting_True', 'public_meeting_nan',
       'permit_False', 'permit_True', 'permit_nan', 'water_quality_coloured',
       'water_quality_fluoride', 'water_quality_fluoride abandoned',
       'water_quality_milky', 'water_quality_salty',
       'water_quality_salty abandoned', 'water_quality_soft',
       'water_quality_unknown', 'quality_group_colored',
       'quality_group_fluoride', 'quality_group_good', 'quality_group_milky',
       'quality_group_salty', 'quality_group_unknown', 'quantity_dry',
       'quantity_enough', 'quantity_insufficient', 'quantity_seasonal',
       'quantity_unknown', 'quantity_group_dry', 'quantity_group_enough',
       'quantity_group_insuffic

In [270]:
df_test = KnnImputer(df_test)

Using KNN Imputer to handle missing data


In [273]:
recursive_feature_selection(df_test)

Accuracy: 0.675 (0.005)


In [None]:
def sequential_model(X_train, X_test, y_train, y_test):
    print("Sequential Model")
    model = Sequential()
    model.add(Dense(1000, activation='sigmoid', input_shape=(X_train.shape[1],)))
    model.add(Dense(500, activation='sigmoid'))
    model.add(Dense(250, activation='sigmoid'))
    model.add(Dense(200, activation='sigmoid'))
    model.add(Dense(100, activation='sigmoid'))
    model.add(Dense(50, activation='sigmoid'))
    model.add(Dense(25, activation='sigmoid'))
    model.add(Dense(3, activation='softmax'))

    es = keras.callbacks.EarlyStopping(monitor='val_loss',
                                    mode='min',
                                    patience=10,
                                    restore_best_weights=True)

    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    
    model.fit(X_train, y_train, batch_size=128, epochs=500, shuffle=True, verbose=2)
    
    y_pred = model.predict(X_test)

    for y in y_pred:
        for i in range(len(y)):
            if y[i] > 0.5:
                y[i] = 1
            else:
                y[i] = 0
    
    return y_pred

In [None]:
def XGBoost(X_train, X_test, y_train, y_test):
    print("XGBoost Model")
    model = xgb.XGBClassifier(objective = 'multi:softprob', random_state = 1, n_estimators = 200, max_depth = 5)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(y_pred)
    
    return y_pred
    

In [None]:
def AdaBoost(X_train, X_test, y_train, y_test):
    print("ADABoost Model")
    parameters = {
        'n_estimators':[i for i in range(100,1000,100)],
        'learning_rate':[i*0.1 for i in range(1,10,1)]
    }
    model = AdaBoostClassifier(random_state=0)
    ada = GridSearchCV(model, parameters)
    ada.fit(X_train, y_train)
    y_pred = ada.predict(X_test)
    print(y_pred)
    print(ada.cv_results_)
    
    return y_pred

In [None]:
def GradientBoost(X_train, X_test, y_train, y_test):
    print("Gradient Boost Model")
    parameters = {
        'n_estimators': [i for i in range(100, 1000, 100)],
        'learning_rate': [i*0.1 for i in range(1, 10, 1)],
        'criterion': ['friedman_mse', 'squared_error']
    }
    model = GradientBoostingClassifier()
    gb = GridSearchCV(model, parameters)
    gb.fit(X_train, y_train)
    y_pred = gb.predict(X_test)
    print(y_pred)
    print(gb.cv_results_)

    return y_pred


In [None]:
def Report(y_test, y_pred):
    print("Generating Report")
    from sklearn.metrics import classification_report, confusion_matrix

    labels = ['0', '1', '2']

    report = classification_report(y_test, y_pred, target_names=labels)

    return report

In [None]:
visualisation(df)

In [None]:
def Approach_1(df):
    df = preprocessing(df)
    df = simpleImputer(df)
    df, y = labelEncoding(df)
    df = minMaxScaler(df)
    df = feature_selection_correlation(df)
    
    X = df.drop('status_group', axis=1)
    y = df['status_group']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, shuffle = True)
    print(X.shape)
    print(y.shape)
    print(X_train.shape)
    print(y_train.shape)
    print(X_test.shape)
    print(y_test.shape)
    y_pred = XGBoost(X_train, X_test, y_train, y_test)
    report = Report(y_test, y_pred)
    print(report)


In [None]:
def Approach_2(df):
    df = preprocessing(df)
    df = simpleImputer(df)
    df, y = labelEncoding(df)
    df = minMaxScaler(df)
    df = feature_selection_correlation(df)

    X = df.drop('status_group', axis=1)
    y = df['status_group']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True)
    print(X.shape)
    print(y.shape)
    print(X_train.shape)
    print(y_train.shape)
    print(X_test.shape)
    print(y_test.shape)
    y_pred = AdaBoost(X_train, X_test, y_train, y_test)
    report = Report(y_test, y_pred)
    print(report)


In [None]:
def Approach_3(df):
    df = preprocessing(df)
    df = simpleImputer(df)
    df, y = labelEncoding(df)
    df = minMaxScaler(df)
    df = feature_selection_correlation(df)

    X = df.drop('status_group', axis=1)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True)
    print(X.shape)
    print(y.shape)
    print(X_train.shape)
    print(y_train.shape)
    print(X_test.shape)
    print(y_test.shape)
    y_pred = sequential_model(X_train, X_test, y_train, y_test)
    report = Report(y_test, y_pred)
    print(report)


In [None]:
def Approach_4(df):
    df = preprocessing(df)
    df = simpleImputer(df)
    df, y = labelEncoding(df)
    df = minMaxScaler(df)
    df = feature_selection_correlation(df)

    X = df.drop('status_group', axis=1)
    y = df['status_group']
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, shuffle=True)
    print(X.shape)
    print(y.shape)
    print(X_train.shape)
    print(y_train.shape)
    print(X_test.shape)
    print(y_test.shape)
    y_pred = GradientBoost(X_train, X_test, y_train, y_test)
    report = Report(y_test, y_pred)
    print(report)


In [None]:
Approach_1(df)

In [None]:
Approach_2(df)

In [None]:
Approach_3(df)

In [None]:
Approach_4(df)

In [None]:
#X = np.array(X)

              precision    recall  f1-score   support

           0       0.70      0.81      0.75      6350
           1       0.54      0.09      0.15       890
           2       0.75      0.57      0.64      4640

   micro avg       0.71      0.66      0.69     11880
   macro avg       0.66      0.49      0.52     11880
weighted avg       0.71      0.66      0.67     11880
 samples avg       0.66      0.66      0.66     11880

              precision    recall  f1-score   support

           0       0.70      0.82      0.76      6404
           1       0.49      0.09      0.15       844
           2       0.77      0.52      0.63      4632

   micro avg       0.72      0.65      0.68     11880
   macro avg       0.65      0.48      0.51     11880
weighted avg       0.71      0.65      0.66     11880
 samples avg       0.65      0.65      0.65     11880

              precision    recall  f1-score   support

           0       0.72      0.82      0.77      6469
           1       0.40      0.11      0.18       839
           2       0.77      0.59      0.67      4572

   micro avg       0.73      0.68      0.70     11880
   macro avg       0.63      0.51      0.54     11880
weighted avg       0.72      0.68      0.69     11880
 samples avg       0.68      0.68      0.68     11880