# Financial Inclusion in Africa Zindi Competition


Remember to read the competition details on the Zindi platform, [Zindi Africa](https://zindi.africa).

In [None]:
# import important modules 

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns  
import warnings
np.random.seed(123)
warnings.filterwarnings('ignore')
%matplotlib inline 

In [None]:

from google.colab import drive
drive.mount('/content/drive')

## Load Dataset

In [None]:
# Import data
train_data = pd.read_csv('Train.csv')
test_data = pd.read_csv('Test.csv')

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# print shape 
print('train data shape :', train_data.shape)
print('test data shape :', test_data.shape)

The above output shows the number of rows and columns in the train and test datasets.

In [None]:
# Inspect train Data
train_data.head()

**Note:** Open the VariableDefinition file to understand the meaning of each variable in this dataset 

In [None]:
submission = pd.read_csv('SampleSubmission.csv')
submission.head()

## Exploratory Data Analysis

This is the process of finding insights in your dataset before creating predictive models.

**Note:** This is an important step in your data science workflow.

In [None]:
#show list of columns 
list(train_data.columns)  

In [None]:
## show Some information about the dataset 
print(train_data.info())

The output shows the list of variables, their sizes and the data type of each variable. This will help you decide which feature engineering techniques you can apply.

In [None]:
# Check for missing values
print('missing values:', train_data.isnull().sum())

We don't have missing data in our dataset.

In [None]:
# Explore Target distribution 

sns.catplot(x="bank_account", kind="count", data=train_data)

In [None]:
train_data['bank_account'].value_counts()

The data shows that we have a much larger number of **no** class examples than **yes** class examples in our target variable.

In my previous article, I explained more about exploratory data analysis with the Financial Inclusion in Africa dataset. You can read it and download the notebook [here](https://medium.com/analytics-vidhya/why-you-need-to-explore-your-data-how-you-can-start-13de6f29c8c1).

## Data Preprocessing 

In [None]:
test_data = test_data.drop(['year'],axis=1)
# test_data = test_data.drop(['household_size'],axis=1)


In [None]:
# preprocessing utilities from scikit-learn
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

# Encode the target label as integers
le = LabelEncoder()
train_data['bank_account'] = le.fit_transform(train_data['bank_account'])

# 'year' is removed from the training frame before the features are split off
train_data = train_data.drop(columns=['year'])

# Separate the training features from the target
X_train = train_data.drop(columns=['bank_account'])
y_train = train_data['bank_account']

X_train.head()

In [None]:
# our target 
y_train.head() 

In [None]:
# 'year' may already have been removed from train_data before X_train was
# derived from it, so ignore a missing column instead of raising KeyError.
X_train = X_train.drop(['year'], axis=1, errors='ignore')

In [None]:
# NOTE(review): test_data keeps 'household_size' (its drop is commented out in
# the test-data cell), so removing it here makes the train and test features
# differ — confirm this cell is still wanted.
X_train = X_train.drop(['household_size'],axis=1)

In [None]:
# Repeat of the previous cell; errors='ignore' keeps it from raising KeyError
# once 'household_size' has already been dropped.
X_train = X_train.drop(['household_size'], axis=1, errors='ignore')

I have created a simple preprocessing function that handles:

- conversion of data types
- conversion of categorical features to numerical ones using one-hot encoding and label encoding
- dropping the uniqueid and cellphone_access variables
- scaling our data into the range of 0 to 1

In [None]:
# function to preprocess our data from train models

def preprocessing_data(data):
    """Turn a raw feature frame into a scaled numeric array.

    Steps:
      1. One-hot encode the multi-category columns.
      2. Drop ``uniqueid`` and ``cellphone_access``.
      3. Label-encode ``location_type`` and ``gender_of_respondent``.
      4. Min-max scale every remaining column into the range [0, 1].

    Parameters
    ----------
    data : pandas.DataFrame
        Feature frame containing the columns named above. The caller's frame
        is left untouched; all changes are made on the frame returned by
        ``pd.get_dummies``.

    Returns
    -------
    The array produced by ``MinMaxScaler.fit_transform`` on the encoded frame.

    Notes
    -----
    Each call builds its own dummy columns and fits its own scaler on the
    frame it is given, so calling it separately for train and test data
    relies on both frames containing the same categories and value ranges.
    """
    # categorical features to be converted by One Hot Encoding
    one_hot_columns = [
        "relationship_with_head",
        "marital_status",
        "education_level",
        "job_type",
        "country",
    ]

    # One Hot Encoding conversion
    data = pd.get_dummies(data, prefix_sep="_", columns=one_hot_columns)

    # drop the identifier and the unused cellphone_access feature
    data = data.drop(["uniqueid", "cellphone_access"], axis=1)

    # Label Encoder conversion. A fresh encoder is used per column so that a
    # shared encoder (such as the one fitted on the target) is not refit as a
    # side effect of preprocessing.
    for column in ("location_type", "gender_of_respondent"):
        data[column] = LabelEncoder().fit_transform(data[column])

    # scale our data into range of 0 and 1
    scaler = MinMaxScaler(feature_range=(0, 1))
    return scaler.fit_transform(data)

In [None]:
# preprocess the train data 
processed_train_data = preprocessing_data(X_train)

In [None]:
# shape of the processed train set
processed_train_data.shape

In [None]:
# preprocess the test data
processed_test_data = preprocessing_data(test_data)

In [None]:
# shape of the processed test set
processed_test_data.shape

In [None]:
# shape of the processed train set
processed_train_data.shape 

In [None]:
# shape of the processed test set
processed_test_data.shape

### Model Building and Experiment 

In [None]:
# Split train_data
from sklearn.model_selection import train_test_split

X_Train, X_val, y_Train, y_val = train_test_split(processed_train_data, y_train, stratify = y_train,
                                                  test_size = 0.1, random_state=42)

In [None]:
# import classifier algorithms
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import ExtraTreesClassifier
from xgboost import XGBClassifier
import lightgbm as lgb
from sklearn.svm import SVC

# create models
lg_model = LogisticRegression()
rf_model = RandomForestClassifier()
kn_model = KNeighborsClassifier()
et_model = ExtraTreesClassifier()
xg_model = XGBClassifier()
# NOTE(review): LightGBM documents max_depth <= 0 as "no limit", so -5 should
# behave like the default -1 — confirm and simplify if so.
lgb_model = lgb.LGBMClassifier(learning_rate=0.09, max_depth=-5, random_state=42)
SVC_model = SVC(kernel='linear', random_state=0)


# fitting the models
lg_model.fit(X_Train, y_Train)
rf_model.fit(X_Train, y_Train)
kn_model.fit(X_Train, y_Train)
et_model.fit(X_Train, y_Train)
xg_model.fit(X_Train, y_Train)
# The `verbose` argument of LGBMClassifier.fit() was removed in LightGBM 4.0;
# periodic evaluation logging is requested through the log_evaluation callback.
lgb_model.fit(
    X_Train,
    y_Train,
    eval_set=[(X_val, y_val), (X_Train, y_Train)],
    eval_metric='logloss',
    callbacks=[lgb.log_evaluation(period=20)],
)
SVC_model.fit(X_Train, y_Train)


In [None]:
# One default-configured estimator per candidate algorithm, keyed by name
models = {
    "lg_model": LogisticRegression(),
    "rf_model": RandomForestClassifier(),
    "kn_model": KNeighborsClassifier(),
    "et_model": ExtraTreesClassifier(),
    "xg_model": XGBClassifier(),
    "lgb_model": lgb.LGBMClassifier(),
    "SVC_model": SVC(),
}
def fit_and_score(models, X_Train, X_val, y_Train, y_val):
    """Fit each model on the training split and score it on the validation split.

    Parameters
    ----------
    models : dict
        Maps a model name to an object exposing ``fit(X, y)`` and
        ``score(X, y)``.
    X_Train, y_Train
        Features and target passed to ``fit``.
    X_val, y_val
        Features and target passed to ``score``.

    Returns
    -------
    dict
        Maps each model name to the value returned by its ``score`` call,
        in the iteration order of ``models``.
    """
    def _fit_then_score(estimator):
        # fit must complete before the same estimator is scored
        estimator.fit(X_Train, y_Train)
        return estimator.score(X_val, y_val)

    return {label: _fit_then_score(estimator) for label, estimator in models.items()}

In [None]:
model_scores = fit_and_score(models=models,
                             X_Train=X_Train,
                             X_val=X_val,
                             y_Train=y_Train,
                             y_val=y_val) 

In [None]:
model_scores

In [None]:
# {'SVC_model': 0.8916277093072673,
#  'et_model': 0.8559286017849553,
#  'kn_model': 0.8810029749256268,
#  'lg_model': 0.8861028474288143,
#  'lgb_model': 0.8895027624309392,
#  'rf_model': 0.8661283467913302,
#  'xg_model': 0.8899277518062049}

In [None]:
model_compare = pd.DataFrame(model_scores,index=["accuracy"])
model_compare.T.plot.bar()

In [None]:
# import evaluation metrics
from sklearn.metrics import confusion_matrix, accuracy_score

# predict on the validation split with every fitted model
lg_y_pred = lg_model.predict(X_val)
rf_y_pred = rf_model.predict(X_val)
kn_y_pred = kn_model.predict(X_val)
et_y_pred = et_model.predict(X_val)
xg_y_pred = xg_model.predict(X_val)
lgb_y_pred = lgb_model.predict(X_val)
SVC_y_pred = SVC_model.predict(X_val)

# Get error rate (1 - accuracy) of each model; each model is reported once
print("Error rate of Logistic Regression classifier: ", 1 - accuracy_score(y_val, lg_y_pred))
print("Error rate of Random Forest classifier: ", 1 - accuracy_score(y_val, rf_y_pred))
print("Error rate of KNeighbors Classifier: ", 1 - accuracy_score(y_val, kn_y_pred))
print("Error rate of Extra Tree classifier: ", 1 - accuracy_score(y_val, et_y_pred))
print("Error rate of XGB classifier: ", 1 - accuracy_score(y_val, xg_y_pred))
print("Error rate of lGB classifier: ", 1 - accuracy_score(y_val, lgb_y_pred))
print("Error rate of SVC classifier: ", 1 - accuracy_score(y_val, SVC_y_pred))



In [None]:
# Error rate of Logistic Regression classifier:  0.11219719507012327
# Error rate of Random Forest classifier:  0.13557161070973223
# Error rate of KNeighbors Classifier:  0.11814704632384188
# Error rate of Extra Tree classifier:  0.14109647258818525
# Error rate of XGB classifier:  0.10922226944326396
# Error rate of lGB classifier:  0.10879728006799827

The XGB Classifier performs better than the other classifiers.

let's check the confusion matrix for XGB Classifier

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay

# Row-normalised confusion matrix for the XGB classifier on the validation split.
# plot_confusion_matrix was removed in scikit-learn 1.2;
# ConfusionMatrixDisplay.from_estimator is its replacement.
ConfusionMatrixDisplay.from_estimator(xg_model, X_val, y_val, normalize='true')

In [None]:
# Import GridSearchCV
from sklearn.model_selection import GridSearchCV

In [None]:
# Hyper-parameter search for the XGB model.
# This was run in Google Colab to make execution faster; the best params are
# then used in the next code cell.
param_grid = {
    'min_child_weight': [1, 5, 10],
    'gamma': [0.5, 1],
    'subsample': [0.6, 0.8, 1.0],
    'max_depth': [3, 5],
}
my_xgb_model = GridSearchCV(
    xg_model,
    param_grid,
    n_jobs=-1,
    verbose=2,
    cv=5,
)
my_xgb_model.fit(X_Train, y_Train)
print(my_xgb_model.best_params_)

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score


# fit by setting best parameters and Evaluate model
my_xgb_model = XGBClassifier(min_child_weight=1, gamma=1, subsample=0.8, max_depth=5)

my_xgb_model.fit(X_Train, y_Train)
y_pred = my_xgb_model.predict(X_val)

# Get error rate
print("Error rate of the  XGB classifier: ", 1 - accuracy_score(y_val, y_pred))

In [None]:
# Error rate of the  XGB classifier:  0.10879728006799827

Our model has been improved 

In [None]:
# Get the predicted result for the test Data
# NOTE(review): assigning to `test_data.bank_account` sets a plain attribute on
# the DataFrame rather than creating a 'bank_account' column (see the pandas
# attribute-access documentation) — confirm this is intended.
test_data.bank_account =my_xgb_model.predict(processed_test_data)

In [None]:
# create submission DataFrame
submission = pd.DataFrame({"uniqueid": test_data["uniqueid"] + " x " + test_data["country"],

"bank_account": test_data.bank_account})

In [None]:
# show 50 randomly sampled rows of the submission
submission.sample(50)

In [None]:
# Create submission csv file
submission.to_csv('submission_16.csv', index = False)

Upload your submission file (**submission_16.csv**, created above) to the Zindi competition.

In [None]:
# from sklearn.model_selection import train_test_split

# X_Train, X_val, y_Train, y_val = train_test_split(processed_train_data, y_train, stratify = y_train,
#                                                   test_size = 0.1, random_state=42)

In [None]:
# import lightgbm as lgb
# lgb_train = lgb.Dataset(X_Train, y_Train)
# lgb_test = lgb.Dataset(X_val, y_val)

In [None]:
# params = {
# 'boosting_type': 'gbdt',
# 'objective': 'multiclass',
# 'metric': 'multi_logloss',
# 'num_class':9
# }

In [None]:
# gbm = lgb.train(
# params,
# lgb_train,
# num_boost_round=500,
# valid_sets=[lgb_train, lgb_test],
# early_stopping_rounds=10
# )


In [None]:

# test_data.bank_account = lgm.predict(processed_test_data)


In [None]:
# submission = pd.DataFrame({"uniqueid": test_data["uniqueid"] + " x " + test_data["country"],

# "bank_account": test_data.bank_account})