In [73]:
# Getting data ready
import pandas as pd
import warnings
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder
from scipy import stats
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [74]:
# Silence every warning category for the rest of the session.
# NOTE(review): this blanket filter also hides library deprecation notices.
warnings.simplefilter("ignore")

## Load Dataset

In [75]:
# Relative path of the survey CSV that is loaded with pd.read_csv.
DATA_URL: str = "./Financial_inclusion_dataset.csv"

In [76]:
# Load the raw dataset into a DataFrame.
df = pd.read_csv(DATA_URL)

In [77]:
# Preview the first rows of the freshly loaded data.
df.head()

Unnamed: 0,country,year,uniqueid,bank_account,location_type,cellphone_access,household_size,age_of_respondent,gender_of_respondent,relationship_with_head,marital_status,education_level,job_type
0,Kenya,2018,uniqueid_1,Yes,Rural,Yes,3,24,Female,Spouse,Married/Living together,Secondary education,Self employed
1,Kenya,2018,uniqueid_2,No,Rural,No,5,70,Female,Head of Household,Widowed,No formal education,Government Dependent
2,Kenya,2018,uniqueid_3,Yes,Urban,Yes,5,26,Male,Other relative,Single/Never Married,Vocational/Specialised training,Self employed
3,Kenya,2018,uniqueid_4,No,Rural,Yes,5,34,Female,Head of Household,Married/Living together,Primary education,Formally employed Private
4,Kenya,2018,uniqueid_5,No,Urban,No,8,26,Male,Child,Single/Never Married,Primary education,Informally employed


In [78]:
# (rows, columns) before any column or row is removed.
df.shape

(23524, 13)

## Reduce Data

In [79]:
# Remove the respondent identifier column, then show the new dimensions
df = df.drop(columns=["uniqueid"])
df.shape

(23524, 12)

In [80]:
# Drop fully duplicated rows in place; surviving rows keep their original
# index labels (the index is not reset).
# NOTE(review): this runs after the identifier column was removed, so distinct
# respondents with identical answers would be treated as duplicates -- confirm
# that this is intended.
df.drop_duplicates(inplace=True)
df.shape

(19095, 12)

## Handle NaN Values

In [81]:
# Preview the frame after the identifier column and duplicate rows were dropped.
df.head()

Unnamed: 0,country,year,bank_account,location_type,cellphone_access,household_size,age_of_respondent,gender_of_respondent,relationship_with_head,marital_status,education_level,job_type
0,Kenya,2018,Yes,Rural,Yes,3,24,Female,Spouse,Married/Living together,Secondary education,Self employed
1,Kenya,2018,No,Rural,No,5,70,Female,Head of Household,Widowed,No formal education,Government Dependent
2,Kenya,2018,Yes,Urban,Yes,5,26,Male,Other relative,Single/Never Married,Vocational/Specialised training,Self employed
3,Kenya,2018,No,Rural,Yes,5,34,Female,Head of Household,Married/Living together,Primary education,Formally employed Private
4,Kenya,2018,No,Urban,No,8,26,Male,Child,Single/Never Married,Primary education,Informally employed


In [82]:
# Cast the three numeric columns to integer dtype in a single assignment
numeric_cols = ["year", "household_size", "age_of_respondent"]
df[numeric_cols] = df[numeric_cols].astype(int)

In [83]:
# Column dtypes and non-null counts after the integer cast.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19095 entries, 0 to 23523
Data columns (total 12 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   country                 19095 non-null  object
 1   year                    19095 non-null  int32 
 2   bank_account            19095 non-null  object
 3   location_type           19095 non-null  object
 4   cellphone_access        19095 non-null  object
 5   household_size          19095 non-null  int32 
 6   age_of_respondent       19095 non-null  int32 
 7   gender_of_respondent    19095 non-null  object
 8   relationship_with_head  19095 non-null  object
 9   marital_status          19095 non-null  object
 10  education_level         19095 non-null  object
 11  job_type                19095 non-null  object
dtypes: int32(3), object(9)
memory usage: 1.7+ MB


In [84]:
# Preview the frame again before checking for missing values.
df.head()

Unnamed: 0,country,year,bank_account,location_type,cellphone_access,household_size,age_of_respondent,gender_of_respondent,relationship_with_head,marital_status,education_level,job_type
0,Kenya,2018,Yes,Rural,Yes,3,24,Female,Spouse,Married/Living together,Secondary education,Self employed
1,Kenya,2018,No,Rural,No,5,70,Female,Head of Household,Widowed,No formal education,Government Dependent
2,Kenya,2018,Yes,Urban,Yes,5,26,Male,Other relative,Single/Never Married,Vocational/Specialised training,Self employed
3,Kenya,2018,No,Rural,Yes,5,34,Female,Head of Household,Married/Living together,Primary education,Formally employed Private
4,Kenya,2018,No,Urban,No,8,26,Male,Child,Single/Never Married,Primary education,Informally employed


In [85]:
# Count missing values per column.
df.isna().sum()

country                   0
year                      0
bank_account              0
location_type             0
cellphone_access          0
household_size            0
age_of_respondent         0
gender_of_respondent      0
relationship_with_head    0
marital_status            0
education_level           0
job_type                  0
dtype: int64

## Encode Categorical Data

In [86]:
# Integer code for each country, numbered in order of first appearance.
# unique() is evaluated once and enumerate() supplies the position, instead of
# recomputing unique() and scanning the list with index() for every value.
country_dict = {val: code for code, val in enumerate(df["country"].unique())}
country_dict

{'Kenya': 0, 'Rwanda': 1, 'Tanzania': 2, 'Uganda': 3}

In [87]:
# Integer code for each location type, numbered in order of first appearance.
# unique() is evaluated once and enumerate() supplies the position, instead of
# recomputing unique() and scanning the list with index() for every value.
location_type_dict = {val: code for code, val in enumerate(df["location_type"].unique())}
location_type_dict

{'Rural': 0, 'Urban': 1}

In [88]:
# Integer code for each gender value, numbered in order of first appearance.
# unique() is evaluated once and enumerate() supplies the position, instead of
# recomputing unique() and scanning the list with index() for every value.
gender_dict = {val: code for code, val in enumerate(df["gender_of_respondent"].unique())}
gender_dict

{'Female': 0, 'Male': 1}

In [89]:
# Integer code for each relationship-with-head value, numbered in order of
# first appearance. unique() is evaluated once and enumerate() supplies the
# position, instead of recomputing unique() and calling index() per value.
relationship_with_head_dict = {val: code for code, val in enumerate(df["relationship_with_head"].unique())}
relationship_with_head_dict

{'Spouse': 0,
 'Head of Household': 1,
 'Other relative': 2,
 'Child': 3,
 'Parent': 4,
 'Other non-relatives': 5}

In [90]:
# Integer code for each marital status, numbered in order of first appearance.
# unique() is evaluated once and enumerate() supplies the position, instead of
# recomputing unique() and scanning the list with index() for every value.
marital_status_dict = {val: code for code, val in enumerate(df["marital_status"].unique())}
marital_status_dict

{'Married/Living together': 0,
 'Widowed': 1,
 'Single/Never Married': 2,
 'Divorced/Seperated': 3,
 'Dont know': 4}

In [91]:
# Integer code for each education level, numbered in order of first appearance.
# unique() is evaluated once and enumerate() supplies the position, instead of
# recomputing unique() and scanning the list with index() for every value.
education_level_dict = {val: code for code, val in enumerate(df["education_level"].unique())}
education_level_dict

{'Secondary education': 0,
 'No formal education': 1,
 'Vocational/Specialised training': 2,
 'Primary education': 3,
 'Tertiary education': 4,
 'Other/Dont know/RTA': 5}

In [92]:
# Integer code for each job type, numbered in order of first appearance.
# unique() is evaluated once and enumerate() supplies the position, instead of
# recomputing unique() and scanning the list with index() for every value.
job_type_dict = {val: code for code, val in enumerate(df["job_type"].unique())}
job_type_dict

{'Self employed': 0,
 'Government Dependent': 1,
 'Formally employed Private': 2,
 'Informally employed': 3,
 'Formally employed Government': 4,
 'Farming and Fishing': 5,
 'Remittance Dependent': 6,
 'Other Income': 7,
 'Dont Know/Refuse to answer': 8,
 'No Income': 9}

In [93]:
# Integer code for each cellphone-access answer, numbered in order of first
# appearance. unique() is evaluated once and enumerate() supplies the position,
# instead of recomputing unique() and calling index() for every value.
cellphone_access_dict = {val: code for code, val in enumerate(df["cellphone_access"].unique())}
cellphone_access_dict

{'Yes': 0, 'No': 1}

In [94]:
# Names of all object-dtype columns, leaving out the target column
object_cols = df.select_dtypes(include=[object]).columns
cat_cols = [name for name in object_cols if name != "bank_account"]
cat_cols

['country',
 'location_type',
 'cellphone_access',
 'gender_of_respondent',
 'relationship_with_head',
 'marital_status',
 'education_level',
 'job_type']

In [95]:
# Replace every categorical feature with its integer code.
# NOTE: this cell is not idempotent -- running it a second time maps the
# already-encoded integers through the dictionaries and yields NaN.
feature_encoders = {
    "country": country_dict,
    "location_type": location_type_dict,
    "gender_of_respondent": gender_dict,
    "relationship_with_head": relationship_with_head_dict,
    "marital_status": marital_status_dict,
    "education_level": education_level_dict,
    "job_type": job_type_dict,
    "cellphone_access": cellphone_access_dict,
}
for column, mapping in feature_encoders.items():
    df[column] = df[column].map(mapping)

# The target gets its own explicit mapping. It used to borrow
# cellphone_access_dict, whose codes depend on which answer happens to appear
# first in the cellphone_access column, so the label encoding could silently
# flip with a different row order. The codes below match the ones that were
# produced so far ('Yes' -> 0, 'No' -> 1).
# NOTE(review): 1 therefore encodes "No"; metrics that default to positive
# label 1 (precision / recall / F1) describe the "no bank account" class.
bank_account_dict = {"Yes": 0, "No": 1}
df["bank_account"] = df["bank_account"].map(bank_account_dict)

# Series.map turns unknown labels into NaN; fail loudly instead of training on them.
if df["bank_account"].isna().any():
    raise ValueError("bank_account contains values other than 'Yes'/'No'")

df.head()

Unnamed: 0,country,year,bank_account,location_type,cellphone_access,household_size,age_of_respondent,gender_of_respondent,relationship_with_head,marital_status,education_level,job_type
0,0,2018,0,0,0,3,24,0,0,0,0,0
1,0,2018,1,0,1,5,70,0,1,1,1,1
2,0,2018,0,1,0,5,26,1,2,2,2,0
3,0,2018,1,0,0,5,34,0,1,0,3,2
4,0,2018,1,1,1,8,26,1,3,2,3,3


## Model Training

In [104]:
# Split data into Train and Test Sets
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV

# Split into x (features) & y (target)
x = df.drop(columns=["bank_account"])
y = df["bank_account"]

# Split data into train (80%) and test (20%).
# The global NumPy seed is kept for any later code that draws from the global
# RNG, but the split itself is pinned with an explicit random_state so it does
# not depend on how much of the global random stream was consumed beforehand.
np.random.seed(42)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [97]:
# Preview the training features; the index shows the original row labels.
x_train.head()

Unnamed: 0,country,year,location_type,cellphone_access,household_size,age_of_respondent,gender_of_respondent,relationship_with_head,marital_status,education_level,job_type
7689,1,2016,0,0,5,36,0,0,0,1,5
21472,3,2018,0,0,7,52,1,1,0,3,0
18334,2,2017,1,1,2,78,1,1,2,1,0
5456,0,2018,1,1,3,58,0,1,3,0,6
428,0,2018,1,0,5,37,0,0,0,3,3


In [105]:
# Number of trees in random forest: 10 evenly spaced values from 200 to 2000
n_estimators = [int(count) for count in np.linspace(start=200, stop=2000, num=10)]
# Number of features to consider at every split.
# 'auto' was removed from this list: for classifiers it was only an alias of
# 'sqrt', and recent scikit-learn releases reject it. With warnings silenced,
# those failed fits would go unnoticed during the search.
max_features = ['sqrt', 'log2']
# Maximum number of levels in tree: 10, 20, ..., 110, plus None (unlimited)
max_depth = [int(depth) for depth in np.linspace(10, 110, num=11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]

In [106]:
# Search space for the randomized hyper-parameter search, keyed by the
# RandomForestClassifier argument each list applies to
random_grid = dict(
    n_estimators=n_estimators,
    max_features=max_features,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
    bootstrap=bootstrap,
)

In [107]:
# Randomized search over random_grid: 100 sampled candidates, 3-fold CV each,
# run on all available cores.
# The forest now gets its own fixed random_state. The search's random_state
# only controls which parameter combinations are sampled, so an unseeded
# forest made the fitted trees (and the reported scores) differ between runs.
clf = RandomForestClassifier(random_state=42)
rf_random = RandomizedSearchCV(estimator=clf, param_distributions=random_grid, n_iter=100, cv=3, verbose=2, random_state=42, n_jobs=-1)
rf_random.fit(x_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


In [108]:
def evaluate_preds(y_true, y_preds):
    """
    Score classification predictions against the true labels.

    Prints accuracy (as a percentage), precision, recall and F1 score, and
    returns the same four values rounded to two decimals in a dict with the
    keys "accuracy", "precision", "recall" and "f1".
    """
    # Same four scorers, evaluated in the same order on the same arguments
    accuracy, precision, recall, f1 = (
        scorer(y_true, y_preds)
        for scorer in (accuracy_score, precision_score, recall_score, f1_score)
    )

    print(f"Acc: {accuracy * 100:.2f}%")
    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"F1 score: {f1:.2f}")

    raw_scores = zip(("accuracy", "precision", "recall", "f1"),
                     (accuracy, precision, recall, f1))
    return {label: round(score, 2) for label, score in raw_scores}

In [110]:
# Predict on the held-out test split with the fitted randomized search
y_preds = rf_random.predict(x_test)

# Print and collect accuracy / precision / recall / F1 for those test-set predictions
baseline_metrics = evaluate_preds(y_test, y_preds)

Acc: 86.20%
Precision: 0.88
Recall: 0.97
F1 score: 0.92


## Export Model

In [116]:
import pickle

# Persist the whole fitted RandomizedSearchCV object (not only its best estimator).
filename = "financial_inclusion_model.pickle"
with open(filename, "wb") as f:
    pickle.dump(rf_random, f)

In [117]:
# One hand-encoded respondent for a smoke-test prediction. The integer codes
# follow the mapping dictionaries displayed earlier in this notebook.
pred_data = {
    'country': [3],                 # 'Uganda'
    'year': [2022],
    'location_type': [1],           # 'Urban'
    'cellphone_access': [0],        # 'Yes'
    'household_size': [1],
    'age_of_respondent': [18],
    'gender_of_respondent': [1],    # 'Male'
    'relationship_with_head': [4],  # 'Parent'
    'marital_status': [1],          # 'Widowed'
    'education_level': [3],         # 'Primary education'
    'job_type': [5],                # 'Farming and Fishing'
}

pred_df = pd.DataFrame(pred_data)


In [118]:
# Reload the model that was written to `filename` and score the sample row.
# pickle.load can execute arbitrary code, so only unpickle files you created yourself.
with open(filename, "rb") as f:
    clr = pickle.load(f)

# First (only) predicted label. With the encoding applied earlier in this
# notebook ('Yes' -> 0, 'No' -> 1), a value of 1 means "no bank account".
clr.predict(pred_df)[0]

array([1], dtype=int64)