In [12]:
import pickle
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
# Silence every warning category for the rest of the session.
# NOTE(review): this blanket filter also hides pandas/scikit-learn
# deprecation and chained-assignment warnings.
warnings.filterwarnings(action='ignore')

# Read the financial-inclusion survey CSV into the working DataFrame.
dataset_path = r'C:\Users\shayma\OneDrive\Desktop\dtries\Financial_inclusion_dataset.csv'
df = pd.read_csv(filepath_or_buffer=dataset_path)



In [13]:
# Preview the first five rows to sanity-check column names and value formats.
df.head(n=5)

Unnamed: 0,country,year,uniqueid,bank_account,location_type,cellphone_access,household_size,age_of_respondent,gender_of_respondent,relationship_with_head,marital_status,education_level,job_type
0,Kenya,2018,uniqueid_1,Yes,Rural,Yes,3,24,Female,Spouse,Married/Living together,Secondary education,Self employed
1,Kenya,2018,uniqueid_2,No,Rural,No,5,70,Female,Head of Household,Widowed,No formal education,Government Dependent
2,Kenya,2018,uniqueid_3,Yes,Urban,Yes,5,26,Male,Other relative,Single/Never Married,Vocational/Specialised training,Self employed
3,Kenya,2018,uniqueid_4,No,Rural,Yes,5,34,Female,Head of Household,Married/Living together,Primary education,Formally employed Private
4,Kenya,2018,uniqueid_5,No,Urban,No,8,26,Male,Child,Single/Never Married,Primary education,Informally employed


In [14]:
# Descriptive statistics (count, mean, std, quartiles) for the numeric columns.
numeric_summary = df.describe()
print("Statistical Summary:", numeric_summary, sep="\n")

# Number of null entries in each column.
missing_per_column = df.isnull().sum()
print("\nMissing Values:", missing_per_column, sep="\n")



Statistical Summary:
               year  household_size  age_of_respondent
count  23524.000000    23524.000000       23524.000000
mean    2016.975939        3.797483          38.805220
std        0.847371        2.227613          16.520569
min     2016.000000        1.000000          16.000000
25%     2016.000000        2.000000          26.000000
50%     2017.000000        3.000000          35.000000
75%     2018.000000        5.000000          49.000000
max     2018.000000       21.000000         100.000000

Missing Values:
country                   0
year                      0
uniqueid                  0
bank_account              0
location_type             0
cellphone_access          0
household_size            0
age_of_respondent         0
gender_of_respondent      0
relationship_with_head    0
marital_status            0
education_level           0
job_type                  0
dtype: int64


In [15]:
# Count rows that exactly repeat an earlier row (the first occurrence is not counted).
df.duplicated(keep='first').sum()

np.int64(0)

In [16]:
# End-to-end training cell: drop age outliers, label-encode the categorical
# features, fit a random forest that predicts `bank_account`, report test
# metrics, and pickle the model plus the objects needed to reuse it.
#
# NOTE(review): this cell rebinds `df` to the filtered/encoded frame, so
# running it a second time without reloading the CSV would re-filter and
# re-encode already-encoded columns (the saved encoders would then map the
# strings '0', '1', ... instead of the original categories).

RANDOM_STATE = 42  # single seed shared by the split and the forest

# --- Outlier handling ------------------------------------------------------
# Keep respondents whose age lies inside the 1.5 * IQR fences.
age = df['age_of_respondent']
Q1 = age.quantile(0.25)
Q3 = age.quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
# .copy() detaches the filtered rows from the loaded frame so the column
# assignments in the encoding loop write to an independent DataFrame instead
# of a slice (previously a chained-assignment warning that was only hidden by
# the global warnings filter).
df = df[age.between(lower_bound, upper_bound)].copy()

# --- Categorical encoding --------------------------------------------------
categorical_columns = ['country', 'location_type', 'cellphone_access', 'gender_of_respondent',
                       'relationship_with_head', 'marital_status', 'education_level', 'job_type']

# One fitted encoder per column, kept so the same mapping can be re-applied
# wherever the pickled model is loaded.
label_encoders = {}
for column in categorical_columns:
    if column in df.columns:  # columns missing from the frame are skipped
        le = LabelEncoder()
        df[column] = le.fit_transform(df[column].astype(str))
        label_encoders[column] = le

# --- Features / target -----------------------------------------------------
# 'bank_account' is the target; 'uniqueid' is a row identifier, not a feature.
X = df.drop(columns=['bank_account', 'uniqueid'])
y = df['bank_account']

# --- Train / test split ----------------------------------------------------
# NOTE(review): the split is not stratified although the recorded report
# shows roughly 14% 'Yes' rows in the test set; consider `stratify=y` and/or
# class weighting. Left unchanged here because it would alter the metrics.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)

# --- Model -----------------------------------------------------------------
rf_model = RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE)
rf_model.fit(X_train, y_train)

# --- Evaluation ------------------------------------------------------------
y_pred = rf_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"\nModel Accuracy: {accuracy:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# --- Persist artifacts -----------------------------------------------------
# The output directory is defined once and created if missing, so a fresh
# machine does not fail on the first `open`.
ARTIFACT_DIR = Path(r'C:\Users\shayma\OneDrive\Desktop\dtries\str01')
ARTIFACT_DIR.mkdir(parents=True, exist_ok=True)

# File name -> object; the feature-column list records the column order the
# model was trained on.
artifacts = {
    'rf_model.pkl': rf_model,
    'label_encoders.pkl': label_encoders,
    'feature_columns.pkl': list(X.columns),
}
for filename, artifact in artifacts.items():
    with open(ARTIFACT_DIR / filename, 'wb') as f:
        pickle.dump(artifact, f)


Model Accuracy: 0.8636

Classification Report:
              precision    recall  f1-score   support

          No       0.90      0.95      0.92      4001
         Yes       0.52      0.34      0.41       656

    accuracy                           0.86      4657
   macro avg       0.71      0.65      0.67      4657
weighted avg       0.85      0.86      0.85      4657

