In [56]:
import pandas as pd
import os
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, confusion_matrix, accuracy_score, classification_report
import pickle

In [57]:
# Load the dataset
df = pd.read_csv("data.csv")
print("Initial shape of the dataset:", df.shape)

Initial shape of the dataset: (100000, 28)


In [58]:
# Option 1: Remove rows with missing values
df_cleaned = df.dropna()

In [59]:
# Option 2: Impute missing values (replace with mean or mode)
# For numerical columns
mean_values = df.mean()
df_imputed = df.fillna(mean_values)

  mean_values = df.mean()


In [None]:
# For categorical columns
mode_values = df.mode().iloc[0]
df_imputed = df.fillna(mode_values)

In [47]:
# Step 1: Remove duplicate data entries based on customer ID
df = df.drop_duplicates(subset=["Customer_ID"])
df

Unnamed: 0,ID,Customer_ID,Month,Name,Age,SSN,Occupation,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,...,Credit_Mix,Outstanding_Debt,Credit_Utilization_Ratio,Credit_History_Age,Payment_of_Min_Amount,Total_EMI_per_month,Amount_invested_monthly,Payment_Behaviour,Monthly_Balance,Credit_Score
0,5634,3392,1,Aaron Maashoh,23,821000265,Scientist,19114.12,1824.843333,3,...,Good,809.98,26.822620,265,No,49.574949,21.465380,High_spent_Small_value_payments,312.494089,Good
8,5646,8625,1,Rick Rothackerj,28,4075839,Teacher,34847.84,3037.986667,2,...,Good,605.03,24.464031,319,No,18.816215,39.684018,Low_spent_Small_value_payments,470.690627,Standard
16,5658,11708,1,Langep,34,486853974,Engineer,143162.64,12187.220000,1,...,Good,1303.01,28.616735,213,No,246.992320,168.413703,High_spent_Small_value_payments,1043.315978,Good
24,5670,47249,1,Jasond,54,72316145,Entrepreneur,30689.89,2612.490833,2,...,Good,632.46,26.544229,207,No,16.415452,29.393311,Low_spent_Large_value_payments,433.604773,Standard
32,5682,7387,1,Deepaa,21,615067821,Developer,35547.71,2853.309167,7,...,Standard,943.86,39.797764,368,Yes,0.000000,37.643638,High_spent_Medium_value_payments,288.605522,Standard
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99960,155574,14124,1,Lucia Mutikanik,18,340857301,Lawyer,42903.79,3468.315833,0,...,Good,1079.48,27.289440,337,No,34.975457,31.193919,High_spent_Small_value_payments,493.341182,Good
99968,155586,3862,1,Maria Sheahanb,44,868702218,Media_Manager,16680.35,1528.029167,1,...,Good,897.16,39.868572,254,NM,41.113561,40.893052,High_spent_Small_value_payments,318.737378,Good
99976,155598,44897,1,Chris Wickhamm,49,133167738,Writer,37188.10,3097.008333,1,...,Good,620.64,39.080823,357,No,84.205949,42.935566,Low_spent_Small_value_payments,291.619866,Good
99984,155610,34304,1,Sarah McBridec,28,31350942,Architect,20002.88,1929.906667,10,...,Bad,3571.70,22.895966,68,Yes,60.964772,34.662906,High_spent_Large_value_payments,328.655224,Poor


In [48]:
# Step 2: Eliminate unwanted variables like customer ID, name, etc.
unwanted_columns = ["ID", "Customer_ID", "Name", "SSN", "Type_of_Loan"]
df.drop(unwanted_columns, axis=1, inplace=True)
df

Unnamed: 0,Month,Age,Occupation,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Num_of_Loan,Delay_from_due_date,...,Credit_Mix,Outstanding_Debt,Credit_Utilization_Ratio,Credit_History_Age,Payment_of_Min_Amount,Total_EMI_per_month,Amount_invested_monthly,Payment_Behaviour,Monthly_Balance,Credit_Score
0,1,23,Scientist,19114.12,1824.843333,3,4,3,4,3,...,Good,809.98,26.822620,265,No,49.574949,21.465380,High_spent_Small_value_payments,312.494089,Good
8,1,28,Teacher,34847.84,3037.986667,2,4,6,1,3,...,Good,605.03,24.464031,319,No,18.816215,39.684018,Low_spent_Small_value_payments,470.690627,Standard
16,1,34,Engineer,143162.64,12187.220000,1,5,8,3,5,...,Good,1303.01,28.616735,213,No,246.992320,168.413703,High_spent_Small_value_payments,1043.315978,Good
24,1,54,Entrepreneur,30689.89,2612.490833,2,5,4,1,0,...,Good,632.46,26.544229,207,No,16.415452,29.393311,Low_spent_Large_value_payments,433.604773,Standard
32,1,21,Developer,35547.71,2853.309167,7,5,5,0,5,...,Standard,943.86,39.797764,368,Yes,0.000000,37.643638,High_spent_Medium_value_payments,288.605522,Standard
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99960,1,18,Lawyer,42903.79,3468.315833,0,4,6,1,14,...,Good,1079.48,27.289440,337,No,34.975457,31.193919,High_spent_Small_value_payments,493.341182,Good
99968,1,44,Media_Manager,16680.35,1528.029167,1,1,5,4,4,...,Good,897.16,39.868572,254,NM,41.113561,40.893052,High_spent_Small_value_payments,318.737378,Good
99976,1,49,Writer,37188.10,3097.008333,1,4,5,3,7,...,Good,620.64,39.080823,357,No,84.205949,42.935566,Low_spent_Small_value_payments,291.619866,Good
99984,1,28,Architect,20002.88,1929.906667,10,8,29,5,35,...,Bad,3571.70,22.895966,68,Yes,60.964772,34.662906,High_spent_Large_value_payments,328.655224,Poor


In [49]:
df1=pd.get_dummies(df)
df1.head()

Unnamed: 0,Month,Age,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Num_of_Loan,Delay_from_due_date,Num_of_Delayed_Payment,...,Payment_of_Min_Amount_Yes,Payment_Behaviour_High_spent_Large_value_payments,Payment_Behaviour_High_spent_Medium_value_payments,Payment_Behaviour_High_spent_Small_value_payments,Payment_Behaviour_Low_spent_Large_value_payments,Payment_Behaviour_Low_spent_Medium_value_payments,Payment_Behaviour_Low_spent_Small_value_payments,Credit_Score_Good,Credit_Score_Poor,Credit_Score_Standard
0,1,23,19114.12,1824.843333,3,4,3,4,3,7,...,0,0,0,1,0,0,0,1,0,0
8,1,28,34847.84,3037.986667,2,4,6,1,3,4,...,0,0,0,0,0,0,1,0,0,1
16,1,34,143162.64,12187.22,1,5,8,3,5,8,...,0,0,0,1,0,0,0,1,0,0
24,1,54,30689.89,2612.490833,2,5,4,1,0,6,...,0,0,0,0,1,0,0,0,0,1
32,1,21,35547.71,2853.309167,7,5,5,0,5,15,...,1,0,1,0,0,0,0,0,0,1


In [50]:
df1.drop(["Occupation_Writer","Credit_Mix_Standard","Payment_of_Min_Amount_NM",
       "Payment_Behaviour_Low_spent_Small_value_payments","Credit_Score_Standard"],axis=1,inplace=True)
df1

Unnamed: 0,Month,Age,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Num_of_Loan,Delay_from_due_date,Num_of_Delayed_Payment,...,Credit_Mix_Good,Payment_of_Min_Amount_No,Payment_of_Min_Amount_Yes,Payment_Behaviour_High_spent_Large_value_payments,Payment_Behaviour_High_spent_Medium_value_payments,Payment_Behaviour_High_spent_Small_value_payments,Payment_Behaviour_Low_spent_Large_value_payments,Payment_Behaviour_Low_spent_Medium_value_payments,Credit_Score_Good,Credit_Score_Poor
0,1,23,19114.12,1824.843333,3,4,3,4,3,7,...,1,1,0,0,0,1,0,0,1,0
8,1,28,34847.84,3037.986667,2,4,6,1,3,4,...,1,1,0,0,0,0,0,0,0,0
16,1,34,143162.64,12187.220000,1,5,8,3,5,8,...,1,1,0,0,0,1,0,0,1,0
24,1,54,30689.89,2612.490833,2,5,4,1,0,6,...,1,1,0,0,0,0,1,0,0,0
32,1,21,35547.71,2853.309167,7,5,5,0,5,15,...,0,0,1,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99960,1,18,42903.79,3468.315833,0,4,6,1,14,0,...,1,1,0,0,0,1,0,0,1,0
99968,1,44,16680.35,1528.029167,1,1,5,4,4,0,...,1,0,0,0,0,1,0,0,1,0
99976,1,49,37188.10,3097.008333,1,4,5,3,7,12,...,1,1,0,0,0,0,0,0,1,0
99984,1,28,20002.88,1929.906667,10,8,29,5,35,25,...,0,0,1,1,0,0,0,0,0,1


In [51]:
df1.head()
df1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12500 entries, 0 to 99992
Data columns (total 43 columns):
 #   Column                                              Non-Null Count  Dtype  
---  ------                                              --------------  -----  
 0   Month                                               12500 non-null  int64  
 1   Age                                                 12500 non-null  int64  
 2   Annual_Income                                       12500 non-null  float64
 3   Monthly_Inhand_Salary                               12500 non-null  float64
 4   Num_Bank_Accounts                                   12500 non-null  int64  
 5   Num_Credit_Card                                     12500 non-null  int64  
 6   Interest_Rate                                       12500 non-null  int64  
 7   Num_of_Loan                                         12500 non-null  int64  
 8   Delay_from_due_date                                 12500 non-null  int64  


In [55]:
# Step 4: Splitting the dataset for regression analysis
dep = df_dummies["Monthly_Balance"]
ind = df_dummies.drop(["Monthly_Balance"], axis=1)
ind

Unnamed: 0,Month,Age,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Num_of_Loan,Delay_from_due_date,Num_of_Delayed_Payment,...,Occupation_Journalist,Occupation_Lawyer,Occupation_Manager,Occupation_Mechanic,Occupation_Media_Manager,Occupation_Musician,Occupation_Scientist,Occupation_Teacher,Occupation_Writer,Credit_Score_Poor
0,1,23,19114.12,1824.843333,3,4,3,4,3,7,...,0,0,0,0,0,0,1,0,0,0
8,1,28,34847.84,3037.986667,2,4,6,1,3,4,...,0,0,0,0,0,0,0,1,0,0
16,1,34,143162.64,12187.220000,1,5,8,3,5,8,...,0,0,0,0,0,0,0,0,0,0
24,1,54,30689.89,2612.490833,2,5,4,1,0,6,...,0,0,0,0,0,0,0,0,0,0
32,1,21,35547.71,2853.309167,7,5,5,0,5,15,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99960,1,18,42903.79,3468.315833,0,4,6,1,14,0,...,0,1,0,0,0,0,0,0,0,0
99968,1,44,16680.35,1528.029167,1,1,5,4,4,0,...,0,0,0,0,1,0,0,0,0,0
99976,1,49,37188.10,3097.008333,1,4,5,3,7,12,...,0,0,0,0,0,0,0,0,1,0
99984,1,28,20002.88,1929.906667,10,8,29,5,35,25,...,0,0,0,0,0,0,0,0,0,1


In [53]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(ind,dep,train_size=0.75)

In [54]:
# Step 6: Linear Regression model
linear_model = LinearRegression()
linear_model.fit(x_train, y_train)
pred_linear = linear_model.predict(x_test)
r2_linear = r2_score(y_test, pred_linear)
plt.figure(figsize=(10, 5))
sns.regplot(y_test, pred_linear, color="blue")
print("Linear Regression - R-Square:", round(r2_linear, 4))
print("Linear Regression - RMSE:", mean_squared_error(y_test, pred_linear, squared=False))


ValueError: could not convert string to float: 'Standard'

In [36]:
# Step 5: Split the dataset for regression analysis
dep = df_encoded["Monthly_Balance"]
ind = df_encoded.drop(["Monthly_Balance"], axis=1)

x_train, x_test, y_train, y_test = train_test_split(ind, dep, train_size=0.75)


In [37]:
# Step 6: Linear Regression model
linear_model = LinearRegression()
linear_model.fit(x_train, y_train)
pred_linear = linear_model.predict(x_test)
r2_linear = r2_score(y_test, pred_linear)

plt.figure(figsize=(10, 5))
sns.regplot(pred_linear, y_test, color="blue")  # Corrected plot arguments

print("Linear Regression - R-Square:", round(r2_linear, 4))
print("Linear Regression - RMSE:", mean_squared_error(y_test, pred_linear, squared=False))





ValueError: could not convert string to float: 'Bad'

In [23]:
df_encoded

Unnamed: 0,Month,Age,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Num_of_Loan,Delay_from_due_date,Num_of_Delayed_Payment,...,Credit_Mix_Good,Payment_of_Min_Amount_No,Payment_of_Min_Amount_Yes,Payment_Behaviour_High_spent_Large_value_payments,Payment_Behaviour_High_spent_Medium_value_payments,Payment_Behaviour_High_spent_Small_value_payments,Payment_Behaviour_Low_spent_Large_value_payments,Payment_Behaviour_Low_spent_Medium_value_payments,Credit_Score_Good,Credit_Score_Poor
0,1,23,19114.12,1824.843333,3,4,3,4,3,7,...,1,1,0,0,0,1,0,0,1,0
8,1,28,34847.84,3037.986667,2,4,6,1,3,4,...,1,1,0,0,0,0,0,0,0,0
16,1,34,143162.64,12187.220000,1,5,8,3,5,8,...,1,1,0,0,0,1,0,0,1,0
24,1,54,30689.89,2612.490833,2,5,4,1,0,6,...,1,1,0,0,0,0,1,0,0,0
32,1,21,35547.71,2853.309167,7,5,5,0,5,15,...,0,0,1,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99960,1,18,42903.79,3468.315833,0,4,6,1,14,0,...,1,1,0,0,0,1,0,0,1,0
99968,1,44,16680.35,1528.029167,1,1,5,4,4,0,...,1,0,0,0,0,1,0,0,1,0
99976,1,49,37188.10,3097.008333,1,4,5,3,7,12,...,1,1,0,0,0,0,0,0,1,0
99984,1,28,20002.88,1929.906667,10,8,29,5,35,25,...,0,0,1,1,0,0,0,0,0,1


In [None]:
# Logistic Regression
df_logistic = df.copy()
df_logistic["Credit_Score"] = df_logistic["Credit_Score"].replace({"Good": 1, "Standard": 2, "Poor": 3})
df_logistic=pd.get_dummies(df_logistic)
dep_var = df_logistic["Credit_Score"]
ind_var = df_logistic.drop(["Credit_Score"], axis=1)
x_train_log, x_test_log, y_train_log, y_test_log = train_test_split(ind_var, dep_var, train_size=0.75, random_state=42)


In [29]:
# Logistic Regression Model
logreg_model = LogisticRegression()
logreg_model.fit(x_train_log, y_train_log)
logreg_pred = logreg_model.predict(x_test_log)

ValueError: could not convert string to float: 'Julienx'

In [15]:
# Logistic Regression Model
logreg_model = LogisticRegression(multi_class='multinomial', solver='lbfgs')
logreg_model.fit(x_train_log, y_train_log)
logreg_pred = logreg_model.predict(x_test_log)


ValueError: could not convert string to float: 'Julienx'