In [31]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings('ignore')


In [32]:
#Importing data 
scoredf = pd.read_csv('creditscore.csv')
scoredf.head()

Unnamed: 0,ID,Customer_ID,Month,Name,Age,SSN,Occupation,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,...,Credit_Mix,Outstanding_Debt,Credit_Utilization_Ratio,Credit_History_Age,Payment_of_Min_Amount,Total_EMI_per_month,Amount_invested_monthly,Payment_Behaviour,Monthly_Balance,Credit_Score
0,0x1602,CUS_0xd40,January,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,1824.843333,3,...,_,809.98,26.82262,22 Years and 1 Months,No,49.574949,80.41529543900253,High_spent_Small_value_payments,312.49408867943663,Good
1,0x1603,CUS_0xd40,February,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,,3,...,Good,809.98,31.94496,,No,49.574949,118.28022162236736,Low_spent_Large_value_payments,284.62916249607184,Good
2,0x1604,CUS_0xd40,March,Aaron Maashoh,-500,821-00-0265,Scientist,19114.12,,3,...,Good,809.98,28.609352,22 Years and 3 Months,No,49.574949,81.699521264648,Low_spent_Medium_value_payments,331.2098628537912,Good
3,0x1605,CUS_0xd40,April,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,,3,...,Good,809.98,31.377862,22 Years and 4 Months,No,49.574949,199.4580743910713,Low_spent_Small_value_payments,223.45130972736783,Good
4,0x1606,CUS_0xd40,May,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,1824.843333,3,...,Good,809.98,24.797347,22 Years and 5 Months,No,49.574949,41.420153086217326,High_spent_Medium_value_payments,341.48923103222177,Good


In [33]:
#Removing unwanted characters
scoredf = scoredf.replace('_', '', regex=True)

In [34]:
#Checking for duplicates
scoredf['ID'].duplicated().sum()

0

In [35]:
scoredf['Customer_ID'].duplicated().sum()

87500

In [36]:
#Converting datatypes
scoredf['Age'] = scoredf['Age'].astype(int)
scoredf['Annual_Income'] = scoredf['Annual_Income'].astype(float)
scoredf['Monthly_Balance'] = scoredf['Monthly_Balance'].astype(float)
scoredf['Num_of_Delayed_Payment'] = scoredf['Num_of_Delayed_Payment'].astype(float)


In [37]:
#Removing outliers
scoredf['Age'] =  scoredf['Age'][(scoredf['Age'] >= 0) & (scoredf['Age'] <= 100)]

In [38]:
scoredf['Age'].max()

100.0

In [39]:
#Defining functions
def replace_with_mode(df, group_col, target_col , new_col_name):
    mode_df = df.groupby(group_col)[target_col].agg(lambda x: x.mode().iloc[0] if not x.mode().iloc[0] == '' else np.nan ).reset_index()
    df = df.merge(mode_df, on=group_col, suffixes=('', '_mode'))
    df.rename(columns={f'{target_col}_mode': new_col_name}, inplace=True)
    df.drop(columns=[target_col], inplace=True)
    return df

In [40]:
#Filling missing values
user_mean_ages = scoredf.groupby('Customer_ID')['Age'].mean().round()
df_merged = scoredf.merge(user_mean_ages.rename('Mean_Age'), on='Customer_ID')
df_merged['Age'] = df_merged['Age'].fillna(df_merged['Mean_Age'])

scoredf = df_merged.drop(columns=['Mean_Age'])

In [41]:
#Replacing with mode
num_credit = scoredf.groupby('Customer_ID')['Num_Credit_Card'].min().reset_index()
mode_credit = scoredf.merge(num_credit, on='Customer_ID').rename(columns={'Num_Credit_Card_y': 'num_credit_cards'})
mode_credit.drop(columns=['Num_Credit_Card_x'], inplace=True)

In [42]:
#Replacing with mode
mode_credit = replace_with_mode(mode_credit, 'Customer_ID', 'Num_of_Loan' , 'num_of_loan')
mode_credit = replace_with_mode(mode_credit, 'Customer_ID', 'Num_Bank_Accounts' , 'num_bank_accounts')
mode_credit = replace_with_mode(mode_credit, 'Customer_ID', 'Interest_Rate' , 'intrest_rate')
mode_credit = replace_with_mode(mode_credit, 'Customer_ID', 'Occupation' , 'occupation')
mode_credit = replace_with_mode(mode_credit, 'Customer_ID', 'Monthly_Inhand_Salary' , 'monthly_inhand_salary')


In [43]:
#Removing missing values
mode_credit.dropna(subset=['occupation'], inplace=True)

In [44]:
#Filling missing values
name = mode_credit['Name'].groupby(mode_credit['Customer_ID']).transform('first')
mode_credit['Name'] = mode_credit['Name'].fillna(name)


In [45]:
mode_credit['Monthly_Balance'] = mode_credit['Monthly_Balance'].dropna().apply(lambda x: x if x >= 0 else 0)
mode_credit = mode_credit[mode_credit['Delay_from_due_date'] >= 0]
mode_credit = mode_credit[mode_credit['Num_of_Delayed_Payment'] >= 0]


In [46]:
mode_credit.dropna(subset=['Monthly_Balance'], inplace=True)
mode_credit['Monthly_Balance'].isna().sum()

0

In [47]:
mode_credit['Monthly_Balance'].max()

1602.0405189622518

In [48]:
#Final dataframe
scoredf = mode_credit

# scoredf.columns = [col.lower() for col in scoredf.columns]

scoredf

Unnamed: 0,ID,Customer_ID,Month,Name,Age,SSN,Annual_Income,Type_of_Loan,Delay_from_due_date,Num_of_Delayed_Payment,...,Amount_invested_monthly,Payment_Behaviour,Monthly_Balance,Credit_Score,num_credit_cards,num_of_loan,num_bank_accounts,intrest_rate,occupation,monthly_inhand_salary
0,0x1602,CUS0xd40,January,Aaron Maashoh,23.0,821-00-0265,19114.12,"Auto Loan, Credit-Builder Loan, Personal Loan,...",3,7.0,...,80.41529543900253,HighspentSmallvaluepayments,312.494089,Good,4,4,3,3,Scientist,1824.843333
2,0x1604,CUS0xd40,March,Aaron Maashoh,23.0,821-00-0265,19114.12,"Auto Loan, Credit-Builder Loan, Personal Loan,...",3,7.0,...,81.699521264648,LowspentMediumvaluepayments,331.209863,Good,4,4,3,3,Scientist,1824.843333
3,0x1605,CUS0xd40,April,Aaron Maashoh,23.0,821-00-0265,19114.12,"Auto Loan, Credit-Builder Loan, Personal Loan,...",5,4.0,...,199.4580743910713,LowspentSmallvaluepayments,223.451310,Good,4,4,3,3,Scientist,1824.843333
5,0x1607,CUS0xd40,June,Aaron Maashoh,23.0,821-00-0265,19114.12,"Auto Loan, Credit-Builder Loan, Personal Loan,...",8,4.0,...,62.430172331195294,!@9#%8,340.479212,Good,4,4,3,3,Scientist,1824.843333
6,0x1608,CUS0xd40,July,Aaron Maashoh,23.0,821-00-0265,19114.12,"Auto Loan, Credit-Builder Loan, Personal Loan,...",3,8.0,...,178.3440674122349,LowspentSmallvaluepayments,244.565317,Good,4,4,3,3,Scientist,1824.843333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99994,0x25fe8,CUS0x942c,March,Nicks,25.0,078-73-5990,39628.99,"Auto Loan, and Student Loan",20,6.0,...,140.58140274528395,HighspentMediumvaluepayments,410.256158,Poor,6,2,4,7,Mechanic,3359.415833
99995,0x25fe9,CUS0x942c,April,Nicks,25.0,078-73-5990,39628.99,"Auto Loan, and Student Loan",23,7.0,...,60.97133255718485,HighspentLargevaluepayments,479.866228,Poor,6,2,4,7,Mechanic,3359.415833
99996,0x25fea,CUS0x942c,May,Nicks,25.0,078-73-5990,39628.99,"Auto Loan, and Student Loan",18,7.0,...,54.18595028760385,HighspentMediumvaluepayments,496.651610,Poor,6,2,4,7,Mechanic,3359.415833
99997,0x25feb,CUS0x942c,June,Nicks,25.0,078-73-5990,39628.99,"Auto Loan, and Student Loan",27,6.0,...,24.02847744864441,HighspentLargevaluepayments,516.809083,Poor,6,2,4,7,Mechanic,3359.415833


In [49]:
print(scoredf['occupation'].unique())

['Scientist' 'Teacher' 'Engineer' 'Entrepreneur' 'Developer' 'Lawyer'
 'MediaManager' 'Doctor' 'Journalist' 'Manager' 'Accountant' 'Musician'
 'Mechanic' 'Writer' 'Architect']


## Loggistics Reggration

In [50]:
scoredf['Changed_Credit_Limit'].fillna(0, inplace=True)
scoredf['Changed_Credit_Limit'].isnull().sum()

0

In [51]:
scoredf['Credit_Score'] = scoredf['Credit_Score'].map({'Good': 1, 'Standard': 2, 'Poor': 3})
scoredf['Credit_Score']

0        1
2        1
3        1
5        1
6        1
        ..
99994    3
99995    3
99996    3
99997    3
99999    3
Name: Credit_Score, Length: 90571, dtype: int64

In [52]:
scoredf['occupation'] = scoredf['occupation'].map({'Scientist':1, 'Teacher':2, 'Engineer':3, 'Entrepreneur':4, 'Developer':5, 'Lawyer':6,
 'MediaManager':7, 'Doctor':8, 'Journalist':9, 'Manager':10, 'Accountant':11, 'Musician':12,
 'Mechanic':13, 'Writer':14, 'Architect':15})
scoredf['occupation']

0         1
2         1
3         1
5         1
6         1
         ..
99994    13
99995    13
99996    13
99997    13
99999    13
Name: occupation, Length: 90571, dtype: int64

In [53]:
scoredf['Payment_of_Min_Amount'] = scoredf['Payment_of_Min_Amount'].map({'Yes': 1, 'No': 0 , 'NM' : 2})
scoredf['Payment_of_Min_Amount']

0        0
2        0
3        0
5        0
6        0
        ..
99994    0
99995    0
99996    0
99997    0
99999    0
Name: Payment_of_Min_Amount, Length: 90571, dtype: int64

In [54]:
scoredf['Payment_of_Min_Amount'].isnull().sum()

0

In [55]:
scoredf['occupation'].isnull().sum()

0

In [58]:
scoredf.to_csv('credit.csv')

In [None]:


y = scoredf['Credit_Score']
X = scoredf.drop('Credit_Score', axis=1)


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
X_train

Unnamed: 0,ID,Customer_ID,Month,Name,Age,SSN,Annual_Income,Type_of_Loan,Delay_from_due_date,Num_of_Delayed_Payment,...,Total_EMI_per_month,Amount_invested_monthly,Payment_Behaviour,Monthly_Balance,num_credit_cards,num_of_loan,num_bank_accounts,intrest_rate,occupation,monthly_inhand_salary
70536,0x1b34e,CUS0x3813,January,Gerry Shihy,29.0,859-43-6338,4.461670e+04,"Credit-Builder Loan, Credit-Builder Loan, and ...",13,20.0,...,109.920548,97.23616435375564,HighspentMediumvaluepayments,443.649121,6,3,4,6,15,4008.058333
42641,0x10fdb,CUS0x861c,February,Nick Brownw,28.0,654-98-3234,5.853984e+04,"Mortgage Loan, Debt Consolidation Loan, Studen...",17,21.0,...,236.562328,131.35699265097642,HighspentMediumvaluepayments,361.512680,7,8,6,32,8,4794.320000
43931,0x11769,CUS0x470e,April,Lynch Arunab,40.0,402-32-6834,1.572351e+04,"Home Equity Loan, and Student Loan",33,19.0,...,20.898830,95.51756186104114,LowspentSmallvaluepayments,290.912900,7,2,6,8,7,1173.292917
76887,0x1d881,CUS0xa0a6,August,Stellaa,54.0,823-73-2567,3.249566e+04,Not Specified,10,18.0,...,23.965704,,LowspentMediumvaluepayments,254.764697,7,1,3,18,14,2594.971667
50646,0x13ec0,CUS0xa7a3,July,Herbertm,21.0,195-25-1423,5.367294e+04,"Not Specified, Debt Consolidation Loan, Person...",27,20.0,...,220.765778,182.98861245609245,!@9#%8,305.220110,5,6,9,27,3,4589.745000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6907,0x3e79,CUS0x50af,April,Nickl,48.0,325-47-4777,2.731197e+04,"Auto Loan, Credit-Builder Loan, Credit-Builder...",28,22.0,...,88.738054,105.2136368682481,LowspentMediumvaluepayments,320.948059,3,4,5,20,5,2348.997500
60532,0x178ae,CUS0xc180,May,Poornima Guptaq,38.0,430-27-3043,1.574698e+07,"Home Equity Loan, Payday Loan, and Not Specified",43,10.0,...,161.483691,10000,HighspentLargevaluepayments,692.089263,7,3,7,28,14,6780.676667
84749,0x20693,CUS0x658e,June,Kerberp,22.0,148-78-7574,4.688049e+04,"Personal Loan, Home Equity Loan, Payday Loan, ...",44,18.0,...,106.728462,10000,HighspentLargevaluepayments,429.804873,7,5,6,23,8,3790.707500
957,0x1b9b,CUS0x567b,June,Marilynj,54.0,542-44-1751,8.039686e+04,,8,11.0,...,0.000000,422.5870728867617,LowspentMediumvaluepayments,519.786760,7,0,5,9,10,6623.738333


In [None]:
X_train = np.array(X_train[['monthly_inhand_salary'  , 'Age' , 'intrest_rate' , 'num_of_loan' , 'num_bank_accounts' , 'occupation' , 'Monthly_Balance' , 'Delay_from_due_date' , 'Num_of_Delayed_Payment' , 'Amount_invested_monthly' , 'Total_EMI_per_month' , 'Payment_of_Min_Amount']])


In [57]:
clf = LogisticRegression()
clf.fit(X_train, y)

ValueError: Input X contains NaN.
LogisticRegression does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values