In [1]:
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression

In [2]:
df = pd.read_csv("Credit Score Data/train.csv")
df.head()

Unnamed: 0,ID,Customer_ID,Month,Name,Age,SSN,Occupation,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,...,Credit_Mix,Outstanding_Debt,Credit_Utilization_Ratio,Credit_History_Age,Payment_of_Min_Amount,Total_EMI_per_month,Amount_invested_monthly,Payment_Behaviour,Monthly_Balance,Credit_Score
0,5634,3392,1,Aaron Maashoh,23.0,821000265.0,Scientist,19114.12,1824.843333,3.0,...,Good,809.98,26.82262,265.0,No,49.574949,21.46538,High_spent_Small_value_payments,312.494089,Good
1,5635,3392,2,Aaron Maashoh,23.0,821000265.0,Scientist,19114.12,1824.843333,3.0,...,Good,809.98,31.94496,266.0,No,49.574949,21.46538,Low_spent_Large_value_payments,284.629162,Good
2,5636,3392,3,Aaron Maashoh,23.0,821000265.0,Scientist,19114.12,1824.843333,3.0,...,Good,809.98,28.609352,267.0,No,49.574949,21.46538,Low_spent_Medium_value_payments,331.209863,Good
3,5637,3392,4,Aaron Maashoh,23.0,821000265.0,Scientist,19114.12,1824.843333,3.0,...,Good,809.98,31.377862,268.0,No,49.574949,21.46538,Low_spent_Small_value_payments,223.45131,Good
4,5638,3392,5,Aaron Maashoh,23.0,821000265.0,Scientist,19114.12,1824.843333,3.0,...,Good,809.98,24.797347,269.0,No,49.574949,21.46538,High_spent_Medium_value_payments,341.489231,Good


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 28 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   ID                        100000 non-null  int64  
 1   Customer_ID               100000 non-null  int64  
 2   Month                     100000 non-null  int64  
 3   Name                      100000 non-null  object 
 4   Age                       100000 non-null  float64
 5   SSN                       100000 non-null  float64
 6   Occupation                100000 non-null  object 
 7   Annual_Income             100000 non-null  float64
 8   Monthly_Inhand_Salary     100000 non-null  float64
 9   Num_Bank_Accounts         100000 non-null  float64
 10  Num_Credit_Card           100000 non-null  float64
 11  Interest_Rate             100000 non-null  float64
 12  Num_of_Loan               100000 non-null  float64
 13  Type_of_Loan              100000 non-null  ob

In [4]:
X = df.drop('Credit_Score',axis=1)
y = df['Credit_Score']

In [5]:
y.shape

(100000,)

In [6]:
from sklearn.model_selection import train_test_split

In [7]:
X_train, X_val, y_train, y_val = train_test_split(X,y,test_size=0.2,random_state=13)

In [8]:
X_train.shape, X_val.shape, y_train.shape, y_val.shape

((80000, 27), (20000, 27), (80000,), (20000,))

In [9]:
numeric_features = list(df.select_dtypes(exclude='object').columns)
numeric_features

['ID',
 'Customer_ID',
 'Month',
 'Age',
 'SSN',
 'Annual_Income',
 'Monthly_Inhand_Salary',
 'Num_Bank_Accounts',
 'Num_Credit_Card',
 'Interest_Rate',
 'Num_of_Loan',
 'Delay_from_due_date',
 'Num_of_Delayed_Payment',
 'Changed_Credit_Limit',
 'Num_Credit_Inquiries',
 'Outstanding_Debt',
 'Credit_Utilization_Ratio',
 'Credit_History_Age',
 'Total_EMI_per_month',
 'Amount_invested_monthly',
 'Monthly_Balance']

In [10]:
numeric_features.remove('ID')
numeric_features.remove('Customer_ID')
numeric_features.remove('Month')
numeric_features.remove('SSN')
numeric_features.remove('Credit_Utilization_Ratio')
numeric_features

['Age',
 'Annual_Income',
 'Monthly_Inhand_Salary',
 'Num_Bank_Accounts',
 'Num_Credit_Card',
 'Interest_Rate',
 'Num_of_Loan',
 'Delay_from_due_date',
 'Num_of_Delayed_Payment',
 'Changed_Credit_Limit',
 'Num_Credit_Inquiries',
 'Outstanding_Debt',
 'Credit_History_Age',
 'Total_EMI_per_month',
 'Amount_invested_monthly',
 'Monthly_Balance']

In [11]:
cat_features = list(df.select_dtypes(include='object').columns)
cat_features

['Name',
 'Occupation',
 'Type_of_Loan',
 'Credit_Mix',
 'Payment_of_Min_Amount',
 'Payment_Behaviour',
 'Credit_Score']

In [12]:
cat_features.remove('Name')
cat_features.remove('Occupation')
cat_features.remove('Payment_Behaviour')
cat_features.remove('Credit_Score')
cat_features

['Type_of_Loan', 'Credit_Mix', 'Payment_of_Min_Amount']

In [13]:
num_transformer = Pipeline([("imputer_n",SimpleImputer(strategy='median')),
                            ('scaler', MinMaxScaler())
                           ])
cat_transformer = Pipeline([("imputer_c",SimpleImputer(strategy='most_frequent')),
                            ('encoder', OneHotEncoder())
                           ])

In [14]:
preprocessor = ColumnTransformer([('num_pipe', num_transformer, numeric_features),
                                  ('cat_pipe', cat_transformer, cat_features)])

### Using Logistic Regression

In [None]:
final_pipe = Pipeline([('preprocess_pipe', preprocessor),
                      ('model', LogisticRegression(solver='lbfgs', max_iter=1000))])

In [None]:
final_pipe.fit(X_train, y_train)

In [None]:
final_pipe.score(X_train, y_train)

In [None]:
final_pipe.predict(X_val)

In [None]:
final_pipe.score(X_val, y_val)

## Using SVM

In [None]:
from sklearn import svm

In [None]:
final_pipe = Pipeline([('preprocess_pipe', preprocessor),
                      ('model', svm.SVC())])

In [None]:
final_pipe.fit(X_train, y_train)

In [None]:
final_pipe.score(X_train, y_train)

In [None]:
final_pipe.predict(X_val)

In [None]:
final_pipe.score(X_val, y_val)

## Using RandomForest

In [15]:
from sklearn.ensemble import RandomForestClassifier

In [16]:
final_pipe = Pipeline([('preprocess_pipe', preprocessor),
                      ('model', RandomForestClassifier())])

In [17]:
final_pipe.fit(X_train, y_train)

Pipeline(steps=[('preprocess_pipe',
                 ColumnTransformer(transformers=[('num_pipe',
                                                  Pipeline(steps=[('imputer_n',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scaler',
                                                                   MinMaxScaler())]),
                                                  ['Age', 'Annual_Income',
                                                   'Monthly_Inhand_Salary',
                                                   'Num_Bank_Accounts',
                                                   'Num_Credit_Card',
                                                   'Interest_Rate',
                                                   'Num_of_Loan',
                                                   'Delay_from_due_date',
                                                   'Num_

In [18]:
final_pipe.score(X_train, y_train)

0.99995

In [19]:
final_pipe.predict(X_val)

array(['Standard', 'Poor', 'Standard', ..., 'Standard', 'Standard',
       'Standard'], dtype=object)

In [20]:
final_pipe.score(X_val, y_val)

0.8142