In [1]:
import pandas as pd

In [65]:
df = pd.read_csv('creditdata/train.csv')

  df = pd.read_csv('creditdata/train.csv')


In [92]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 71831 entries, 0 to 99998
Data columns (total 39 columns):
 #   Column                                              Non-Null Count  Dtype  
---  ------                                              --------------  -----  
 0   ID                                                  71831 non-null  object 
 1   Month                                               71831 non-null  int64  
 2   Age                                                 68259 non-null  float64
 3   Annual_Income                                       71831 non-null  float64
 4   Num_Bank_Accounts                                   71831 non-null  int64  
 5   Num_Credit_Card                                     71831 non-null  int64  
 6   Interest_Rate                                       71831 non-null  int64  
 7   Num_of_Loan                                         71831 non-null  float64
 8   Delay_from_due_date                                 71831 non-null  int64  


## Droping Irrelevant Columns

1. SSN: Social Security Numbers may not be relevant in India.
2. Customer_ID: Likely not needed as a feature for modeling.
3. Name: This does not provide meaningful information for a credit score.
4. Monthly_Inhand_Salary: Since this data is missing for ~15% of the dataset, it may not be reliable.
5. Changed_Credit_Limit: Credit limit changes might not be as relevant in rural areas, where credit access can be limited.
6. Amount_invested_monthly: Many rural customers might not have formal investment portfolios, so this feature may not be reliable.
7. Credit_History_Age: Rural customers may have limited or no credit history, making this less predictive.
8. Num_Credit_Inquiries: Credit inquiries may be less frequent or tracked in rural settings.

In [67]:
df = df.drop(columns=['SSN', 'Customer_ID', 'Name', 'Monthly_Inhand_Salary', 'Changed_Credit_Limit', 'Amount_invested_monthly', 'Credit_History_Age', 'Num_Credit_Inquiries'])

# Cleaning Data

In [95]:
print(df.isnull().sum())

ID                                                    0
Month                                                 0
Age                                                   0
Annual_Income                                         0
Num_Bank_Accounts                                     0
Num_Credit_Card                                       0
Interest_Rate                                         0
Num_of_Loan                                           0
Delay_from_due_date                                   0
Num_of_Delayed_Payment                                0
Credit_Mix                                            0
Outstanding_Debt                                      0
Credit_Utilization_Ratio                              0
Payment_of_Min_Amount                                 0
Total_EMI_per_month                                   0
Monthly_Balance                                       0
Credit_Score                                          0
Occupation_Architect                            

In [69]:
# Fill missing values for 'Type_of_Loan' with 'Unknown'
df['Type_of_Loan'].fillna('Unknown', inplace=True)

In [70]:
# Convert 'Num_of_Delayed_Payment' to numeric, forcing errors to NaN
df['Num_of_Delayed_Payment'] = pd.to_numeric(df['Num_of_Delayed_Payment'], errors='coerce')

# Now fill the missing values with the median
df['Num_of_Delayed_Payment'].fillna(df['Num_of_Delayed_Payment'].median(), inplace=True)

In [71]:
# Convert 'Monthly_Balance' to numeric, forcing errors to NaN (if necessary)
df['Monthly_Balance'] = pd.to_numeric(df['Monthly_Balance'], errors='coerce')

# Fill missing values with the median
df['Monthly_Balance'].fillna(df['Monthly_Balance'].median(), inplace=True)

In [72]:
# Check data types of all columns
print(df.dtypes)

ID                           object
Month                        object
Age                          object
Occupation                   object
Annual_Income                object
Num_Bank_Accounts             int64
Num_Credit_Card               int64
Interest_Rate                 int64
Num_of_Loan                  object
Type_of_Loan                 object
Delay_from_due_date           int64
Num_of_Delayed_Payment      float64
Credit_Mix                   object
Outstanding_Debt             object
Credit_Utilization_Ratio    float64
Payment_of_Min_Amount        object
Total_EMI_per_month         float64
Payment_Behaviour            object
Monthly_Balance             float64
Credit_Score                 object
dtype: object


In [73]:
df['Age'] = pd.to_numeric(df['Age'], errors='coerce')
df['Num_of_Loan'] = pd.to_numeric(df['Num_of_Loan'], errors='coerce')
df['Annual_Income'] = pd.to_numeric(df['Annual_Income'], errors='coerce')
df['Outstanding_Debt'] = pd.to_numeric(df['Outstanding_Debt'], errors='coerce')

In [94]:
df['Age'].fillna(df['Age'].median(), inplace=True)

In [74]:
# Check if conversion worked
print(df['Month'].head())

0     January
1    February
2       March
3       April
4         May
Name: Month, dtype: object


In [75]:
# Mapping month names to numerical values
month_mapping = {
    'January': 1,
    'February': 2,
    'March': 3,
    'April': 4,
    'May': 5,
    'June': 6,
    'July': 7,
    'August': 8,
    'September': 9,
    'October': 10,
    'November': 11,
    'December': 12
}

# Apply the mapping to the 'Month' column
df['Month'] = df['Month'].map(month_mapping)

# Check the result
print(df['Month'].head())

0    1
1    2
2    3
3    4
4    5
Name: Month, dtype: int64


## Handling Outliers

In [76]:
# Calculate IQR for 'Annual_Income'
Q1 = df['Annual_Income'].quantile(0.25)
Q3 = df['Annual_Income'].quantile(0.75)
IQR = Q3 - Q1

# Define bounds for outliers
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Filter out rows with outliers
df = df[(df['Annual_Income'] >= lower_bound) & (df['Annual_Income'] <= upper_bound)]

# Check the result
print(df['Annual_Income'].describe())

count     90458.000000
mean      48370.116233
std       35191.331168
min        7005.930000
25%       19197.300000
50%       36379.240000
75%       69857.985000
max      152947.120000
Name: Annual_Income, dtype: float64


In [77]:
# List of numeric columns to check for outliers
numeric_columns = ['Outstanding_Debt', 'Total_EMI_per_month', 'Num_of_Loan']

for col in numeric_columns:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    
    # Define bounds for outliers
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    
    # Filter out rows with outliers
    df = df[(df[col] >= lower_bound) & (df[col] <= upper_bound)]

# Check the updated dataframe
print(df.describe())

              Month           Age  Annual_Income  Num_Bank_Accounts  \
count  71831.000000  68259.000000   71831.000000       71831.000000   
mean       4.479514    112.616607   47564.686601          17.084796   
std        2.290930    695.045554   34736.213504         117.724122   
min        1.000000   -500.000000    7005.930000          -1.000000   
25%        2.000000     25.000000   19204.590000           3.000000   
50%        4.000000     33.000000   35618.390000           5.000000   
75%        6.000000     42.000000   68289.320000           7.000000   
max        8.000000   8698.000000  152947.120000        1798.000000   

       Num_Credit_Card  Interest_Rate   Num_of_Loan  Delay_from_due_date  \
count     71831.000000   71831.000000  71831.000000         71831.000000   
mean         22.799126      73.663363      3.307165            20.283708   
std         130.371672     476.283251      2.310514            14.288484   
min           0.000000       1.000000      0.000000     

In [78]:
df.head()

Unnamed: 0,ID,Month,Age,Occupation,Annual_Income,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Num_of_Loan,Type_of_Loan,Delay_from_due_date,Num_of_Delayed_Payment,Credit_Mix,Outstanding_Debt,Credit_Utilization_Ratio,Payment_of_Min_Amount,Total_EMI_per_month,Payment_Behaviour,Monthly_Balance,Credit_Score
0,0x1602,1,23.0,Scientist,19114.12,3,4,3,4.0,"Auto Loan, Credit-Builder Loan, Personal Loan,...",3,7.0,_,809.98,26.82262,No,49.574949,High_spent_Small_value_payments,312.494089,Good
1,0x1603,2,23.0,Scientist,19114.12,3,4,3,4.0,"Auto Loan, Credit-Builder Loan, Personal Loan,...",-1,14.0,Good,809.98,31.94496,No,49.574949,Low_spent_Large_value_payments,284.629162,Good
2,0x1604,3,-500.0,Scientist,19114.12,3,4,3,4.0,"Auto Loan, Credit-Builder Loan, Personal Loan,...",3,7.0,Good,809.98,28.609352,No,49.574949,Low_spent_Medium_value_payments,331.209863,Good
3,0x1605,4,23.0,Scientist,19114.12,3,4,3,4.0,"Auto Loan, Credit-Builder Loan, Personal Loan,...",5,4.0,Good,809.98,31.377862,No,49.574949,Low_spent_Small_value_payments,223.45131,Good
4,0x1606,5,23.0,Scientist,19114.12,3,4,3,4.0,"Auto Loan, Credit-Builder Loan, Personal Loan,...",6,14.0,Good,809.98,24.797347,No,49.574949,High_spent_Medium_value_payments,341.489231,Good


## Encoding Categorical Variables

In [79]:
# Apply One-Hot Encoding for 'Occupation' and 'Payment_Behaviour'
df = pd.get_dummies(df, columns=['Occupation', 'Payment_Behaviour'], drop_first=True)

# Check the updated dataframe
print(df.head())

       ID  Month    Age  Annual_Income  Num_Bank_Accounts  Num_Credit_Card  \
0  0x1602      1   23.0       19114.12                  3                4   
1  0x1603      2   23.0       19114.12                  3                4   
2  0x1604      3 -500.0       19114.12                  3                4   
3  0x1605      4   23.0       19114.12                  3                4   
4  0x1606      5   23.0       19114.12                  3                4   

   Interest_Rate  Num_of_Loan  \
0              3          4.0   
1              3          4.0   
2              3          4.0   
3              3          4.0   
4              3          4.0   

                                        Type_of_Loan  Delay_from_due_date  \
0  Auto Loan, Credit-Builder Loan, Personal Loan,...                    3   
1  Auto Loan, Credit-Builder Loan, Personal Loan,...                   -1   
2  Auto Loan, Credit-Builder Loan, Personal Loan,...                    3   
3  Auto Loan, Credit-Bui

In [80]:
# Map 'Credit_Mix' to numerical values
credit_mix_mapping = {
    'Bad': 1,
    'Standard': 2,
    'Good': 3
}

df['Credit_Mix'] = df['Credit_Mix'].map(credit_mix_mapping)

# Check the updated dataframe
print(df['Credit_Mix'].head())

0    NaN
1    3.0
2    3.0
3    3.0
4    3.0
Name: Credit_Mix, dtype: float64


In [81]:
# Fill NaN values in 'Credit_Mix' with the most frequent value (mode)
df['Credit_Mix'].fillna(df['Credit_Mix'].mode()[0], inplace=True)

# Check the updated column
print(df['Credit_Mix'].head())

0    2.0
1    3.0
2    3.0
3    3.0
4    3.0
Name: Credit_Mix, dtype: float64


## Fixing Loan Column

In [82]:
# Count the number of loans in the 'Type_of_Loan' column
df['Num_of_Loans'] = df['Type_of_Loan'].apply(lambda x: len(str(x).split(',')))

# Drop the original 'Type_of_Loan' column
df = df.drop(columns=['Type_of_Loan'])

# Check the updated dataframe
df["Num_of_Loans"].head()

0    4
1    4
2    4
3    4
4    4
Name: Num_of_Loans, dtype: int64

## Fixing Payment of Min Amount Column

In [87]:
df['Payment_of_Min_Amount'].unique()

array(['No', 'NM', 'Yes'], dtype=object)

In [88]:
# Encode 'Yes', 'No', and 'NM' as numerical values
df['Payment_of_Min_Amount'] = df['Payment_of_Min_Amount'].replace({'Yes': 1, 'No': 0, 'NM': 2})

# Check the unique values after encoding
print(df['Payment_of_Min_Amount'].unique())

[0 2 1]


## Feature Scaling

In [83]:
from sklearn.preprocessing import MinMaxScaler

# List of numeric columns to scale
numeric_columns_to_scale = ['Age', 'Annual_Income', 'Outstanding_Debt', 'Total_EMI_per_month', 'Credit_Utilization_Ratio']

# Initialize the scaler
scaler = MinMaxScaler()

# Apply the scaler to the numeric columns
df[numeric_columns_to_scale] = scaler.fit_transform(df[numeric_columns_to_scale])

# Check the scaled data
print(df[numeric_columns_to_scale].head())

       Age  Annual_Income  Outstanding_Debt  Total_EMI_per_month  \
0  0.05686       0.082966          0.198885             0.146104   
1  0.05686       0.082966          0.198885             0.146104   
2  0.00000       0.082966          0.198885             0.146104   
3  0.05686       0.082966          0.198885             0.146104   
4  0.05686       0.082966          0.198885             0.146104   

   Credit_Utilization_Ratio  
0                  0.240765  
1                  0.421528  
2                  0.303817  
3                  0.401516  
4                  0.169294  


In [96]:
from sklearn.model_selection import train_test_split

# Define features (X) and target (y)
X = df.drop(columns=['Credit_Score', 'ID'])  # Drop the target and any irrelevant columns
y = df['Credit_Score']  # Target variable

# Split the dataset into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Check the shapes of the resulting datasets
print(f"Training data shape: {X_train.shape}, Testing data shape: {X_test.shape}")

Training data shape: (57464, 37), Testing data shape: (14367, 37)


In [97]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Initialize the model
model = RandomForestClassifier(random_state=42)

# Train the model
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.7845757639033897
Classification Report:
               precision    recall  f1-score   support

        Good       0.70      0.69      0.69      2569
        Poor       0.79      0.79      0.79      4116
    Standard       0.81      0.82      0.81      7682

    accuracy                           0.78     14367
   macro avg       0.77      0.76      0.77     14367
weighted avg       0.78      0.78      0.78     14367



In [98]:
import joblib

# Save the model to a file
joblib.dump(model, 'credit_score_model.pkl')

['credit_score_model.pkl']

In [99]:
# List the columns used for training
required_columns = X_train.columns
print(required_columns)

Index(['Month', 'Age', 'Annual_Income', 'Num_Bank_Accounts', 'Num_Credit_Card',
       'Interest_Rate', 'Num_of_Loan', 'Delay_from_due_date',
       'Num_of_Delayed_Payment', 'Credit_Mix', 'Outstanding_Debt',
       'Credit_Utilization_Ratio', 'Payment_of_Min_Amount',
       'Total_EMI_per_month', 'Monthly_Balance', 'Occupation_Architect',
       'Occupation_Developer', 'Occupation_Doctor', 'Occupation_Engineer',
       'Occupation_Entrepreneur', 'Occupation_Journalist', 'Occupation_Lawyer',
       'Occupation_Manager', 'Occupation_Mechanic', 'Occupation_Media_Manager',
       'Occupation_Musician', 'Occupation_Scientist', 'Occupation_Teacher',
       'Occupation_Writer', 'Occupation________',
       'Payment_Behaviour_High_spent_Large_value_payments',
       'Payment_Behaviour_High_spent_Medium_value_payments',
       'Payment_Behaviour_High_spent_Small_value_payments',
       'Payment_Behaviour_Low_spent_Large_value_payments',
       'Payment_Behaviour_Low_spent_Medium_value_payments