Importing required libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import MinMaxScaler

Loading datasets

In [2]:
train_data = pd.read_csv('/content/ML_Project/train_project.csv')
train_data

Unnamed: 0,LoanID,Age,Income,LoanAmount,CreditScore,MonthsEmployed,NumCreditLines,InterestRate,LoanTerm,DTIRatio,Education,EmploymentType,MaritalStatus,HasMortgage,HasDependents,LoanPurpose,HasCoSigner,Default
0,DRIRC89L0T,18,137576,209136,846,26,2,10.47,60,0.81,High School,Self-employed,Single,Yes,No,Business,No,0
1,TS0FIUNHNU,47,57194,5970,748,30,2,19.72,36,0.73,High School,Unemployed,Divorced,No,Yes,Education,No,0
2,I0YR284A1V,26,84328,95065,453,7,2,24.25,12,0.45,Master's,Self-employed,Married,No,No,Other,Yes,0
3,WB1T7NQV8A,53,49795,229582,533,107,3,14.44,60,0.17,Bachelor's,Self-employed,Single,Yes,No,Auto,Yes,1
4,J6GU9M4G1Z,49,115450,22072,840,0,4,24.48,12,0.11,Bachelor's,Part-time,Single,No,Yes,Education,Yes,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
204272,KYS1HKNGGE,40,116623,161673,651,79,2,23.44,12,0.87,Bachelor's,Part-time,Divorced,No,No,Home,Yes,0
204273,5MAOH3AOZO,67,62958,189499,460,77,3,9.29,36,0.11,Bachelor's,Self-employed,Single,No,No,Business,Yes,0
204274,5Y9Z6NW29X,62,34372,59645,524,94,3,9.72,60,0.24,PhD,Full-time,Single,Yes,No,Auto,No,0
204275,O51974F566,44,146262,198454,489,7,4,4.31,48,0.30,High School,Self-employed,Married,Yes,No,Home,No,0


In [3]:
test_data = pd.read_csv('/content/ML_Project/test_project.csv')
undropped_test_df = test_data
test_data

Unnamed: 0,LoanID,Age,Income,LoanAmount,CreditScore,MonthsEmployed,NumCreditLines,InterestRate,LoanTerm,DTIRatio,Education,EmploymentType,MaritalStatus,HasMortgage,HasDependents,LoanPurpose,HasCoSigner
0,CKV34LU7V7,55,112656,92393,581,113,2,23.54,36,0.15,PhD,Self-employed,Single,Yes,Yes,Home,No
1,62KTYNH93J,56,91569,131575,641,54,1,15.19,12,0.43,High School,Part-time,Divorced,Yes,Yes,Education,Yes
2,JGFUSOIUH7,26,78169,75417,569,105,3,18.02,12,0.29,Master's,Part-time,Married,Yes,Yes,Education,Yes
3,4538THBHOX,26,63033,10804,326,118,1,14.71,24,0.41,High School,Part-time,Single,No,No,Business,Yes
4,DXLNA06JHR,24,29665,21182,662,102,3,15.02,60,0.69,PhD,Unemployed,Single,No,Yes,Business,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51065,DQRTA8KWGC,51,99473,170353,628,24,1,17.03,12,0.46,PhD,Self-employed,Divorced,Yes,Yes,Auto,Yes
51066,W0FDMPACG3,29,42016,111314,371,51,4,7.10,36,0.50,PhD,Self-employed,Married,No,No,Other,No
51067,MA0F4U8ORY,67,88507,142666,731,51,1,22.89,48,0.79,Bachelor's,Part-time,Divorced,No,No,Education,No
51068,6QUH04P7EJ,42,116649,190938,488,6,1,10.83,60,0.32,Bachelor's,Full-time,Married,No,Yes,Other,Yes


Preprocessing using One Hot Encoder

In [4]:
# Dropping the LoanID column since it shouldn't be used for making predictions
train_data = train_data.drop(['LoanID'], axis=1)
test_data = test_data.drop(['LoanID'], axis=1)

# Identify categorical and numerical columns
categorical_columns_train = train_data.select_dtypes(include=['object']).columns
numerical_columns_train = train_data.select_dtypes(include=['int64', 'float64']).columns

categorical_columns_test = test_data.select_dtypes(include=['object']).columns
numerical_columns_test = test_data.select_dtypes(include=['int64', 'float64']).columns

# Initialize the encoder
encoder = OneHotEncoder(sparse_output=False)

# Fit and transform only the categorical columns for train data
categorical_encoded_train = encoder.fit_transform(train_data[categorical_columns_train])

# Transform the categorical columns for test data using the same encoder
categorical_encoded_test = encoder.transform(test_data[categorical_columns_train])

# Create DataFrame with encoded categorical variables
encoded_categorical_traindf = pd.DataFrame(
    categorical_encoded_train,
    columns=encoder.get_feature_names_out(categorical_columns_train)
)
encoded_categorical_testdf = pd.DataFrame(
    categorical_encoded_test,
    columns=encoder.get_feature_names_out(categorical_columns_train)
)

# Combine with numerical columns (ensure we don't include 'Default' in test_df)
encoded_train_df = pd.concat([
    encoded_categorical_traindf,
    train_data[numerical_columns_train].reset_index(drop=True)
], axis=1)

encoded_test_df = pd.concat([
    encoded_categorical_testdf,
    test_data[numerical_columns_test].reset_index(drop=True)
], axis=1)

# Separate features and target variable for training
X = encoded_train_df.drop('Default', axis=1)
y = train_data['Default']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Initialize the scaler
scaler = MinMaxScaler()

# Scale the training and testing features
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

Implementing SVM

In [5]:
svm = LinearSVC()
svm.fit(X_train_scaled, y_train)
y_pred = svm.predict(X_test_scaled)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")
print(classification_report(y_test, y_pred))

Accuracy: 0.8837135304484042
              precision    recall  f1-score   support

           0       0.88      1.00      0.94     36105
           1       0.00      0.00      0.00      4751

    accuracy                           0.88     40856
   macro avg       0.44      0.50      0.47     40856
weighted avg       0.78      0.88      0.83     40856



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Creating Submission File

In [7]:
best_predictions = svm.predict(scaler.transform(encoded_test_df))
# Generate the submission file
def generate_submission(predictions, df):
    submission = pd.DataFrame({'LoanID': df['LoanID'], 'Default': predictions})
    submission.to_csv("LendOrLose_submission.csv", index=False)

# Create the submission file
generate_submission(best_predictions, undropped_test_df)
print("Submission file created successfully!")

Submission file created successfully!
