# Task 1

Create a new variable from trainData called “y” which takes the value = 1 if the column “loan status”
has the value “Charged Off” and 0 otherwise. All other variables provided to you other than the loan
status are features or “predictors”. Consider whether you would like to transform your variables; for
example, consider converting some of the categorical variables into a continuous variable. Identify
also the 10 most correlated and the 10 least correlated variables.

## Cleaning data

In [194]:
import pandas as pd

# Load the trainData from the CSV file
trainData = pd.read_csv('trainData.csv')
# Load the testData from the CSV file
testData = pd.read_csv('testData.csv')

#Dropping the columns which are not required
trainData = trainData.drop('id', axis=1)
trainData = trainData.drop('member_id', axis=1)
testData = testData.drop('id', axis=1)
testData = testData.drop('member_id', axis=1)
trainData = trainData.drop('mths_since_last_delinq', axis=1)
testData = testData.drop('mths_since_last_delinq', axis=1)

#Clean Data by assigning numerical values to the columns and dropping the id and member_id columns
trainData['loan_status'] = trainData['loan_status'].map({'Charged Off': 1}).fillna(0)
testData['loan_status'] = testData['loan_status'].map({'Charged Off': 1}).fillna(0)

trainData['grade'] = trainData['grade'].map({'A': 7, 'B': 6, 'C': 5, 'D': 4, 'E': 3})
testData['grade'] = testData['grade'].map({'A': 7, 'B': 6, 'C': 5, 'D': 4, 'E': 3})

trainData['emp_length'] = trainData['emp_length'].map({'< 1 year': 1, '1 year': 2, '2 years': 3, '3 years': 4, '4 years': 5, '5 years': 6, '6 years': 7, '7 years': 8, '8 years': 9, '9 years': 10, '10+ years': 11})
testData['emp_length'] = testData['emp_length'].map({'< 1 year': 1, '1 year': 2, '2 years': 3, '3 years': 4, '4 years': 5, '5 years': 6, '6 years': 7, '7 years': 8, '8 years': 9, '9 years': 10, '10+ years': 11})

trainData['home_ownership'] = trainData['home_ownership'].map({'RENT': 1, 'MORTGAGE': 2, 'OWN': 3})
testData['home_ownership'] = testData['home_ownership'].map({'RENT': 1, 'MORTGAGE': 2, 'OWN': 3})

trainData['application_type'] = trainData['application_type'].map({'INDIVIDUAL': 1, 'JOINT': 2})
testData['application_type'] = testData['application_type'].map({'INDIVIDUAL': 1, 'JOINT': 2})

#Fill the missing values with the mean of the column
trainData = trainData.fillna(trainData.mean())
testData = testData.fillna(testData.mean())

## Getting correlations

In [195]:
# Get absolute values of correlations
y = trainData['loan_status']
trainData = trainData.drop('loan_status', axis=1)
y_test = testData['loan_status']
testData = testData.drop('loan_status', axis=1)
correlations = trainData.corrwith(y).abs()

# Identify the 10 variables with the largest absolute correlations
print("Variables with largest absolute correlations")
top_10_correlations = correlations.nlargest(10)
print(top_10_correlations)
print("Variables with smallest absolute correlations")
print(correlations.nsmallest(10))

Variables with largest absolute correlations
recoveries                 0.516998
collection_recovery_fee    0.492596
total_rec_prncp            0.216278
int_rate                   0.198602
grade                      0.191265
last_pymnt_amnt            0.174919
total_pymnt                0.132429
total_pymnt_inv            0.132253
total_rec_late_fee         0.101508
inq_last_6mths             0.087960
dtype: float64
Variables with smallest absolute correlations
tot_coll_amt                  0.000444
collections_12_mths_ex_med    0.004086
emp_length                    0.006816
acc_now_delinq                0.010761
total_acc                     0.016801
delinq_2yrs                   0.017148
open_acc                      0.018571
revol_bal                     0.018616
loan_amnt                     0.020112
installment                   0.026384
dtype: float64


## Linear regression

In [175]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# Define preprocessing steps
preprocessor = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),  # Fill missing values with mean
    ('scaler', StandardScaler())  # Scale features
])

# Fit and transform training data
trainData_preprocessed = preprocessor.fit_transform(trainData)

# Transform test data using the same preprocessing steps
testData_preprocessed = preprocessor.transform(testData)




In [170]:
from sklearn.linear_model import LinearRegression

# Create a Linear Regression model
model = LinearRegression()

# Fit the model on the training data
model.fit(trainData_preprocessed, y)


In [171]:
from sklearn.metrics import mean_squared_error

mse_train = mean_squared_error(y, model.predict(trainData_preprocessed))
print("Mean Squared Error (Training Data):", mse_train)


mse_test = mean_squared_error(y_test, model.predict(testData_preprocessed))
print("Mean Squared Error (Testing Data):", mse_test)


Mean Squared Error (Training Data): 0.06887227730199531
Mean Squared Error (Testing Data): 0.06966226926366734


In [172]:
top_10_variables = top_10_correlations.index

trainData_top_10 = trainData[top_10_variables]
testData_top_10 = testData[top_10_variables]

# New model
model_top_10 = LinearRegression()
model_top_10.fit(trainData_top_10, y)

# Calculate the MSE for the training data with the top 10 variables
mse_train_top_10 = mean_squared_error(y, model_top_10.predict(trainData_top_10))
print("Mean Squared Error (Training Data - Top 10 variables):", mse_train_top_10)

# Calculate the MSE for the testing data with the top 10 variables
mse_test_top_10 = mean_squared_error(y_test, model_top_10.predict(testData_top_10))
print("Mean Squared Error (Testing Data - Top 10 variables):", mse_test_top_10)


Mean Squared Error (Training Data - Top 10 variables): 0.0695415320421812
Mean Squared Error (Testing Data - Top 10 variables): 0.07034186956735192


## Ridge Regression Model

Training Data

In [176]:
import numpy as np
from sklearn.linear_model import Ridge


# Define the range of lambda values
lambda_values = np.arange(0.01, 100, 0.01)
best_mse = 100000

# Fit Ridge regression models for each lambda value and compute MSE
for alpha in lambda_values:
    ridge_model = Ridge(alpha=alpha)
    ridge_model.fit(trainData_preprocessed, y)
    y_pred = ridge_model.predict(trainData_preprocessed)
    mse = mean_squared_error(y_test, y_pred)
        
    if  mse < best_mse:
        best_alpha = alpha
        best_mse = mse

print("Best alpha (lambda):", best_alpha)
print("Corresponding MSE (lowest):", best_mse)

Best alpha (lambda): 99.99000000000001
Corresponding MSE (lowest): 0.13629419381377905


Testing data

In [177]:
import numpy as np
from sklearn.linear_model import Ridge


# Define the range of lambda values
lambda_values = np.arange(0.01, 100, 0.01)
best_mse = 100000

# Fit Ridge regression models for each lambda value and compute MSE
for alpha in lambda_values:
    ridge_model = Ridge(alpha=alpha)
    ridge_model.fit(trainData_preprocessed, y)
    y_pred = ridge_model.predict(testData_preprocessed)
    mse = mean_squared_error(y_test, y_pred)
        
    if  mse < best_mse:
        best_alpha = alpha
        best_mse = mse

print("Best alpha (lambda):", best_alpha)
print("Corresponding MSE (lowest):", best_mse)

Best alpha (lambda): 0.01
Corresponding MSE (lowest): 0.06966228502827404


Top 10 correlated values

In [178]:
# Define the range of lambda values
lambda_values = np.arange(0.01, 100, 0.01)

best_mse = 1000000

# Fit Ridge regression models for each lambda value and compute MSE
for alpha in lambda_values:
    ridge_model_10 = Ridge(alpha=alpha)
    ridge_model_10.fit(trainData_top_10, y)
    y_pred = ridge_model_10.predict(testData_top_10)
    mse = mean_squared_error(y_test, y_pred)
    
    if mse < best_mse:
        best_alpha = alpha
        best_mse = mse

print("Best lambda for top 10 correlated variables:", best_alpha)
print("Corresponding MSE (lowest):", best_mse)

Best lambda for top 10 correlated variables: 0.01
Corresponding MSE (lowest): 0.07034186956757262


## Lasso Regression 

Training data

In [179]:
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error

lambda_values = np.arange(0.01, 100, 0.01)

best_mse = 1000000

# Loop over the alpha values
for alpha in lambda_values:
    model = Lasso(alpha=alpha)
    model.fit(trainData_preprocessed, y)
    y_pred = model.predict(trainData_preprocessed)
    mse = mean_squared_error(y, y_pred)

    if mse < best_mse:
        best_alpha = alpha
        best_mse = mse

print("Best lambda:", best_alpha)
print("Corresponding MSE (lowest):", best_mse)

Best lambda: 0.01
Corresponding MSE (lowest): 0.07000299607801244


Testing data

In [205]:
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error

lambda_values = np.arange(0.01, 100, 0.01)

best_mse = 1000000

# Loop over the alpha values
for alpha in lambda_values:
    model = Lasso(alpha=alpha)
    model.fit(trainData_preprocessed, y)
    y_pred = model.predict(testData_preprocessed)
    mse = mean_squared_error(y, y_pred)

    if mse < best_mse:
        best_alpha = alpha
        best_mse = mse

print("Best lambda:", best_alpha)
print("Corresponding MSE (lowest):", best_mse)

Best lambda: 0.17
Corresponding MSE (lowest): 0.10213002321863836


Top 10 correlated values

In [198]:
lambda_values = np.arange(0.01, 100, 0.01)

# Scale the features
scaler = StandardScaler()
trainData_scaled = scaler.fit_transform(trainData_top_10)
testData_scaled = scaler.transform(testData_top_10)

best_mse_top10 = 10000000

# Loop over the alpha values
for alpha in lambda_values:
    model = Lasso(alpha=alpha)
    model.fit(trainData_scaled, y)
    y_pred = model.predict(testData_scaled)
    mse = mean_squared_error(y, y_pred)

    if mse < best_mse_top10:
        best_alpha = alpha
        best_mse_top10 = mse  # Update best_mse_top10, not best_mse

print("Best lambda:", best_alpha)
print("Corresponding MSE (lowest):", best_mse_top10)

Best lambda: 0.17
Corresponding MSE (lowest): 0.10213002321863836


## Random Forest

Training Data

In [204]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Create a Random Forest model
model = RandomForestRegressor(n_estimators=100, random_state=42)

# Fit the model on the training data
model.fit(trainData_preprocessed, y)

# Predict on the training data
y_pred = model.predict(trainData_preprocessed)

# Calculate the MSE
mse = mean_squared_error(y, y_pred)

# Print the MSE
print("MSE:", mse)

MSE: 0.0027171232422246497


Test Data

In [201]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Create a Random Forest model
model = RandomForestRegressor(n_estimators=100, random_state=42)

# Fit the model on the training data
model.fit(trainData, y)

# Predict on the training data
y_pred = model.predict(testData)

# Calculate the MSE
mse = mean_squared_error(y_test, y_pred)

# Print the MSE
print("MSE:", mse)

MSE: 0.18177283106335726


Top 10 correlated values

In [202]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Create a Random Forest model
model = RandomForestRegressor(n_estimators=100, random_state=42)

# Fit the model on the training data
model.fit(trainData_top_10, y)

# Predict on the training data
y_pred = model.predict(testData_top_10)

# Calculate the MSE
mse = mean_squared_error(y, y_pred)

# Print the MSE
print("MSE:", mse)

MSE: 0.18506714525395274


## Neural Network

In [206]:
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error, r2_score

model = MLPRegressor(hidden_layer_sizes=(10,10), max_iter=1000, random_state=42)

model.fit(trainData_preprocessed, y)

y_pred = model.predict(trainData_preprocessed)

# Calculate the MSE and R2 score for the training data
mse_train = mean_squared_error(y, y_pred)
r2_train = r2_score(y, y_pred)

# Print the MSE and R2 score for the training data
print("Training MSE:", mse_train)
print("Training R2 Score:", r2_train)

Training MSE: 0.03293088873685503
Training R2 Score: 0.6775591770271403


In [208]:
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error, r2_score

model = MLPRegressor(hidden_layer_sizes=(10,10), max_iter=1000, random_state=42)

model.fit(trainData_preprocessed, y)

y_pred = model.predict(testData_preprocessed)

# Calculate the MSE and R2 score for the training data
mse_train = mean_squared_error(y_test, y_pred)
r2_train = r2_score(y_test, y_pred)

# Print the MSE and R2 score for the training data
print("Training MSE:", mse_train)
print("Training R2 Score:", r2_train)

Training MSE: 0.03395356936560214
Training R2 Score: 0.6686707437737622
