In [None]:
# Swapnil Saha Shawon (2022533042)
# Tamanna Rahman (2021450642)
# Syeda Mashiat Tabassum (2031356642)

## **Credit Dataset**

In [None]:
#include libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

#load data and checking contents
data = pd.read_csv("DT-Credit.csv")
print(data)

#check for null values
print("\nChecking NULL values:\n",data.isnull().sum())

      Income  Limit  Rating  Cards  Age  Education  Own Student Married  \
0     14.891   3606     283      2   34         11   No      No     Yes   
1    106.025   6645     483      3   82         15  Yes     Yes     Yes   
2    104.593   7075     514      4   71         11   No      No      No   
3    148.924   9504     681      3   36         11  Yes      No      No   
4     55.882   4897     357      2   68         16   No      No     Yes   
..       ...    ...     ...    ...  ...        ...  ...     ...     ...   
395   12.096   4100     307      3   32         13   No      No     Yes   
396   13.364   3838     296      5   65         17   No      No      No   
397   57.872   4171     321      5   67         12  Yes      No     Yes   
398   37.728   2525     192      1   44         13   No      No     Yes   
399   18.701   5524     415      5   64          7  Yes      No      No   

    Region  Balance  
0    South      333  
1     West      903  
2     West      580  
3     West 

In [None]:
#preprocessing data
categorical_cols = data.select_dtypes(include=['object']).columns.tolist()
print("Categorical Columns:", categorical_cols)
print("\n")

data = pd.get_dummies(data, columns=categorical_cols, drop_first=True)

print(data)

Categorical Columns: ['Own', 'Student', 'Married', 'Region']


      Income  Limit  Rating  Cards  Age  Education  Balance  Own_Yes  \
0     14.891   3606     283      2   34         11      333    False   
1    106.025   6645     483      3   82         15      903     True   
2    104.593   7075     514      4   71         11      580    False   
3    148.924   9504     681      3   36         11      964     True   
4     55.882   4897     357      2   68         16      331    False   
..       ...    ...     ...    ...  ...        ...      ...      ...   
395   12.096   4100     307      3   32         13      560    False   
396   13.364   3838     296      5   65         17      480    False   
397   57.872   4171     321      5   67         12      138     True   
398   37.728   2525     192      1   44         13        0    False   
399   18.701   5524     415      5   64          7      966     True   

     Student_Yes  Married_Yes  Region_South  Region_West  
0          Fa

In [None]:
#selecting target variable and features
X = data.drop(['Balance'], axis = 1)
y = data['Balance']

#splitting data for training, validating and testing
X_train = X.iloc[:280]
X_validation = X.iloc[280:340]
X_test = X.iloc[340:]
print(X_validation)
y_train = y.iloc[:280]
y_validation = y.iloc[280:340]
y_test = y.iloc[340:]

      Income  Limit  Rating  Cards  Age  Education  Own_Yes  Student_Yes  \
280   53.401   5319     377      3   35         12     True        False   
281   36.142   1852     183      3   33         13     True        False   
282   63.534   8100     581      2   50         17     True        False   
283   49.927   6396     485      3   75         17     True        False   
284   14.711   2047     167      2   67          6    False        False   
285   18.967   1626     156      2   41         11     True        False   
286   18.036   1552     142      2   48         15     True        False   
287   60.449   3098     272      4   69          8    False        False   
288   16.711   5274     387      3   42         16     True        False   
289   10.852   3907     296      2   30          9    False        False   
290   26.370   3235     268      5   78         11    False        False   
291   24.088   3665     287      4   56         13     True        False   
292   51.532

### **Evaluation Metrics**

In [None]:
def mean_sq_error(y_true, y_pred):
  if len(y_true) != len(y_pred):
    raise ValueError("Lengths of y_true and y_pred must be the same.")

  # Calculate the squared differences between true and predicted values
  squared_errors = [(true - pred) ** 2 for true, pred in zip(y_true, y_pred)]

  # Calculate the mean of squared errors
  mse = sum(squared_errors) / len(y_true)

  return mse

### **Support vector regression (SVR)**

In [None]:
#Training the dataset
from sklearn.svm import SVR

#Define the Support Vector Regressors
svr_rbf = SVR(kernel='rbf', C=100, gamma=0.1, epsilon=.1)
svr_lin = SVR(kernel='linear', C=100, epsilon=.1)
svr_poly = SVR(kernel='poly', C=100, degree=3, epsilon=.1, coef0=1)

svr_rbf_trained = svr_rbf.fit(X_train, y_train)
svr_lin_trained = svr_lin.fit(X_train, y_train)
svr_poly_trained = svr_poly.fit(X_train, y_train)

In [None]:
#Validating the model using training set
y_rbf_pred = svr_rbf_trained.predict(X_train)
y_lin_pred = svr_lin_trained.predict(X_train)
y_poly_pred = svr_poly_trained.predict(X_train)

mse = mean_sq_error(y_train, y_rbf_pred)
print('Rbf SVR Mean Squared Error: ', mse)
mse = mean_sq_error(y_train, y_lin_pred)
print('Lin SVR Mean Squared Error: ', mse)
mse = mean_sq_error(y_train, y_poly_pred)
print('Poly SVR Mean Squared Error: ', mse)

Rbf SVR Mean Squared Error:  151477.81262450502
Lin SVR Mean Squared Error:  94105.97791426926
Poly SVR Mean Squared Error:  57587.33518730447


In [None]:
#Validating the model using validation set
y_rbf_pred = svr_rbf_trained.predict(X_validation)
y_lin_pred = svr_lin_trained.predict(X_validation)
y_poly_pred = svr_poly_trained.predict(X_validation)

mse = mean_sq_error(y_validation, y_rbf_pred)
print('Rbf SVR Mean Squared Error: ', mse)
mse = mean_sq_error(y_validation, y_lin_pred)
print('Lin SVR Mean Squared Error: ', mse)
mse = mean_sq_error(y_validation, y_poly_pred)
print('Poly SVR Mean Squared Error: ', mse)

Rbf SVR Mean Squared Error:  225802.8137528277
Lin SVR Mean Squared Error:  91351.20300772093
Poly SVR Mean Squared Error:  38233.0412158933


In [None]:
#Using the best model to evaluate the test set
y_poly_pred = svr_poly_trained.predict(X_test)

mse = mean_sq_error(y_test, y_poly_pred)
print('SVR Mean Squared Error: ', mse)

SVR Mean Squared Error:  49040.79741914224
