In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
import pandas as pd

In [9]:
data = pd.read_csv('C:/Users/chowd/OneDrive/Desktop/Datasets/Credit.csv')

In [10]:
data.head(3)

Unnamed: 0.1,Unnamed: 0,Income,Limit,Rating,Cards,Age,Education,Gender,Student,Married,Ethnicity,Balance
0,1,14.891,3606,283,2,34,11,Male,No,Yes,Caucasian,333
1,2,106.025,6645,483,3,82,15,Female,Yes,Yes,Asian,903
2,3,104.593,7075,514,4,71,11,Male,No,No,Asian,580


In [13]:
# Creating the KNN MODEL
# Preprocessing
# Remove non-numeric columns if any
numeric_data = data.select_dtypes(include=['float64','int64'])

# Handle missing values if any
numeric_data.fillna(0,inplace= True) # Filling missing values with 0, you might choose another strategy

# Split the dataset into features(X) and target variable(y)
X = numeric_data.drop(columns=['Balance']) # Features
y = numeric_data['Balance'] # Target variable

# Feature scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Train/Test Split
X_train,X_test,y_train,y_test = train_test_split(X_scaled,y, test_size = 0.2, random_state= 42)

# Model training:
k=5 # choosing the number of neighbors
knn_regressor = KNeighborsRegressor(n_neighbors=k)
knn_regressor.fit(X_train,y_train)

# Model evaluation
y_pred = knn_regressor.predict(X_test)
mse = mean_squared_error(y_test,y_pred)
mae = mean_absolute_error(y_test,y_pred)
rmse = mean_squared_error(y_test, y_pred, squared= False)

print(f'Mean Squared Error: {mse}')
print(f'Mean Absolute Error: {mae}')
print(f'Root Mean Squared Error: {rmse}')


Mean Squared Error: 60491.098
Mean Absolute Error: 182.37
Root Mean Squared Error: 245.9493809709632


In [17]:
# Creating an OLS regression model using the given dataset
import statsmodels.api as sm
import pandas as pd
import numpy as np

In [25]:
data = pd.read_csv('C:/Users/chowd/OneDrive/Desktop/Datasets/Credit.csv')

In [21]:
# Preprocessing
# Remove non-numeric columns if any
numeric_data = data.select_dtypes(include=['float64','int64'])

# Handle missing values if any
numeric_data.fillna(0,inplace= True) # Filling missing values with 0, you might choose another strategy


# Split the dataset into features (X) and target variable(y)
X = numeric_data.drop(columns=['Balance']) # Features
X = sm.add_constant(X) ## Add a constant (intercept) to the model
y = numeric_data['Balance']  # Target variable

# Fit the OLS regression model
model = sm.OLS(y,X)
results = model.fit()

# Print the summary of the regression
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:                Balance   R-squared:                       0.878
Model:                            OLS   Adj. R-squared:                  0.876
Method:                 Least Squares   F-statistic:                     403.9
Date:                Sat, 02 Mar 2024   Prob (F-statistic):          6.82e-175
Time:                        11:49:46   Log-Likelihood:                -2598.2
No. Observations:                 400   AIC:                             5212.
Df Residuals:                     392   BIC:                             5244.
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       -478.2006     56.743     -8.427      0.0

In [24]:
# Model evaluation for OLS
X_test = sm.add_constant(X_test)
y_pred = results.predict(X_test)
mse = mean_squared_error(y_test,y_pred)
mae = mean_absolute_error(y_test,y_pred)
rmse = mean_squared_error(y_test, y_pred, squared= False)

print(f'Mean Squared Error: {mse}')
print(f'Mean Absolute Error: {mae}')
print(f'Root Mean Squared Error: {rmse}')

Mean Squared Error: 1171693.270570617
Mean Absolute Error: 1002.1311360413029
Root Mean Squared Error: 1082.447814248159
