In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge, Lasso , ElasticNet, RidgeCV , LassoCV , ElasticNetCV
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import ConfusionMatrixDisplay
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, roc_auc_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [3]:
data = pd.read_csv("/content/drive/MyDrive/final_dataset.csv")
data = data.drop(columns=['index'])
data.head()

Unnamed: 0,sex,age,address,famsize,Pstatus,Medu,Fedu,traveltime,studytime,failures,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,0,18,0,4,0,4,4,2,2,0,4,3,4,1,1,3,4,0,11,11
1,0,17,0,4,1,1,1,1,2,0,5,3,3,1,1,3,2,9,11,11
2,0,15,0,3,1,1,1,1,2,0,4,3,2,2,3,3,6,12,13,12
3,0,15,0,4,1,4,2,1,3,0,3,2,2,1,1,5,0,14,14,14
4,0,16,0,4,1,3,3,1,2,0,4,3,2,1,2,5,0,11,13,13


In [4]:
feat = ['goout', 'freetime','studytime','Pstatus', 'famrel', 'health']
data = data[feat]

In [5]:
x = data.drop(columns=['health'])
y = data.health

In [6]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.25, random_state=21)

In [7]:
def scaler_standard(x_train, x_test):
    scaler = StandardScaler()
    x_train_scaled = scaler.fit_transform(x_train)
    x_test_scaled = scaler.transform(x_test)

    return x_train_scaled, x_test_scaled
x_train_scaled, x_test_scaled = scaler_standard(x_train, x_test)

In [8]:
vif = pd.DataFrame()
vif["vif"] = [variance_inflation_factor(x_train_scaled,i) for i in range(x_train_scaled.shape[1])]
vif["Features"] = x_train.columns
#let's check the values
vif

Unnamed: 0,vif,Features
0,1.10413,goout
1,1.131388,freetime
2,1.00868,studytime
3,1.003076,Pstatus
4,1.028464,famrel


In [9]:
linear_regression = LinearRegression()
linear_regression.fit(x_train_scaled, y_train)

linear_regression_prediction = linear_regression.predict(x_test_scaled)

In [10]:
print('Linear Regression : ')
mse = mean_squared_error(y_test, linear_regression_prediction)
print('mean_squared_error : ',mse)
mae = mean_absolute_error(y_test, linear_regression_prediction)
print('mean_absolute_error : ',mae)
r2 = r2_score(y_test, linear_regression_prediction)
print('r2_score : ',r2)
Adjusted_r2 = 1 - (1-r2)*len(y_test - 1)/(len(y_test)-x_test_scaled.shape[1]-1)
print('Adjusted R2 Score value : ',Adjusted_r2)
rmse = np.sqrt(mse)
print('rmse : ',rmse)
accuracy = linear_regression.score(x_train_scaled, y_train)
print('accuracy : ',accuracy)
print('Intercept is :', linear_regression.intercept_)
print('The regression Coefficient are :', linear_regression.coef_)
# print("Accuracy : {:.5f}".format(accuracy))

Linear Regression : 
mean_squared_error :  2.2550321435863223
mean_absolute_error :  1.3354527925953152
r2_score :  0.010580629424448817
Adjusted R2 Score value :  -0.015007802573194784
rmse :  1.5016764443735282
accuracy :  0.02344907106043337
Intercept is : 3.5322128851540615
The regression Coefficient are : [-0.05237241  0.13802735 -0.0663338   0.0240483   0.13030456]


In [13]:
from sklearn.tree import DecisionTreeRegressor
dtr = DecisionTreeRegressor()
dtr.fit(x_train_scaled, y_train)

dtr_prediction = dtr.predict(x_test_scaled)

In [14]:
print('Decision Tree Regression : ')
dtr_mse = mean_squared_error(y_test, dtr_prediction)
print('mean_squared_error : ',dtr_mse)
dtr_mae = mean_absolute_error(y_test, dtr_prediction)
print('mean_absolute_error : ',dtr_mae)
dtr_r2 = r2_score(y_test, dtr_prediction)
print('r2_score : ',dtr_r2)
dtr_Adjusted_r2 = 1 - (1-dtr_r2)*len(y_test - 1)/(len(y_test)-x_test_scaled.shape[1]-1)
print('Adjusted R2 Score value : ',dtr_Adjusted_r2)
dtr_rmse = np.sqrt(dtr_mse)
print('rmse : ',dtr_rmse)
dtr_accuracy = dtr.score(x_train_scaled, y_train)
print('accuracy : ',dtr_accuracy)

Decision Tree Regression : 
mean_squared_error :  2.1379057765346396
mean_absolute_error :  1.096558792171743
r2_score :  0.061971070441300125
Adjusted R2 Score value :  0.037711701573402645
rmse :  1.4621579177826995
accuracy :  0.48541812839908427


In [15]:
param = {'splitter' : ['best', 'random'],
'criterion' : ['squared_error', 'absolute_error'],
'max_depth': [1,2,3,4,5,6,7,8,9,11,10],
'min_samples_split': [0.5,1.5,2.5,1,2,3,4,5,6,7,9],
'min_samples_leaf': [0.5,1,1.25,1.5,1.75,2,2.5,3,4,5]
}

In [16]:
grid_dt= GridSearchCV(estimator=dtr, param_grid=param, cv=2)
grid_dt.fit(x_train_scaled,y_train)

5456 fits failed out of a total of 9680.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
528 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/tree/_classes.py", line 1247, in fit
    super().fit(
  File "/usr/local/lib/python3.10/dist-packages/sklearn/tree/_classes.py", line 177, in fit
    self._validate_params()
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 600, in _validate_params
    validate_parameter_constraints(
  File "/usr/local/lib/python3.10/dist-packages/skle

In [17]:
grid_dt.best_params_

{'criterion': 'squared_error',
 'max_depth': 5,
 'min_samples_leaf': 3,
 'min_samples_split': 3,
 'splitter': 'random'}

In [21]:
dtr_hyper_tune = DecisionTreeRegressor(criterion= 'squared_error',
 max_depth = 5,
 min_samples_leaf= 3,
 min_samples_split= 3,
 splitter= 'random')
dtr_hyper_tune.fit(x_train_scaled, y_train)

dtr_hyper_tune_prediction = dtr_hyper_tune.predict(x_test_scaled)

In [22]:
accuracy = dtr_hyper_tune.score(x_train_scaled, y_train)
print('accuracy : ',accuracy)

accuracy :  0.08127070108398526


In [27]:
score = dtr_hyper_tune.score(x_test_scaled, y_test)
score
# y_pred = dtr_hyper_tune.predict(x_test)

-0.028091000929077348

**Random Forest Regressor**

In [11]:
rf = RandomForestRegressor()
rf.fit(x_train_scaled, y_train)

rf_prediction = rf.predict(x_test_scaled)

In [12]:
print('Random_Forest Regression : ')
rf_mse = mean_squared_error(y_test, rf_prediction)
print('mean_squared_error : ',rf_mse)
rf_mae = mean_absolute_error(y_test, rf_prediction)
print('mean_absolute_error : ',rf_mae)
rf_r2 = r2_score(y_test, rf_prediction)
print('r2_score : ',rf_r2)
rf_Adjusted_r2 = 1 - (1-rf_r2)*len(y_test - 1)/(len(y_test)-x_test_scaled.shape[1]-1)
print('Adjusted R2 Score value : ',rf_Adjusted_r2)
rf_rmse = np.sqrt(rf_mse)
print('rmse : ',rf_rmse)
rf_accuracy = rf.score(x_train_scaled, y_train)
print('accuracy : ',rf_accuracy)

Random_Forest Regression : 
mean_squared_error :  1.8594107134542295
mean_absolute_error :  1.1471421147578786
r2_score :  0.18416374552361436
Adjusted R2 Score value :  0.16306453204577676
rmse :  1.3636021096545097
accuracy :  0.4589441519408821


In [26]:
score = rf.score(x_test_scaled, y_test)
score

0.18416374552361436