In [17]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 
import numpy as np
import warnings 
warnings.filterwarnings('ignore')
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error
from sklearn.model_selection import RandomizedSearchCV
from sklearn.svm import SVR 

In [18]:
df=pd.read_csv('D:/Data/stud.csv')
df.head(5)

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [19]:
data = df.drop(columns=['math_score'],axis=1)

In [20]:
data

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,reading_score,writing_score
0,female,group B,bachelor's degree,standard,none,72,74
1,female,group C,some college,standard,completed,90,88
2,female,group B,master's degree,standard,none,95,93
3,male,group A,associate's degree,free/reduced,none,57,44
4,male,group C,some college,standard,none,78,75
...,...,...,...,...,...,...,...
995,female,group E,master's degree,standard,completed,99,95
996,male,group C,high school,free/reduced,none,55,55
997,female,group C,high school,free/reduced,completed,71,65
998,female,group D,some college,standard,completed,78,77


In [21]:
y=df['math_score']

In [22]:
y.head()

0    72
1    69
2    90
3    47
4    76
Name: math_score, dtype: int64

In [23]:
num_colns = data.select_dtypes(exclude="object").columns
cat_colns = data.select_dtypes(include="object").columns

In [24]:
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.compose import ColumnTransformer

In [25]:
transformer=StandardScaler()
one_hot_transformer = OneHotEncoder(sparse_output=False)
preprocessor=ColumnTransformer(
            [
                ("StandardScaler",transformer,num_colns),
                ("OneHotEncoder",one_hot_transformer,cat_colns)
            ]
)


In [26]:
from sklearn import set_config
set_config(transform_output="pandas")
X=preprocessor.fit_transform(data)

In [27]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)
X_train.shape,X_test.shape

((800, 19), (200, 19))

In [28]:
def evaluation_model(true,predicted):
    mae = mean_absolute_error(true,predicted)
    mse = mean_squared_error(true,predicted)
    rmse = np.sqrt(mean_squared_error(true,predicted))
    r2_square = r2_score(true,predicted)
    return mae,rmse,r2_square

In [29]:
models = {
    "LR" : LinearRegression(),
    "DTR" : DecisionTreeRegressor(),
    "Rid" : Ridge(),
    "Lasso" : Lasso(),
    "AdB" : AdaBoostRegressor(),
    "RfR" : RandomForestRegressor() 
}


In [30]:
model_list = []
r2_list = []
for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train,y_train)
    y_train_predict = model.predict(X_train)
    y_test_predict= model.predict(X_test)
    model_train_mae,model_train_rmse,model_train_r2square = evaluation_model(y_train,y_train_predict)
    model_test_mae,model_test_rmse,model_test_r2square = evaluation_model(y_test,y_test_predict)
    #print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])
    print('Model performance for Training set')
    print("- Root Mean Squared Error: {:.4f}".format(model_train_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_train_mae))
    print("- R2 Score: {:.4f}".format(model_train_r2square)) #{:.4f} represents 4 decimal points from the values 

    print('----------------------------------')
    
    print('Model performance for Test set')
    print("- Root Mean Squared Error: {:.4f}".format(model_test_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_test_mae))
    print("- R2 Score: {:.4f}".format(model_test_r2square))
    r2_list.append(model_test_r2square)
    print('='*35)
    print('\n')

Model performance for Training set
- Root Mean Squared Error: 5.3260
- Mean Absolute Error: 4.2689
- R2 Score: 0.8742
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 5.4016
- Mean Absolute Error: 4.2188
- R2 Score: 0.8801


Model performance for Training set
- Root Mean Squared Error: 0.2795
- Mean Absolute Error: 0.0187
- R2 Score: 0.9997
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 7.9505
- Mean Absolute Error: 6.2900
- R2 Score: 0.7402


Model performance for Training set
- Root Mean Squared Error: 5.3233
- Mean Absolute Error: 4.2650
- R2 Score: 0.8743
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 5.3904
- Mean Absolute Error: 4.2111
- R2 Score: 0.8806


Model performance for Training set
- Root Mean Squared Error: 6.5938
- Mean Absolute Error: 5.2063
- R2 Score: 0.8071
----------------------------------
Model performance for Test set
- Root Me

In [31]:
result_val = pd.DataFrame(list(zip(model_list,r2_list)),columns=['Models','R2_score']).sort_values(by='R2_score',ascending=True)

In [32]:
result_val

Unnamed: 0,Models,R2_score
1,DTR,0.740238
3,Lasso,0.82532
4,AdB,0.851714
5,RfR,0.855536
0,LR,0.880095
2,Rid,0.880593
