In [2]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression

In [3]:
df=pd.read_csv('study.csv')

In [21]:
df.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [22]:
x=df.drop(columns='math score',axis=1)
x

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,74
1,female,group C,some college,standard,completed,90,88
2,female,group B,master's degree,standard,none,95,93
3,male,group A,associate's degree,free/reduced,none,57,44
4,male,group C,some college,standard,none,78,75
...,...,...,...,...,...,...,...
995,female,group E,master's degree,standard,completed,99,95
996,male,group C,high school,free/reduced,none,55,55
997,female,group C,high school,free/reduced,completed,71,65
998,female,group D,some college,standard,completed,78,77


In [23]:
y=df['math score']
y

0      72
1      69
2      90
3      47
4      76
       ..
995    88
996    62
997    59
998    68
999    77
Name: math score, Length: 1000, dtype: int64

In [24]:
num_features=x.select_dtypes(include='int64').columns
cat_features=x.select_dtypes(exclude='int64').columns
len(num_features)
len(cat_features)



5

In [25]:
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.compose import ColumnTransformer
pipeline=ColumnTransformer(
    [
        ('oh',OneHotEncoder(),cat_features),
        ('sc',StandardScaler(),num_features)
    ]
)

In [29]:
x=pipeline.fit_transform(x)
x

array([[ 1.        ,  0.        ,  0.        , ...,  1.        ,
         0.19399858,  0.39149181],
       [ 1.        ,  0.        ,  0.        , ...,  0.        ,
         1.42747598,  1.31326868],
       [ 1.        ,  0.        ,  0.        , ...,  1.        ,
         1.77010859,  1.64247471],
       ...,
       [ 1.        ,  0.        ,  0.        , ...,  0.        ,
         0.12547206, -0.20107904],
       [ 1.        ,  0.        ,  0.        , ...,  0.        ,
         0.60515772,  0.58901542],
       [ 1.        ,  0.        ,  0.        , ...,  1.        ,
         1.15336989,  1.18158627]], shape=(1000, 19))

In [30]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.3,random_state=42)
x_test
x_train

array([[ 0.        ,  1.        ,  0.        , ...,  0.        ,
         0.8792638 ,  0.78653904],
       [ 1.        ,  0.        ,  0.        , ...,  0.        ,
         0.81073728,  0.72069783],
       [ 0.        ,  1.        ,  0.        , ...,  1.        ,
        -0.0801075 , -0.46444386],
       ...,
       [ 1.        ,  0.        ,  0.        , ...,  1.        ,
        -0.49126664, -0.99117351],
       [ 0.        ,  1.        ,  0.        , ...,  0.        ,
        -1.45063795, -0.99117351],
       [ 1.        ,  0.        ,  0.        , ...,  1.        ,
         1.4960025 ,  1.37910989]], shape=(700, 19))

In [31]:
from sklearn.metrics import mean_absolute_error,root_mean_squared_error,r2_score
def evaluate(true,pred):
    mae,rmse,r2=mean_absolute_error(true,pred),root_mean_squared_error(true,pred),r2_score(true,pred)
    return mae,rmse,r2


In [34]:
models={
    "Linear regression":LinearRegression(),
    "kneighbouregression":KNeighborsRegressor(),
    "Decision tree":DecisionTreeRegressor(),
    "Random forest":RandomForestRegressor()
}
model_list=[]
r2_list=[]

for name, model in models.items():
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    mae, rmse, r2 = evaluate(y_test, y_pred)

    model_list.append(name)
    r2_list.append(r2)

    print(f"{name} -> R2: {r2:.3f}")

    

Linear regression -> R2: 0.876
kneighbouregression -> R2: 0.772
Decision tree -> R2: 0.730
Random forest -> R2: 0.847


In [None]:
pd.DataFrame(list(zip(model_list,r2_list)),columns=['model_name','r2_score']).sort_values(by='r2_score',ascending=True)


Unnamed: 0,model_name,r2_score
2,Decision tree,0.730397
1,kneighbouregression,0.77202
3,Random forest,0.846574
0,Linear regression,0.875863


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   gender                       1000 non-null   object
 1   race/ethnicity               1000 non-null   object
 2   parental level of education  1000 non-null   object
 3   lunch                        1000 non-null   object
 4   test preparation course      1000 non-null   object
 5   math score                   1000 non-null   int64 
 6   reading score                1000 non-null   int64 
 7   writing score                1000 non-null   int64 
dtypes: int64(3), object(5)
memory usage: 62.6+ KB
