In [3]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline


In [4]:
%pip install catboost
%pip install xgboost

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [5]:
from sklearn.metrics import mean_squared_error,r2_score,mean_absolute_error
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.svm import SVR
from sklearn.model_selection import RandomizedSearchCV
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
import warnings

In [6]:
df = pd.read_csv('/workspaces/end-to-end-ml-project-/src/data/study2.csv')

In [7]:
df

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,total,average
0,female,group B,bachelor's degree,standard,none,218,72.666667
1,female,group C,some college,standard,completed,247,82.333333
2,female,group B,master's degree,standard,none,278,92.666667
3,male,group A,associate's degree,free/reduced,none,148,49.333333
4,male,group C,some college,standard,none,229,76.333333
...,...,...,...,...,...,...,...
995,female,group E,master's degree,standard,completed,282,94.000000
996,male,group C,high school,free/reduced,none,172,57.333333
997,female,group C,high school,free/reduced,completed,195,65.000000
998,female,group D,some college,standard,completed,223,74.333333


In [8]:
x = df.drop('average',axis = 1)
y = df['average']

In [9]:
df.columns

Index(['gender', 'race_ethnicity', 'parental_level_of_education', 'lunch',
       'test_preparation_course', 'total', 'average'],
      dtype='object')

In [10]:
b = df.columns

In [11]:
v = 0
while v<=4:
  print('unique values in each feature the dataset are ---{}'.format(df[b[v]].unique()))
  v = v + 1

unique values in each feature the dataset are ---['female' 'male']
unique values in each feature the dataset are ---['group B' 'group C' 'group A' 'group D' 'group E']
unique values in each feature the dataset are ---["bachelor's degree" 'some college' "master's degree" "associate's degree"
 'high school' 'some high school']
unique values in each feature the dataset are ---['standard' 'free/reduced']
unique values in each feature the dataset are ---['none' 'completed']


In [12]:
x

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,total
0,female,group B,bachelor's degree,standard,none,218
1,female,group C,some college,standard,completed,247
2,female,group B,master's degree,standard,none,278
3,male,group A,associate's degree,free/reduced,none,148
4,male,group C,some college,standard,none,229
...,...,...,...,...,...,...
995,female,group E,master's degree,standard,completed,282
996,male,group C,high school,free/reduced,none,172
997,female,group C,high school,free/reduced,completed,195
998,female,group D,some college,standard,completed,223


In [13]:
y

0      72.666667
1      82.333333
2      92.666667
3      49.333333
4      76.333333
         ...    
995    94.000000
996    57.333333
997    65.000000
998    74.333333
999    83.000000
Name: average, Length: 1000, dtype: float64

In [14]:
#column transformer
num_features = x.select_dtypes(exclude = 'object').columns
cat_features = x.select_dtypes(include = 'object').columns



In [15]:
from sklearn.preprocessing import OneHotEncoder ,StandardScaler
from sklearn.compose import ColumnTransformer

In [16]:
numeric_transformer = StandardScaler()
oh_transformer =  OneHotEncoder()

In [17]:
preprocessor =  ColumnTransformer(
               [
                   ('standard scaler',numeric_transformer,num_features),
                   ('one hot encoder',oh_transformer,cat_features)
               ]
)

In [18]:
x = preprocessor.fit_transform(x)

In [19]:
x

array([[ 0.34357423,  1.        ,  0.        , ...,  1.        ,
         0.        ,  1.        ],
       [ 1.0219275 ,  1.        ,  0.        , ...,  1.        ,
         1.        ,  0.        ],
       [ 1.74706375,  1.        ,  0.        , ...,  1.        ,
         0.        ,  1.        ],
       ...,
       [-0.19443008,  1.        ,  0.        , ...,  0.        ,
         1.        ,  0.        ],
       [ 0.46053169,  1.        ,  0.        , ...,  1.        ,
         1.        ,  0.        ],
       [ 1.06871048,  1.        ,  0.        , ...,  0.        ,
         0.        ,  1.        ]])

In [20]:
x.shape

(1000, 18)

In [21]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,y,random_state = 42,test_size = 0.2)


In [22]:
x_train.shape,x_test.shape

((800, 18), (200, 18))

In [23]:
y_train.shape,y_test.shape

((800,), (200,))

In [24]:
def evaluate_model(true,predicted):
  mse = mean_squared_error(true,predicted)
  mae = mean_absolute_error(true,predicted)
  r2 = r2_score(true,predicted)
  rmse = np.sqrt(mean_squared_error(true,predicted))
  return mae,rmse,r2

In [25]:
models = {
    'linear regression': LinearRegression(),
    'k nearest neighbors': KNeighborsRegressor(),
    'Ridge':Ridge(),
    'Lasso':Lasso(),
    'decision tree':DecisionTreeRegressor(),
    'xgboost':XGBRegressor(),
    'catboost regressor':CatBoostRegressor(),
    'random forest regressor':RandomForestRegressor(),
    'ada boost regressor':AdaBoostRegressor()
    }

model_list = []
r2_list = []
model_results = []

for model_name,model in models.items():

  model.fit(x_train,y_train)
  y_train_pred = model.predict(x_train)
  y_test_pred = model.predict(x_test)

  model_train_mae,model_train_rmse,model_train_r2 = evaluate_model(y_train,y_train_pred)
  model_test_mae,model_test_rmse,model_test_r2 = evaluate_model(y_test,y_test_pred)

  model_list.append(model_name)
  r2_list.append(model_test_r2)


  model_results.append({
      'model':model_name,
      'train_rmse':model_train_rmse,
      'test rmse': model_test_rmse,
      'train mae':model_train_mae,
      'test mae': model_test_mae,
      'train r2 score':model_train_r2,
      'test r2 score':model_test_r2

  })

  print(f"{model_name}")
  print('model performance for training set')
  print(f" -- root mean squared error : {model_train_rmse:.4f}")
  print(f" -- mean absolute error : {model_train_mae:.4f}")
  print(f" -- r2 score: {model_train_r2}")

  print('------------------------------------------------------')

  print('model performance for test set')
  print(f" -- root mean squared error : {model_test_rmse:.4f}")
  print(f"-- mean absolute error : {model_test_mae:.4f}")
  print(f" -- r2 score : {model_test_r2:.4f}")

  print('='*40)
  print('\n')


linear regression
model performance for training set
 -- root mean squared error : 0.0000
 -- mean absolute error : 0.0000
 -- r2 score: 1.0
------------------------------------------------------
model performance for test set
 -- root mean squared error : 0.0000
-- mean absolute error : 0.0000
 -- r2 score : 1.0000


k nearest neighbors
model performance for training set
 -- root mean squared error : 4.0886
 -- mean absolute error : 3.1097
 -- r2 score: 0.9161849955212298
------------------------------------------------------
model performance for test set
 -- root mean squared error : 5.0881
-- mean absolute error : 3.7117
 -- r2 score : 0.8792


Ridge
model performance for training set
 -- root mean squared error : 0.0208
 -- mean absolute error : 0.0169
 -- r2 score: 0.9999978410605245
------------------------------------------------------
model performance for test set
 -- root mean squared error : 0.0228
-- mean absolute error : 0.0179
 -- r2 score : 1.0000


Lasso
model performa

xgboost
model performance for training set
 -- root mean squared error : 0.0147
 -- mean absolute error : 0.0094
 -- r2 score: 0.9999989107336126
------------------------------------------------------
model performance for test set
 -- root mean squared error : 0.7011
-- mean absolute error : 0.1084
 -- r2 score : 0.9977


Learning rate set to 0.039525
0:	learn: 13.6830839	total: 61.3ms	remaining: 1m 1s
1:	learn: 13.2789868	total: 62.3ms	remaining: 31.1s
2:	learn: 12.8585556	total: 62.9ms	remaining: 20.9s
3:	learn: 12.4356155	total: 65.1ms	remaining: 16.2s
4:	learn: 12.0093603	total: 66ms	remaining: 13.1s
5:	learn: 11.7194691	total: 66.2ms	remaining: 11s
6:	learn: 11.3541415	total: 66.7ms	remaining: 9.46s
7:	learn: 10.9712782	total: 68.2ms	remaining: 8.46s
8:	learn: 10.6272468	total: 68.7ms	remaining: 7.57s
9:	learn: 10.2828425	total: 69.2ms	remaining: 6.85s
10:	learn: 9.9778232	total: 69.8ms	remaining: 6.27s
11:	learn: 9.6733564	total: 70.3ms	remaining: 5.79s
12:	learn: 9.3847351	tota

In [26]:
print(type(model_train_r2), model_train_r2)


<class 'numpy.float64'> 0.9953334678843535


In [27]:
model_list

['linear regression',
 'k nearest neighbors',
 'Ridge',
 'Lasso',
 'decision tree',
 'xgboost',
 'catboost regressor',
 'random forest regressor',
 'ada boost regressor']

In [28]:
r2_list

[1.0,
 0.8792331370496156,
 0.9999975737787185,
 0.9948006919534692,
 0.9978930172102397,
 0.9977071864852981,
 0.9885896763625903,
 0.9964222177888332,
 0.9928169122822277]

In [29]:
data = {
    'model':model_list,
    'r2_score':r2_list
}
d = pd.DataFrame(data)

In [30]:
d

Unnamed: 0,model,r2_score
0,linear regression,1.0
1,k nearest neighbors,0.879233
2,Ridge,0.999998
3,Lasso,0.994801
4,decision tree,0.997893
5,xgboost,0.997707
6,catboost regressor,0.98859
7,random forest regressor,0.996422
8,ada boost regressor,0.992817
