## Model Development

In [1]:
# import all required libraries

import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import r2_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import Lasso, Ridge, LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from catboost import CatBoostRegressor
from xgboost import XGBRegressor


In [2]:
raw_df = pd.read_csv('data\StudentsPerformance.csv')
raw_df.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [3]:
raw_df.columns
# math score is our target column

X = raw_df.drop(columns=['math score'], axis=1)
Y = raw_df['math score']

print(X.head())
print(Y.head())


   gender race/ethnicity parental level of education         lunch  \
0  female        group B           bachelor's degree      standard   
1  female        group C                some college      standard   
2  female        group B             master's degree      standard   
3    male        group A          associate's degree  free/reduced   
4    male        group C                some college      standard   

  test preparation course  reading score  writing score  
0                    none             72             74  
1               completed             90             88  
2                    none             95             93  
3                    none             57             44  
4                    none             78             75  
0    72
1    69
2    90
3    47
4    76
Name: math score, dtype: int64


In [4]:
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state=50)
print(f'Total Records - : {raw_df.shape}')
print(f'X Train Records - : {x_train.shape}')
print(f'X Test Records - : {x_test.shape}')
print(f'Y Train Records - : {y_train.shape}')
print(f'Y Test Records - : {y_test.shape}')

Total Records - : (1000, 8)
X Train Records - : (750, 7)
X Test Records - : (250, 7)
Y Train Records - : (750,)
Y Test Records - : (250,)


In [8]:
print(type(x_train))

<class 'pandas.core.frame.DataFrame'>


In [5]:
# now we have to apply a pipeline on these data
numerical_columns = list(raw_df.select_dtypes(exclude='object').columns)
categorical_columns = list(raw_df.select_dtypes(include='object').columns)

target_column = 'math score'

numerical_columns.remove(target_column)


print(numerical_columns, categorical_columns)

['reading score', 'writing score'] ['gender', 'race/ethnicity', 'parental level of education', 'lunch', 'test preparation course']


In [18]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer

# now create a pipeline for both num & cat columns

num_pipeline = Pipeline(
    steps=[('simple_imputer', SimpleImputer(strategy='median')),
           ('standard_scaler', StandardScaler(with_mean=False))]
)

cat_pipeline = Pipeline(
    steps=[('simple_imputer', SimpleImputer(strategy='most_frequent')),
           ('one_hot_encoder', OneHotEncoder()),
           ('standard_scaler', StandardScaler(with_mean=False))]
)

preprocessor = ColumnTransformer([('numerical_pipeline', num_pipeline, numerical_columns),
                                  ('categorical_pipeline', cat_pipeline, categorical_columns)])


In [7]:
preprocessor.get_params

<bound method ColumnTransformer.get_params of ColumnTransformer(transformers=[('numerical_pipeline',
                                 Pipeline(steps=[('simple_imputer',
                                                  SimpleImputer(strategy='median')),
                                                 ('standard_scaler',
                                                  StandardScaler(with_mean=False))]),
                                 ['reading score', 'writing score']),
                                ('categorical_pipeline',
                                 Pipeline(steps=[('simple_imputer',
                                                  SimpleImputer(strategy='most_frequent')),
                                                 ('one_hot_encoder',
                                                  OneHotEncoder()),
                                                 ('standard_scaler',
                                                  StandardScaler(with_mean=False))]),
            

In [20]:
train_arr = preprocessor.fit_transform(x_train) 
print(x_train.shape)
print(type(train_arr),train_arr.shape)
train_arr[0]
type(train_arr)==np.ndarray

(750, 7)
<class 'numpy.ndarray'> (750, 19)


True

In [16]:
# we are not scaling target data here, we will try it later
# we will test if scaling the target improves the accuracy of the model.
train_arr = preprocessor.fit_transform(x_train) 
test_arr = preprocessor.transform(x_test)

type(train_arr)
np.array(y_train)

train_data_arr = np.c_[train_arr, np.array(y_train)]
test_data_arr = np.c_[test_arr, np.array(y_test)]

In [17]:
train_data_arr[0]

array([ 5.45706359,  4.96062705,  2.00445931,  0.        ,  3.4826884 ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  2.5       ,  0.        ,  0.        ,  0.        ,
        0.        ,  2.09202783,  2.0974939 ,  0.        , 68.        ])

In [34]:
test_data_arr[:,-1]

array([77., 72., 62., 40., 55., 68., 75., 63., 79., 69., 71., 63., 62.,
       57., 64., 53., 50., 64., 67., 53., 70., 71., 74., 81., 55., 77.,
       94., 65., 59., 83., 54., 74., 40., 72., 62., 99., 63., 53., 77.,
       59., 74., 81., 58., 75., 89., 79., 45., 61., 87., 54., 64., 55.,
       62., 61., 69., 65., 68., 65., 67., 57., 57., 49., 55., 69., 48.,
       84., 62., 79., 98., 40., 27., 48., 61., 81., 62., 82., 42., 63.,
       73., 77., 90., 82., 71., 72., 90., 59., 75., 59., 65., 75., 51.,
       90., 80., 58., 77., 72., 78., 65., 50., 57., 29., 58., 75., 69.,
       44., 74., 47., 67., 63., 58., 44., 79., 95., 59., 40., 75., 57.,
       68., 60., 71., 71., 67., 37., 80., 77., 71., 88., 65., 50., 71.,
       59., 59., 53., 71., 53., 75., 47., 79., 64., 92., 56., 45., 90.,
       49., 68., 75., 65., 74., 30., 65., 76., 91., 88., 73., 47., 58.,
       68., 37., 90., 73., 76., 81., 76., 63., 52., 51., 71., 75., 63.,
       46., 80., 65., 71., 75., 48., 80., 55., 55., 69., 77., 65

In [35]:
x_train, y_train, x_test, y_test = (
                train_data_arr[:,:-1],
                train_data_arr[:,-1],
                test_data_arr[:,:-1],
                test_data_arr[:,-1]
            )

models = {
                'Linear Regression': LinearRegression(),
                'Lasso Regression': Lasso(),
                'Ridge Regression': Ridge(),
                'KNeighbour regression': KNeighborsRegressor(),
                'Catboost': CatBoostRegressor(verbose=0), # verbose to avoid extra output.
                'XGboost': XGBRegressor(),
                'Decision tree Regression': DecisionTreeRegressor()
            }
# we are not doing hyperparameter tuninig at the moment, will be implementing during modular coding.
training_results = {}

for i in range(len(models.keys())):

    model = list(models.values())[i]
    model.fit(x_train, y_train)

    # will implement hyperparameter tuning part here
    # using gridsearch CV

    train_pred = model.predict(x_train)
    test_pred = model.predict(x_test)

    train_score = r2_score(y_train,train_pred)
    test_score = r2_score(y_test,test_pred)

    training_results[list(models.keys())[i]] = test_score

print(training_results)

{'Linear Regression': 0.8426776580200329, 'Lasso Regression': 0.8166904011754234, 'Ridge Regression': 0.8425534811793729, 'KNeighbour regression': 0.46971869112530307, 'Catboost': 0.8176196218133382, 'XGboost': 0.7723098319125012, 'Decision tree Regression': 0.634743601094715}


In [44]:
results_list = list(training_results.items())
best_model_name = max(results_list, key= lambda x: x[1])[0]
best_model_name
best_model = models.get(best_model_name)

best_model.coef_

array([ 4.31536796e+00,  1.03312316e+01, -7.55925623e+10, -7.55925623e+10,
       -9.03254951e+09, -1.23573706e+10, -1.46250232e+10, -1.37287634e+10,
       -1.11288435e+10, -8.29047695e+09, -6.51038257e+09, -8.13226223e+09,
       -4.62173633e+09, -8.66631936e+09, -7.81078280e+09, -7.92580240e+10,
       -7.92580240e+10, -6.35390464e+10, -6.35390464e+10])