# Model Training

### 1.1. Import Data and Required Packages

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Modelling
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Ridge,Lasso
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import RandomizedSearchCV
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
import warnings

In [2]:
# Load the student-performance dataset from CSV into a pandas DataFrame.
df = pd.read_csv(filepath_or_buffer='data/StudentsPerformance.csv')

In [3]:
# Preview the first five rows of the dataset.
df.head(n=5)

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


### 1.2. Preparing X and Y variables

In [7]:
# Target: the model built in this notebook predicts the math score.
y = df['math score']
# Features: every column except the target.
x = df.drop(columns='math score')

In [8]:
# Preprocessing setup.
# One-hot encoding represents categorical (non-numerical) features in binary
# form, because the models only work with numbers.
# Standard scaling puts the numeric features on the same scale; otherwise a
# model may be biased towards features with large values.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Split the feature columns by dtype: object columns are treated as
# categorical, all remaining columns as numeric.
cat_features = x.select_dtypes(include='object').columns
num_features = x.select_dtypes(exclude='object').columns

# One transformer instance per column group.
cat_trans = OneHotEncoder()
num_trans = StandardScaler()

# ColumnTransformer applies a different transformation to each column group
# in a single step. The same object can be reused for preprocessing alone or
# as part of model training.
preprocessor = ColumnTransformer(
    transformers=[
        ('OneHotEncoder', cat_trans, cat_features),
        ('StandardScaler', num_trans, num_features),
    ],
)



In [10]:
# Apply the transformation without leaking test data into the fit.
# Fitting on the full dataset would let test-set statistics (e.g. the
# scaler's mean and variance) influence the features used for training,
# so the preprocessor is fitted on the training rows only and then applied
# to every row.
from sklearn.model_selection import train_test_split

# test_size and random_state MUST stay identical to the train/test split
# performed after this cell: with the same row count and seed,
# train_test_split selects exactly the same rows as the training set.
x_fit = train_test_split(x, test_size = 0.2, random_state = 42)[0]

# NOTE: with a default OneHotEncoder, a category that occurs only in the
# held-out rows would make transform() raise; consider
# handle_unknown='ignore' if that can happen with your data.
preprocessor.fit(x_fit)
x = preprocessor.transform(x)

In [12]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split
# reproducible between runs.
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [16]:
# Shapes of the train/test feature matrices and target vectors.
tuple(part.shape for part in (x_train, x_test, y_train, y_test))

((800, 19), (200, 19), (800,), (200,))

### 2. Model Selection