# Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split

from sklearn.linear_model import Lasso, LinearRegression
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR

from sklearn.metrics import mean_squared_error

In [None]:
# Read the students-performance CSV into a DataFrame and preview its first rows.
df = pd.read_csv(
    filepath_or_buffer='../input/students-performance-in-exams/StudentsPerformance.csv'
)
df.head(n=5)

# Data Exploration

In [None]:
# Print a concise summary of the DataFrame: column names, non-null counts and dtypes.
df.info()

In [None]:
# Descriptive statistics (count, mean, std, quartiles, ...) of the DataFrame's columns.
df.describe()

In [None]:
# Box plot of the DataFrame passed in wide form, to eyeball spread and outliers.
# NOTE(review): presumably this shows the three numeric score columns — confirm against the data.
sns.boxplot(data=df)

# Data Preparation

In [None]:
# Target variable: the unweighted mean of the three exam scores.
# Vectorized column arithmetic replaces the row-wise `apply`, which invoked a
# Python lambda once per row; the resulting values are the same.
df['final score'] = (df['math score'] + df['reading score'] + df['writing score']) / 3
df.head()

In [None]:
# Columns the target is computed from. 'final score' is the mean of these three,
# so leaving them in the feature matrix leaks the target into the inputs and
# makes the reported error meaningless. They are removed together with the target.
score_columns = ['math score', 'reading score', 'writing score']

X = df.drop(columns=score_columns + ['final score'])
y = df['final score']

# Hold out 20% of the rows for evaluation; the seed is fixed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Working copies that the encoding cells modify, leaving the raw splits untouched.
X_train_c = X_train.copy()
X_test_c = X_test.copy()

# Feature Engineering

In [None]:
# Ordinal-encode the parental level of education column.
# Categories are listed from the highest to the lowest level, so
# "master's degree" maps to 0 and "some high school" maps to 5.
education = [['master\'s degree', 'bachelor\'s degree', 'associate\'s degree', 'some college', 'high school', 'some high school']]
education_col = 'parental level of education'

ordinal_enc = OrdinalEncoder(categories=education, dtype=np.int64)

# Replace the whole column instead of writing through `.loc[:, [col]] = ...`:
# `.loc` sets values into the existing object-dtype column, which leaves the
# integer codes stored as Python objects (an `object` column is rejected by
# XGBoost). Plain column assignment gives the column an int64 dtype.
# The encoder is fitted on the training split only and reused for the test split.
X_train_c[education_col] = ordinal_enc.fit_transform(X_train[[education_col]]).ravel()
X_test_c[education_col] = ordinal_enc.transform(X_test[[education_col]]).ravel()

In [None]:
# One-hot encode the remaining categorical columns.
category = ['gender', 'race/ethnicity', 'lunch', 'test preparation course']

# The `sparse=False` argument and the `get_feature_names()` method have been
# removed from recent scikit-learn releases. Densifying the encoder's default
# sparse output with `.toarray()` and using `get_feature_names_out()`
# (scikit-learn >= 1.0) avoids both removed APIs.
# `drop='first'` drops one level per feature to avoid redundant columns.
onehot_enc = OneHotEncoder(dtype=np.int64, drop='first')

# Fit on the training split only; the test split reuses the learned categories.
# The original row index is attached directly so that the concat below aligns
# the encoded columns with the right rows.
onehot_train = pd.DataFrame(
    onehot_enc.fit_transform(X_train[category]).toarray(),
    columns=onehot_enc.get_feature_names_out(category),
    index=X_train_c.index,
)
onehot_test = pd.DataFrame(
    onehot_enc.transform(X_test[category]).toarray(),
    columns=onehot_enc.get_feature_names_out(category),
    index=X_test_c.index,
)

# Swap the raw categorical columns for their encoded counterparts.
X_train_c = pd.concat([X_train_c.drop(columns=category), onehot_train], axis=1)
X_test_c = pd.concat([X_test_c.drop(columns=category), onehot_test], axis=1)

# Modelling

In [None]:
def score(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training split and return its RMSE on the test split.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``
        Fitted in place by this call.
    X_train, y_train : training features and target passed to ``model.fit``.
    X_test, y_test : held-out features and target used for evaluation.

    Returns
    -------
    float
        Root mean squared error between ``y_test`` and the model's predictions.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # Take the square root explicitly instead of passing `squared=False`, which
    # newer scikit-learn releases deprecate and then remove.
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    return rmse

In [None]:
# Candidate regressors; a seed is fixed wherever one is passed to the estimator.
lasso, linreg = Lasso(), LinearRegression()
rf = RandomForestRegressor(random_state=42, n_estimators=100)
svr = SVR(C=0.5)
xgb = XGBRegressor(random_state=42, n_estimators=1000)

models = [lasso, linreg, rf, svr, xgb]

# Evaluate every candidate on the same split; results keep the order of `models`.
scores = [
    score(model, X_train_c, X_test_c, y_train, y_test)
    for model in models
]

In [None]:
# Smallest value in `scores`, i.e. the best result among the evaluated models.
np.asarray(scores).min()