In [16]:
import pandas as pd
import seaborn as sns
import pathlib 
import matplotlib.pyplot as plt
import numpy as np

from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, LabelEncoder, StandardScaler

In [6]:
inputpath = pathlib.Path.cwd() / 'input' / 'student-por.csv'
df_train = pd.read_csv(inputpath)

In [7]:
cont_columns = ['age', 'failures', 'absences', 'G1', 'G2', 'G3']
num_columns = ['traveltime', 'studytime', 'famrel', 'freetime', 'goout', 'Dalc', 'Walc', 'health', 'Medu', 'Fedu']

cat_columns = ['school', 'sex', 'address', 'famsize', 'Pstatus', 'Mjob', 'Fjob', 'reason', 'guardian', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery', 'higher', 'internet', 'romantic']

In [27]:
dummy_df = df_train.copy()
target = dummy_df.pop('G3')
le_df = dummy_df.copy()

le = LabelEncoder()

for col in cat_columns:
    le_df[col] = le.fit_transform(le_df[col])
    dummy_df = pd.get_dummies(dummy_df,columns=[col],drop_first=True)

print(le_df)

     school  sex  age  address  famsize  Pstatus  Medu  Fedu  Mjob  Fjob  ...  \
0         0    0   18        1        0        0     4     4     0     4  ...   
1         0    0   17        1        0        1     1     1     0     2  ...   
2         0    0   15        1        1        1     1     1     0     2  ...   
3         0    0   15        1        0        1     4     2     1     3  ...   
4         0    0   16        1        0        1     3     3     2     2  ...   
..      ...  ...  ...      ...      ...      ...   ...   ...   ...   ...  ...   
644       1    0   19        0        0        1     2     3     3     2  ...   
645       1    0   18        1        1        1     3     1     4     3  ...   
646       1    0   18        1        0        1     1     1     2     2  ...   
647       1    1   17        1        1        1     3     1     3     3  ...   
648       1    1   18        0        1        1     3     2     3     2  ...   

     romantic  famrel  free

In [10]:
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

In [13]:
def ml(data,target,model,pr=False):
    X_train,X_test,y_train,y_test = train_test_split(data, target, random_state=0)
    model.fit(X_train,y_train)
    y_pred = model.predict(X_test)
    mae = metrics.mean_absolute_error(y_test, y_pred)
    mse = metrics.mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(metrics.mean_squared_error(y_test, y_pred))
    r2_square = metrics.r2_score(y_test, y_pred)
    if pr==True:
        print('MAE:', mae)
        print('MSE:', mse)
        print('RMSE:', rmse)
        print('R2 Square', r2_square)
        print('__________________________________')
    return model,mae,mse,rmse,r2_square

MLA = [
    LinearRegression(),
    SVR(kernel='rbf'),
    SVR(kernel='poly'),
    GradientBoostingRegressor(),
    RandomForestRegressor()
]

le_models = {}
dummy_models = {}

In [17]:
evaluation_metrics = ['MAE','MSE','RMSE','R2_Square']
le_df_performance = pd.DataFrame(columns=evaluation_metrics)
dummy_df_performance = pd.DataFrame(columns=evaluation_metrics)

for alg in MLA:
    le_model, le_mae, le_mse, le_rmse, le_r2_square = ml(le_df,target,alg)
    dummy_model, dummy_mae, dummy_mse, dummy_rmse, dummy_r2_square = ml(dummy_df,target,alg)
    
    le_models[_name] = le_model
    dummy_models[_name] = dummy_model
    
    le_df_performance = le_df_performance.append(pd.Series({'MAE':le_mae ,'MSE':le_mse ,'RMSE':le_rmse ,'R2_Square':le_r2_square},name=_name))
    dummy_df_performance = dummy_df_performance.append(pd.Series({'MAE':dummy_mae ,'MSE':dummy_mse ,'RMSE':dummy_rmse ,'R2_Square':dummy_r2_square},name=_name))

In [18]:
sortting = 'MAE'
print('Dummies Dataset')
display(dummy_df_performance.sort_values(sortting))
print('LabelEncoder Dataset')
display(le_df_performance.sort_values(sortting))

Dummies Dataset


Unnamed: 0,MAE,MSE,RMSE,R2_Square
SVR_rbf,0.806362,1.487032,1.219439,0.835249
RandomForestRegressor,0.808405,1.320408,1.14909,0.85371
LinearRegression,0.81912,1.303924,1.141895,0.855536
GradientBoostingRegressor,0.84668,1.515467,1.231043,0.832099
SVR_poly,0.857321,1.723845,1.312953,0.809012


LabelEncoder Dataset


Unnamed: 0,MAE,MSE,RMSE,R2_Square
RandomForestRegressor,0.772577,1.13482,1.065279,0.874271
LinearRegression,0.799074,1.252246,1.119038,0.861261
SVR_rbf,0.807649,1.497811,1.223851,0.834055
GradientBoostingRegressor,0.824949,1.387739,1.178023,0.84625
SVR_poly,0.847051,1.632308,1.277618,0.819153
