In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math
from sklearn import metrics
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error

In [None]:
# Load the two student tables (Portuguese-language and mathematics courses).
# Both paths are relative to the notebook's working directory.
por, mat = (
    pd.read_csv(csv_path, parse_dates=True)
    for csv_path in ('student-por.csv', 'student-mat.csv')
)

In [None]:
# Names of the predictor columns, listed in a fixed order.
# NOTE(review): this list is not referenced by any other cell visible here.
all_features = [
    'school', 'sex', 'age', 'address', 'famsize', 'Pstatus',
    'Medu', 'Fedu', 'Mjob', 'Fjob', 'reason', 'guardian',
    'traveltime', 'studytime', 'failures', 'schoolsup', 'paid', 'famsup',
    'activities', 'nursery', 'higher', 'internet', 'romantic', 'famrel',
    'freetime', 'goout', 'Dalc', 'Walc', 'health', 'absences',
]

Odrzucenie cech nieistotnych:

In [None]:
# Columns removed from both tables before modelling: the features treated as
# irrelevant, plus G1 and G2.
dropped_columns = [
    'address', 'health', 'reason', 'guardian', 'Fjob', 'Mjob',
    'traveltime', 'nursery', 'internet', 'G1', 'G2',
]
por = por.drop(columns=dropped_columns)
mat = mat.drop(columns=dropped_columns)

Zakodowanie wartości cech nienumerycznych (binarnych) jako 0/1:

In [None]:
# Binary categorical columns are recoded to integers in both tables.
# Each entry names the category that becomes 0; any other value in that
# column becomes 1 (so for the yes/no columns 'yes' -> 0 and the rest -> 1).
zero_category = {
    'school': 'GP',
    'sex': 'F',
    'famsize': 'GT3',
    'Pstatus': 'A',
    'schoolsup': 'yes',
    'famsup': 'yes',
    'paid': 'yes',
    'activities': 'yes',
    'higher': 'yes',
    'romantic': 'yes',
}

for frame in (por, mat):
    for column, zero_value in zero_category.items():
        # The lambda is consumed immediately by apply, so capturing the loop
        # variable here is safe.
        frame[column] = frame[column].apply(lambda v: 0 if v == zero_value else 1)

# *Influence on grades prediction for por dataset*

In [None]:
# Separate the target column G3 from the predictors of the Portuguese table.
target_column = 'G3'
yp = por[target_column]
por = por.drop(columns=[target_column])

In [None]:
# Print the column names, non-null counts and dtypes of the predictor frame.
por.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 649 entries, 0 to 648
Data columns (total 21 columns):
 #   Column      Non-Null Count  Dtype
---  ------      --------------  -----
 0   school      649 non-null    int64
 1   sex         649 non-null    int64
 2   age         649 non-null    int64
 3   famsize     649 non-null    int64
 4   Pstatus     649 non-null    int64
 5   Medu        649 non-null    int64
 6   Fedu        649 non-null    int64
 7   studytime   649 non-null    int64
 8   failures    649 non-null    int64
 9   schoolsup   649 non-null    int64
 10  famsup      649 non-null    int64
 11  paid        649 non-null    int64
 12  activities  649 non-null    int64
 13  higher      649 non-null    int64
 14  romantic    649 non-null    int64
 15  famrel      649 non-null    int64
 16  freetime    649 non-null    int64
 17  goout       649 non-null    int64
 18  Dalc        649 non-null    int64
 19  Walc        649 non-null    int64
 20  absences    649 non-null    int64

Dzielimy zbiór por na treningowy i testowy:

In [None]:
# Hold out 30% of the Portuguese rows for testing; the fixed random_state
# makes the split repeatable between runs.
xp_train, xp_test, yp_train, yp_test = train_test_split(
    por,
    yp,
    test_size=0.3,
    random_state=1,
)

Wybieramy współczynnik regularyzacji dla metody Lasso za pomocą grid search cross validation.

In [None]:
# Candidate regularisation strengths tried for Lasso on the Portuguese data.
parameters = {
    'alpha': [0.0001, 0.001, 0.1, 0.2, 0.4, 0.8, 1.6, 3.2, 6.4],
}
lassop = linear_model.Lasso()

# GridSearchCV is left on its default cross-validation and scoring settings.
clf = GridSearchCV(estimator=lassop, param_grid=parameters)
clf.fit(xp_train, yp_train)
print(clf.best_params_)

{'alpha': 0.001}


In [None]:
# The alpha value is copied by hand from the grid-search result printed above.
lasso_alpha = 0.001
lassop = linear_model.Lasso(alpha=lasso_alpha)
lassop.fit(xp_train, yp_train)

Lasso(alpha=0.001, copy_X=True, fit_intercept=True, max_iter=1000,
      normalize=False, positive=False, precompute=False, random_state=None,
      selection='cyclic', tol=0.0001, warm_start=False)

Cechy, wybrane przez Lasso jako istotne:

In [None]:
# Rank the Portuguese-course features by the absolute value of their Lasso
# coefficient (largest first) and keep those with a non-zero coefficient.
#
# Each feature name is paired with its own coefficient before sorting.
# Looking names up afterwards with list.index(value) returns the first match
# only, so two features with equal |coef| would yield the same name twice and
# silently lose the other one.
fea = list(xp_train.columns)
lp_coefs = np.abs(lassop.coef_).tolist()

# sorted() is stable, so tied features keep their column order.
lasso_ranking_p = sorted(zip(fea, lp_coefs), key=lambda pair: pair[1], reverse=True)
lp_coefs_sorted = [weight for name, weight in lasso_ranking_p]

best_features_lasso = [name for name, weight in lasso_ranking_p if weight > 0]
for name in best_features_lasso:
    print(name)

higher
school
failures
schoolsup
paid
famsize
sex
activities
studytime
romantic
Walc
famsup
Fedu
age
famrel
Dalc
freetime
Medu
goout
Pstatus
absences


Metoda wybiera wszystkie atrybuty jako te, które mają wpływ na uczenie się modelu.

# **Ridge Regression**

 *For all features = For the best Lasso features*

In [None]:
# Candidate regularisation strengths tried for Ridge on the Portuguese data.
parameters = {
    'alpha': [
        0.01, 0.02, 0.04, 0.16, 0.32, 0.64, 1.28,
        2.56, 5.12, 10.24, 20.48, 40.96, 82,
    ],
}
rdg = Ridge()

# GridSearchCV is left on its default cross-validation and scoring settings.
clf = GridSearchCV(estimator=rdg, param_grid=parameters)
clf.fit(xp_train, yp_train)
print("The best lambda parameter for Ridge: ", clf.best_params_)

The best lambda parameter for Ridge:  {'alpha': 10.24}


In [None]:
# Fit Ridge on the Portuguese training split and report train/test errors.
# The alpha value is copied by hand from the grid-search result printed above.
#
# The printed labels are kept as they were; what they actually show is:
#   "Bias" / "Generalization"  -> MSE on the train / test split
#   "Accuracy ..."             -> RMSE (square root of the MSE)
#   "Mean average error ..."   -> mean absolute error (MAE)
model = Ridge(alpha = 10.24).fit(xp_train, yp_train)

yp_pred = model.predict(xp_train)
e_train = metrics.mean_squared_error(yp_train, yp_pred)
rmse_train = math.sqrt(e_train)

yp_test_pred = model.predict(xp_test)
# Arguments are given in scikit-learn's (y_true, y_pred) order, like every
# other metric call in this notebook. MSE is symmetric, so the value is the
# same as before.
e_test = metrics.mean_squared_error(yp_test, yp_test_pred)
rmse_test = math.sqrt(e_test)

print('Bias: %.3f'%e_train)
print('Generalization: %.3f'%e_test)
print('Accuracy for train set: %.3f'%rmse_train)
print('Accuracy for test set: %.3f'%rmse_test)
print('Mean average error train set: %.3f'%mean_absolute_error(yp_train, yp_pred))
print('Mean average error test set: %.3f'%mean_absolute_error(yp_test, yp_test_pred))

Bias: 6.389
Generalization: 8.480
Accuracy for train set: 2.528
Accuracy for test set: 2.912
Mean average error train set: 1.852
Mean average error test set: 2.089


In [None]:
# Print the Portuguese-course features ordered by the absolute value of their
# Ridge coefficient (largest first).
#
# This ranking is display-only. It deliberately does not append to
# best_features_lasso: that list holds the Lasso selection, and adding the
# Ridge ordering to it would duplicate every feature name.
fea = list(xp_train.columns)
lp_coefs = np.abs(model.coef_).tolist()

# Names are paired with their own coefficient before sorting, so features
# with equal |coef| are each listed once (list.index would repeat the first).
ridge_ranking_p = sorted(zip(fea, lp_coefs), key=lambda pair: pair[1], reverse=True)
lp_coefs_sorted = [weight for name, weight in ridge_ranking_p]

for name, weight in ridge_ranking_p:
    print(name)

higher
failures
school
schoolsup
paid
famsize
sex
activities
studytime
romantic
Walc
Fedu
famsup
Dalc
famrel
freetime
age
Medu
goout
Pstatus
absences


## **Linear Regression**
*For all features = For the best lasso features*

In [None]:
# Ordinary least-squares baseline on all Portuguese-course features.
model_p = LinearRegression()
model_p.fit(X=xp_train, y=yp_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [None]:
# Evaluate the linear regression on the Portuguese train and test splits.
yp_pred, yp_test_pred = (model_p.predict(split) for split in (xp_train, xp_test))

# Mean squared error per split, and its square root (RMSE).
ep_train, ep_test = (
    metrics.mean_squared_error(actual, predicted)
    for actual, predicted in ((yp_train, yp_pred), (yp_test, yp_test_pred))
)
rmse_train, rmse_test = math.sqrt(ep_train), math.sqrt(ep_test)

# Labels are printed unchanged: "Bias"/"Generalization" are the train/test
# MSE, "Accuracy" is the RMSE, "Mean average error" is the MAE.
report_lines = (
    ('Bias: %.3f', ep_train),
    ('Generalization: %.3f', ep_test),
    ('Accuracy for train set: %.3f', rmse_train),
    ('Accuracy for test set: %.3f', rmse_test),
    ('Mean average error train set: %.3f', mean_absolute_error(yp_train, yp_pred)),
    ('Mean average error test set: %.3f', mean_absolute_error(yp_test, yp_test_pred)),
)
for template, value in report_lines:
    print(template % value)

Bias: 6.359
Generalization: 8.549
Accuracy for train set: 2.522
Accuracy for test set: 2.924
Mean average error train set: 1.862
Mean average error test set: 2.097


Wykorzystujemy dwa estymatory - Ridge Regression oraz Linear Regression. Współczynnik regularyzacji dla regresji grzbietowej wybieramy jak poprzednio za pomocą metody grid search cross validation. Wyniki otrzymane w efekcie trenowania modeli tych estymatorów różnią się w małym stopniu.

Do oceny modeli wykorzystujemy MSE (mean squared error), RMSE (root mean squared error) oraz MAE (mean absolute error). MAE jest miarą, która wskazuje średnią bezwzględną różnicę między wartością rzeczywistą a przewidywaną. MAE i RMSE są do siebie podobne — oba błędy opisują różnice między wartością rzeczywistą a przewidywaną — ale zaletą RMSE jest to, że nadaje większą wagę dużym błędom. RMSE otrzymuje się przez podniesienie różnic do kwadratu, obliczenie ich średniej, a następnie spierwiastkowanie wyniku (innymi słowy, jest to pierwiastek z MSE). Dzięki temu waga błędu rośnie kwadratowo wraz z jego wielkością, podczas gdy MAE jest mniej czuła na wartości odstające.
Biorąc pod uwagę, że skala ocen jest 20-punktowa, wyniki dokonanych obliczeń są zadowalające.



# *Influence on grades prediction for mat dataset*

In [None]:
# Separate the target column G3 from the predictors of the mathematics table.
target_column = 'G3'
ym = mat[target_column]
mat = mat.drop(columns=[target_column])

Dzielimy zbiór mat na zbiór treningowy i testowy:

In [None]:
# Hold out 30% of the mathematics rows for testing; the fixed random_state
# makes the split repeatable between runs.
xm_train, xm_test, ym_train, ym_test = train_test_split(
    mat,
    ym,
    test_size=0.3,
    random_state=1,
)

Wybieramy współczynnik regularyzacji dla metody Lasso:

In [None]:
# Candidate regularisation strengths tried for Lasso on the mathematics data.
parameters = {
    'alpha': [0.001, 0.01, 0.1, 0.2, 0.4, 0.8, 1.6, 3.2, 6.4],
}
lassom = linear_model.Lasso()

# GridSearchCV is left on its default cross-validation and scoring settings.
clf = GridSearchCV(estimator=lassom, param_grid=parameters)
clf.fit(xm_train, ym_train)
print(clf.best_params_)

{'alpha': 0.1}


In [None]:
# Fit Lasso on the mathematics training split and keep the features whose
# coefficient is non-zero, ordered by |coefficient| (largest first).
# The alpha value is copied by hand from the grid-search result printed above.
lassom = linear_model.Lasso(alpha=0.1)
lassom.fit(xm_train, ym_train)

fea = list(xm_train.columns)
lm_coefs = np.abs(lassom.coef_).tolist()

# Each name is paired with its own coefficient before sorting. Looking names
# up afterwards with list.index(value) returns the first match only, so two
# features with equal |coef| would put one name into the selection twice and
# drop the other; the later cells index xm_train with this list.
lasso_ranking_m = sorted(zip(fea, lm_coefs), key=lambda pair: pair[1], reverse=True)
lm_coefs_sorted = [weight for name, weight in lasso_ranking_m]

bestm_features_lasso = [name for name, weight in lasso_ranking_m if weight != 0]
for name in bestm_features_lasso:
    print(name)

failures
romantic
schoolsup
famsize
Medu
higher
sex
studytime
famsup
age
freetime
paid
goout
famrel
absences
Walc


Metoda Lasso wyznaczyła 5 cech jako nieistotne.

# **Ridge regression**
*For the best lasso features*

Znajdujemy współczynnik regularyzacji dla regresji grzbietowej:

In [None]:
# Candidate regularisation strengths for Ridge, searched on the mathematics
# training split restricted to the Lasso-selected columns.
parameters = {
    'alpha': [
        0.01, 0.02, 0.04, 0.16, 0.32, 0.64, 1.28,
        2.56, 5.12, 10.24, 20.48, 40.96, 82,
    ],
}
rdg = Ridge()

# GridSearchCV is left on its default cross-validation and scoring settings.
clf = GridSearchCV(estimator=rdg, param_grid=parameters)
clf.fit(xm_train[bestm_features_lasso], ym_train)
print("The best lambda parameter for Ridge: ", clf.best_params_)

The best lambda parameter for Ridge:  {'alpha': 10.24}


In [None]:
# Fit and evaluate Ridge on the Lasso-selected mathematics features.
# The alpha value is copied by hand from the grid-search result printed above.
xm_train_selected = xm_train[bestm_features_lasso]
xm_test_selected = xm_test[bestm_features_lasso]

model = Ridge(alpha=10.24).fit(xm_train_selected, ym_train)
ym_pred = model.predict(xm_train_selected)
ym_test_pred = model.predict(xm_test_selected)

# Mean squared error per split, and its square root (RMSE).
e_train = metrics.mean_squared_error(ym_train, ym_pred)
e_test = metrics.mean_squared_error(ym_test, ym_test_pred)
rmse_train, rmse_test = math.sqrt(e_train), math.sqrt(e_test)

# Labels are printed unchanged: "Bias"/"Generalization" are the train/test
# MSE, "Accuracy" is the RMSE, "Mean average error" is the MAE.
report_lines = (
    ('Bias: %.3f', e_train),
    ('Generalization: %.3f', e_test),
    ('Accuracy for train set: %.3f', rmse_train),
    ('Accuracy for test set: %.3f', rmse_test),
    ('Mean average error train set: %.3f', mean_absolute_error(ym_train, ym_pred)),
    ('Mean average error test set: %.3f', mean_absolute_error(ym_test, ym_test_pred)),
)
for template, value in report_lines:
    print(template % value)

Bias: 16.305
Generalization: 17.301
Accuracy for train set: 4.038
Accuracy for test set: 4.159
Mean average error train set: 3.188
Mean average error test set: 3.144


*For all features*

In [None]:
# Candidate regularisation strengths for Ridge, searched on the mathematics
# training split with all columns.
parameters = {
    'alpha': [
        0.01, 0.02, 0.04, 0.16, 0.32, 0.64, 1.28,
        2.56, 5.12, 10.24, 20.48, 40.96, 82,
    ],
}
rdg = Ridge()

# GridSearchCV is left on its default cross-validation and scoring settings.
clf = GridSearchCV(estimator=rdg, param_grid=parameters)
clf.fit(xm_train, ym_train)
print("The best lambda parameter for Ridge: ", clf.best_params_)

The best lambda parameter for Ridge:  {'alpha': 20.48}


In [None]:
# Fit and evaluate Ridge on all mathematics features.
# The alpha value is copied by hand from the grid-search result printed above.
model = Ridge(alpha=20.48).fit(xm_train, ym_train)
ym_pred, ym_test_pred = (model.predict(split) for split in (xm_train, xm_test))

# Mean squared error per split, and its square root (RMSE).
e_train = metrics.mean_squared_error(ym_train, ym_pred)
e_test = metrics.mean_squared_error(ym_test, ym_test_pred)
rmse_train, rmse_test = math.sqrt(e_train), math.sqrt(e_test)

# Labels are printed unchanged: "Bias"/"Generalization" are the train/test
# MSE, "Accuracy" is the RMSE, "Mean average error" is the MAE.
report_lines = (
    ('Bias: %.3f', e_train),
    ('Generalization: %.3f', e_test),
    ('Accuracy for train set: %.3f', rmse_train),
    ('Accuracy for test set: %.3f', rmse_test),
    ('Mean average error train set: %.3f', mean_absolute_error(ym_train, ym_pred)),
    ('Mean average error test set: %.3f', mean_absolute_error(ym_test, ym_test_pred)),
)
for template, value in report_lines:
    print(template % value)

Bias: 16.394
Generalization: 16.571
Accuracy for train set: 4.049
Accuracy for test set: 4.071
Mean average error train set: 3.195
Mean average error test set: 3.064


In [None]:
# Print the mathematics-course features ordered by the absolute value of
# their Ridge coefficient (largest first).
#
# Feature names come from xm_train, the frame this Ridge model was fitted on,
# rather than from the Portuguese split. The ranking is display-only and does
# not append to best_features_lasso, which holds the Portuguese Lasso
# selection.
fea = list(xm_train.columns)
lp_coefs = np.abs(model.coef_).tolist()

# Names are paired with their own coefficient before sorting, so features
# with equal |coef| are each listed once (list.index would repeat the first).
ridge_ranking_m = sorted(zip(fea, lp_coefs), key=lambda pair: pair[1], reverse=True)
lp_coefs_sorted = [weight for name, weight in ridge_ranking_m]

for name, weight in ridge_ranking_m:
    print(name)

higher
failures
school
schoolsup
paid
famsize
sex
activities
studytime
romantic
Walc
Fedu
famsup
Dalc
famrel
freetime
age
Medu
goout
Pstatus
absences


# **Linear Regression**
*For all features*

In [None]:
# Ordinary least-squares baseline on all mathematics-course features.
model_m = LinearRegression()
model_m.fit(X=xm_train, y=ym_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [None]:
# Evaluate the linear regression on the mathematics train and test splits.
ym_pred, ym_test_pred = (model_m.predict(split) for split in (xm_train, xm_test))

# Mean squared error per split, and its square root (RMSE).
em_train, em_test = (
    metrics.mean_squared_error(actual, predicted)
    for actual, predicted in ((ym_train, ym_pred), (ym_test, ym_test_pred))
)
rmse_train, rmse_test = math.sqrt(em_train), math.sqrt(em_test)

# Labels are printed unchanged: "Bias"/"Generalization" are the train/test
# MSE, "Accuracy" is the RMSE, "Mean average error" is the MAE.
report_lines = (
    ('Bias: %.3f', em_train),
    ('Generalization: %.3f', em_test),
    ('Accuracy for train set: %.3f', rmse_train),
    ('Accuracy for test set: %.3f', rmse_test),
    ('Mean average error train set: %.3f', mean_absolute_error(ym_train, ym_pred)),
    ('Mean average error test set: %.3f', mean_absolute_error(ym_test, ym_test_pred)),
)
for template, value in report_lines:
    print(template % value)

Bias: 16.034
Generalization: 19.039
Accuracy for train set: 4.004
Accuracy for test set: 4.363
Mean average error train set: 3.150
Mean average error test set: 3.319


*For the best lasso features*

In [None]:
# Refit the same LinearRegression object on the Lasso-selected mathematics
# features and evaluate it. This replaces the all-features fit held in model_m.
xm_train_selected = xm_train[bestm_features_lasso]
xm_test_selected = xm_test[bestm_features_lasso]

model_m.fit(xm_train_selected, ym_train)
ym_pred = model_m.predict(xm_train_selected)
ym_test_pred = model_m.predict(xm_test_selected)

# Mean squared error per split, and its square root (RMSE).
em_train = metrics.mean_squared_error(ym_train, ym_pred)
em_test = metrics.mean_squared_error(ym_test, ym_test_pred)
rmse_train, rmse_test = math.sqrt(em_train), math.sqrt(em_test)

# Labels are printed unchanged: "Bias"/"Generalization" are the train/test
# MSE, "Accuracy" is the RMSE, "Mean average error" is the MAE.
report_lines = (
    ('Bias: %.3f', em_train),
    ('Generalization: %.3f', em_test),
    ('Accuracy for train set: %.3f', rmse_train),
    ('Accuracy for test set: %.3f', rmse_test),
    ('Mean average error train set: %.3f', mean_absolute_error(ym_train, ym_pred)),
    ('Mean average error test set: %.3f', mean_absolute_error(ym_test, ym_test_pred)),
)
for template, value in report_lines:
    print(template % value)

Bias: 16.159
Generalization: 18.914
Accuracy for train set: 4.020
Accuracy for test set: 4.349
Mean average error train set: 3.158
Mean average error test set: 3.302


Po dokonaniu obliczeń na zbiorze danych z kursu matematyki od razu zauważyłyśmy słabszą wydajność modeli. Uzasadnić to można tym, że ten zbiór jest prawie dwa razy mniejszy od zbioru z języka portugalskiego. Również na podstawie wykorzystanych metryk możemy zauważyć pewną przewagę estymatora Ridge Regression, a mianowicie niższe błędy w przypadku zbioru testowego, oraz dobrze widać działanie regularyzacji.