# Supervised Learning Model Evaluation Lab

Complete the exercises below to solidify your knowledge and understanding of supervised learning model evaluation.

In [4]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split as tts



## Regression Model Evaluation

In [5]:
from sklearn.datasets import load_boston

data = load_boston()

Xb = pd.DataFrame(data["data"], columns=data["feature_names"])
yb = pd.DataFrame(data["target"], columns=['MEDV'])

data = pd.concat([Xb, yb], axis=1)

## 1. Split this data set into training (80%) and testing (20%) sets.

The `MEDV` field represents the median value of owner-occupied homes (in $1000's) and is the target variable that we will want to predict.

In [6]:
Xb_train, Xb_test, yb_train, yb_test = tts(Xb,yb, test_size=0.2)



## 2. Train a `LinearRegression` model on this data set and generate predictions on both the training and the testing set.

In [7]:
from sklearn.linear_model import LinearRegression as LinReg
boston1=LinReg()
boston1.fit(Xb_train, yb_train)
yb_pred_test = boston1.predict(Xb_test)
yb_pred_train = boston1.predict(Xb_train)


## 3. Calculate and print R-squared for both the training and the testing set.

In [8]:
from sklearn import metrics
print('R2 - Coeficiente de Determinacion del training set', metrics.r2_score(yb_train, yb_pred_train))
print('R2 - Coeficiente de Determinacion del testing set', metrics.r2_score(yb_test, yb_pred_test))

R2 - Coeficiente de Determinacion del training set 0.7218050189552423
R2 - Coeficiente de Determinacion del testing set 0.778059411628957


## 4. Calculate and print mean squared error for both the training and the testing set.

In [9]:
print('MSE - Error Cuadratico Medio del training set', metrics.mean_squared_error(yb_train, yb_pred_train))
print('MSE - Error Cuadratico Medio del testing', metrics.mean_squared_error(yb_test, yb_pred_test))

MSE - Error Cuadratico Medio del training set 21.55786310520929
MSE - Error Cuadratico Medio del testing 24.81396999450374


## 5. Calculate and print mean absolute error for both the training and the testing set.

In [10]:
print('MAE - Error Medio Absoluto training set', metrics.mean_absolute_error(yb_train, yb_pred_train))
print('MAE - Error Medio Absoluto testing set', metrics.mean_absolute_error(yb_test, yb_pred_test))

MAE - Error Medio Absoluto training set 3.205941477506951
MAE - Error Medio Absoluto testing set 3.795229146384991


## Classification Model Evaluation

In [11]:
from sklearn.datasets import load_iris

data = load_iris()

X = pd.DataFrame(data["data"], columns=data["feature_names"])
y = pd.DataFrame(data["target"], columns=["class"])

data = pd.concat([X, y], axis=1)

## 6. Split this data set into training (80%) and testing (20%) sets.

The `class` field represents the type of flower and is the target variable that we will want to predict.

In [12]:
X_train, X_test, y_train, y_test = tts(X,y, test_size=0.2)

## 7. Train a `LogisticRegression` model on this data set and generate predictions on both the training and the testing set.

In [13]:
from sklearn.linear_model import LogisticRegression
log = LogisticRegression()
log.fit(X_train,y_train)



  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

## 8. Calculate and print the accuracy score for both the training and the testing set.

In [14]:
from sklearn.metrics import balanced_accuracy_score, accuracy_score, precision_score, recall_score, f1_score, fbeta_score, confusion_matrix

In [15]:
y_pred_test = log.predict(X_test)
y_pred_train = log.predict(X_train)


print('Accuracy training set',accuracy_score( y_train, y_pred_train))
print('Accuracy test set',accuracy_score( y_test, y_pred_test))



Accuracy training set 0.975
Accuracy test set 0.9666666666666667


## 9. Calculate and print the balanced accuracy score for both the training and the testing set.

In [16]:
print('Accuracy balanced training set',balanced_accuracy_score( y_train, y_pred_train))
print('Accuracy balanced test set',balanced_accuracy_score( y_test, y_pred_test))


Accuracy balanced training set 0.9736486486486488
Accuracy balanced test set 0.9743589743589745


## 10. Calculate and print the precision score for both the training and the testing set.

In [17]:
print('precision training set',precision_score( y_train, y_pred_train,average="weighted"))
print('precision test set',precision_score( y_test, y_pred_test,average="weighted"))

precision training set 0.9751750225835593
precision test set 0.9696969696969696


## 11. Calculate and print the recall score for both the training and the testing set.

In [18]:
print('Recall training set',recall_score( y_train, y_pred_train,average="weighted"))
print('Recall test set',recall_score( y_test, y_pred_test,average="weighted"))

Recall training set 0.975
Recall test set 0.9666666666666667


## 12. Calculate and print the F1 score for both the training and the testing set.

In [19]:
print('F1 training set',f1_score( y_train, y_pred_train,average="weighted"))
print('F1 test set',f1_score( y_test, y_pred_test,average="weighted"))

F1 training set 0.974983088110942
F1 test set 0.9667936507936508


## 13. Generate confusion matrices for both the training and the testing set.

In [20]:
confusion_matrix( y_train, y_pred_train)



array([[43,  0,  0],
       [ 0, 35,  2],
       [ 0,  1, 39]])

In [21]:
confusion_matrix( y_test, y_pred_test)

array([[ 7,  0,  0],
       [ 0, 12,  1],
       [ 0,  0, 10]])

## Bonus: For each of the data sets in this lab, try training with some of the other models you have learned about, recalculate the evaluation metrics, and compare to determine which models perform best on each data set.

In [22]:
from sklearn.linear_model import Ridge, Lasso
from sklearn.linear_model import SGDRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR


models={'ridge': Ridge(),
    'lasso': Lasso(),
    'sgd': SGDRegressor(),
    'knn': KNeighborsRegressor(),
    'grad': GradientBoostingRegressor(),}




In [24]:
#Boston


for name, model in models.items():
    model.fit(Xb_train, yb_train)

for name, model in models.items():
    y_pred = model.predict(Xb_test)
    print(f"------{name}------")
    print('MAE - ', metrics.mean_absolute_error(yb_test, y_pred))
    print('MSE - ', metrics.mean_squared_error(yb_test, y_pred))
    print('RMSE - ', np.sqrt(metrics.mean_squared_error(yb_test, y_pred)))
    print('R2 - ', metrics.r2_score(yb_test, y_pred))


------ridge------
MAE -  3.835078908119012
MSE -  25.458330717983266
RMSE -  5.045624908570123
R2 -  0.7722961340025266
------lasso------
MAE -  4.03082804665628
MSE -  36.5850422874187
RMSE -  6.048557041759522
R2 -  0.6727768344747858
------sgd------
MAE -  110813725875319.55
MSE -  1.2630519476125597e+28
RMSE -  112385583933730.56
R2 -  -1.1296962656858793e+26
------knn------
MAE -  5.00686274509804
MSE -  49.09234901960784
RMSE -  7.006593253472606
R2 -  0.5609092447382777
------grad------
MAE -  2.006955349499188
MSE -  8.498707445957518
RMSE -  2.915254267805386
R2 -  0.9239860396636665


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [25]:
#IRIS
for name, model in models.items():
    model.fit(X_train, y_train)

for name, model in models.items():
    y_pred = model.predict(X_test)
    print(f"------{name}------")
    print('MAE - ', metrics.mean_absolute_error(y_test, y_pred))
    print('MSE - ', metrics.mean_squared_error(y_test, y_pred))
    print('RMSE - ', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
    print('R2 - ', metrics.r2_score(y_test, y_pred))

------ridge------
MAE -  0.175540619415313
MSE -  0.056227821543540786
RMSE -  0.23712406361131041
R2 -  0.8989919373469327
------lasso------
MAE -  0.45434118112222016
MSE -  0.3204733430463265
RMSE -  0.5661036504442685
R2 -  0.42429938374911413
------sgd------
MAE -  0.2269605967633053
MSE -  0.08143969270246057
RMSE -  0.2853764053008948
R2 -  0.8537011508339032
------knn------
MAE -  0.08666666666666666
MSE -  0.06266666666666666
RMSE -  0.2503331114069145
R2 -  0.8874251497005988
------grad------
MAE -  0.08672274259271619
MSE -  0.07199023959605098
RMSE -  0.2683099692446238
R2 -  0.8706762162945192


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
