In [32]:
import pandas as pd
import numpy as np
import scipy
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn import neighbors
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from statsmodels.tools.eval_measures import mse, rmse

import matplotlib.pyplot as plt
%matplotlib inline


In [33]:
# read in the iris data
from sklearn.datasets import load_iris
iris = load_iris()
X = pd.DataFrame(iris.data[:, :], columns = iris.feature_names[:])
Y = pd.DataFrame(iris.target, columns =["Species"])

# split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2, random_state = 465)

print('The number of observations in the training set is : {}'.format(X_train.shape[0]))
print('The number of observations in the test set is : {}'.format(X_test.shape[0]))

The number of observations in the training set is : 120
The number of observations in the test set is : 30


## KNN Regression

In [39]:
# KNN model on iris dataset
knn = neighbors.KNeighborsRegressor(n_neighbors=5)
knn.fit(X_train, y_train)

# Make predictions 
y_preds_train = knn.predict(X_train)

#evaluate knn using test set
y_preds_test = knn.predict(X_test)

score = cross_val_score(knn, X, Y, cv=5)
print("Unweighted Accuracy: %0.2f (+/- %0.2f)" % (score.mean(), score.std() * 2))

print("R-squared of the model in the training set is: {}".format(knn.score(X_train, y_train)))
print("-----Test set statistics-----")
print("R-squared of the model in the test set is: {}".format(knn.score(X_test, y_test)))
print("Mean absolute error of the prediction is: {}".format(mean_absolute_error(y_test, y_preds_test)))
print("Mean squared error of the prediction is: {}".format(mse(y_test, y_preds_test)))
print("Root mean squared error of the prediction is: {}".format(rmse(y_test, y_preds_test)))
print("Mean absolute percentage error of the prediction is: {}".format(np.mean(np.abs((y_test - y_preds_test) / y_test)) * 100))


Unweighted Accuracy: 0.55 (+/- 0.92)
R-squared of the model in the training set is: 0.9639172072907013
-----Test set statistics-----
R-squared of the model in the test set is: 0.9743315508021391
Mean absolute error of the prediction is: 0.05333333333333332
Mean squared error of the prediction is: [0.016]
Root mean squared error of the prediction is: [0.12649111]
Mean absolute percentage error of the prediction is: Species    4.090909
dtype: float64


## OLS Regression

In [35]:
# We fit an OLS model using sklearn
lrm = LinearRegression()
lrm.fit(X_train, y_train)

# Make predictions 
y_preds_train = lrm.predict(X_train)

#evaluate OLS using test set
y_preds_test = lrm.predict(X_test)

scores = cross_val_score(lrm, X_train, y_train, cv=5)
print(scores)
print("Unweighted Accuracy: %0.2f (+/- %0.2f)" % (score.mean(), score.std() * 2))

print("R-squared of the model in the training set is: {}".format(lrm.score(X_train, y_train)))
print("-----Test set statistics-----")
print("R-squared of the model in the test set is: {}".format(lrm.score(X_test, y_test)))
print("Mean absolute error of the prediction is: {}".format(mean_absolute_error(y_test, y_preds_test)))
print("Mean squared error of the prediction is: {}".format(mse(y_test, y_preds_test)))
print("Root mean squared error of the prediction is: {}".format(rmse(y_test, y_preds_test)))
print("Mean absolute percentage error of the prediction is: {}".format(np.mean(np.abs((y_test - y_preds_test) / y_test)) * 100))


[0.93193002 0.97422635 0.96296606 0.83410297 0.86941669]
Unweighted Accuracy: 0.32 (+/- 0.79)
R-squared of the model in the training set is: 0.930168240196863
-----Test set statistics-----
R-squared of the model in the test set is: 0.9274006305038807
Mean absolute error of the prediction is: 0.1677557694822595
Mean squared error of the prediction is: [0.04525361]
Root mean squared error of the prediction is: [0.21272895]
Mean absolute percentage error of the prediction is: Species    inf
dtype: float64
