# Model Validation Methods

#### 1. Evaluate using a train and a test set

In [12]:
# Evaluate using a train and a test set
from pandas import read_csv

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
# Load the Pima Indians diabetes CSV. Column names are supplied explicitly,
# so read_csv treats every row of the file as data.
filename = 'pima-indians-diabetes-data.csv'
names = [
    'preg', 'plas', 'pres', 'skin', 'test',
    'mass', 'pedi', 'age', 'class',
]
dataframe = read_csv(filename, names=names)

# Columns 0-7 are the predictors; column 8 ('class') is the target.
array = dataframe.to_numpy()
X, Y = array[:, :8], array[:, 8]

# Hold out 33% of the rows; the fixed seed keeps the split repeatable.
test_size = 0.33
seed = 7
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=test_size, random_state=seed
)

# Fit on the training portion only, then score (mean accuracy) on the
# held-out portion.
model = LogisticRegression(max_iter=300).fit(X_train, Y_train)
result = model.score(X_test, Y_test)


In [13]:
# Show the hold-out accuracy computed above as a percentage.
result*100.0

78.74015748031496

#### 2. Evaluate using Cross Validation

In [14]:
# Evaluate using Cross Validation
from pandas import read_csv
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
# Load the dataset; column names are passed explicitly to read_csv.
filename = '/content/sample_data/pima-indians-diabetes-data.csv'
names = [
    'preg', 'plas', 'pres', 'skin', 'test',
    'mass', 'pedi', 'age', 'class',
]
dataframe = read_csv(filename, names=names)

# Columns 0-7 are the predictors; column 8 ('class') is the target.
array = dataframe.to_numpy()
X, Y = array[:, :8], array[:, 8]

# 10-fold cross validation; rows are shuffled with a fixed seed before
# being divided into folds so the folds are the same on every run.
num_folds = 10
seed = 7
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)

# cross_val_score fits a fresh copy of the model per fold and returns one
# score per fold.
model = LogisticRegression(max_iter=400)
results = cross_val_score(estimator=model, X=X, y=Y, cv=kfold)


In [15]:
# Average of the per-fold scores, shown as a percentage.
results.mean()*100.0

np.float64(77.21633629528367)

In [10]:
# Standard deviation of the per-fold scores, in percentage points
# (how much the score varies from fold to fold).
results.std()*100.0

np.float64(4.96837651757489)

#### 3. Evaluate using Leave One Out Cross Validation

In [11]:
# Evaluate using Leave One Out Cross Validation
from pandas import read_csv
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
# Load the dataset; column names are passed explicitly to read_csv.
filename = '/content/sample_data/pima-indians-diabetes-data.csv'
names = [
    'preg', 'plas', 'pres', 'skin', 'test',
    'mass', 'pedi', 'age', 'class',
]
dataframe = read_csv(filename, names=names)

# Columns 0-7 are the predictors; column 8 ('class') is the target.
array = dataframe.to_numpy()
X, Y = array[:, :8], array[:, 8]

# Leave-one-out: every row is used once as a single-sample test set, so the
# model is refit once per row.
loocv = LeaveOneOut()
model = LogisticRegression(max_iter=300)
results = cross_val_score(estimator=model, X=X, y=Y, cv=loocv)


In [16]:
# Average of the scores currently held in `results`, shown as a percentage.
# NOTE(review): the saved output of this cell is identical to the 10-fold
# mean shown earlier, and the execution counts show the leave-one-out cell
# (In [11]) ran before the K-fold cell (In [14]); `results` was presumably
# still the K-fold scores when this ran. Restart the kernel and run all cells
# in order before trusting this number.
results.mean()*100.0

np.float64(77.21633629528367)

In [17]:
# Standard deviation of the scores currently held in `results`, in
# percentage points.
# NOTE(review): the saved output of this cell is identical to the 10-fold
# standard deviation shown earlier, so it presumably reflects the K-fold
# scores rather than leave-one-out (whose single-sample folds each score
# either 0 or 1) — re-run the notebook in order to confirm.
results.std()*100.0

np.float64(4.96837651757489)

### Measuring Model Accuracies

#### 1. Regression Model

In [23]:
import pandas as pd
import sklearn
from sklearn.linear_model import LinearRegression

# Regress the 'sunday' column on the 'daily' column.
df = pd.read_csv('/content/sample_data/NewspaperData.csv')

# The list selector keeps X two-dimensional (a one-column DataFrame),
# which is the shape scikit-learn estimators expect for features.
X = df[["daily"]]
y = df["sunday"]

model = LinearRegression().fit(X, y)
y_pred = model.predict(X)

# R^2 (coefficient of determination). Note this is an in-sample figure:
# the model is scored on the same rows it was fitted to.
model.score(X, y)


0.9180596895873295

In [24]:
from sklearn.metrics import mean_absolute_error
# Mean absolute error between the observed targets and the fitted values.
mae = mean_absolute_error(y_true=y, y_pred=y_pred)
mae


80.1327444999346

In [25]:
from sklearn.metrics import mean_squared_error
# Mean squared error between the observed targets and the fitted values.
mse = mean_squared_error(y_true=y, y_pred=y_pred)
mse


11268.6920722415

In [26]:
from sklearn.metrics import root_mean_squared_error
# Root mean squared error: same units as the target, unlike the MSE.
rmse = root_mean_squared_error(y_true=y, y_pred=y_pred)
rmse


106.15409588066538

In [27]:
from sklearn.metrics import r2_score
# R^2 computed from the predictions directly; for these y / y_pred it matches
# the value returned by model.score(X, y) in the regression cell.
r2 = r2_score(y_true=y, y_pred=y_pred)
r2

0.9180596895873295

In [30]:
from sklearn.linear_model import Ridge
# Ridge regression (L2-penalised least squares) with a small penalty
# strength, fitted and scored on the same X, y as the plain linear model.
ridge = Ridge(alpha=0.1).fit(X, y)
ridge_predict = ridge.predict(X)

# In-sample R^2 of the ridge fit.
print('Ridge score: ', ridge.score(X, y))

Ridge score:  0.9180596895873279


#### 2. Classification Model

In [33]:
# Confusion matrix with random forest
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
# Load the dataset; column names are passed explicitly to read_csv.
filename = '/content/sample_data/pima-indians-diabetes-data.csv'
names = [
    'preg', 'plas', 'pres', 'skin', 'test',
    'mass', 'pedi', 'age', 'class',
]
df = pd.read_csv(filename, names=names)

# Columns 0-7 are the predictors; column 8 ('class') is the target.
array = df.to_numpy()
X, y = array[:, :8], array[:, 8]

# 70/30 train/test split with a fixed seed for a repeatable partition.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=1
)

# Seeded random forest so the fitted trees are the same on every run.
rf = RandomForestClassifier(random_state=4).fit(x_train, y_train)
y_pred = rf.predict(x_test)

# Confusion matrix layout: rows are true classes, columns are predictions.
cm = confusion_matrix(y_test, y_pred)

print('Accuracy: ', accuracy_score(y_test, y_pred))
print('Confusion matrix: \n', cm)
print('Classification report: \n', classification_report(y_test, y_pred))

Accuracy:  0.7619047619047619
Confusion matrix: 
 [[128  18]
 [ 37  48]]
Classification report: 
               precision    recall  f1-score   support

         0.0       0.78      0.88      0.82       146
         1.0       0.73      0.56      0.64        85

    accuracy                           0.76       231
   macro avg       0.75      0.72      0.73       231
weighted avg       0.76      0.76      0.75       231



In [36]:
# Number of held-out labels produced by the train/test split above.
y_test.shape

(231,)

# Hyperparameter Tuning

In [39]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier

# Load the dataset used by the tuning cells below; column names are passed
# explicitly to read_csv.
filename = '/content/sample_data/pima-indians-diabetes-data.csv'
names = [
    'preg', 'plas', 'pres', 'skin', 'test',
    'mass', 'pedi', 'age', 'class',
]
df = pd.read_csv(filename, names=names)

# Columns 0-7 are the predictors; column 8 ('class') is the target.
array = df.to_numpy()
X, y = array[:, :8], array[:, 8]

### One Hyperparameter Tuning - KNN

In [19]:
# grid search cross validation with 1 hyperparameter
from sklearn.model_selection import GridSearchCV
# Candidate neighbourhood sizes: k = 1, 2, ..., 49 (arange excludes the stop).
grid = {'n_neighbors': np.arange(1, 50)}

# Exhaustive search over k, scoring each candidate with 3-fold cross validation.
knn = KNeighborsClassifier()
knn_cv = GridSearchCV(estimator=knn, param_grid=grid, cv=3)
knn_cv.fit(X, y)

# Report the winning k and its mean cross-validated score.
print("Tuned hyperparameter k: {}".format(knn_cv.best_params_))
print("Best score: {}".format(knn_cv.best_score_))

Tuned hyperparameter k: {'n_neighbors': np.int64(8)}
Best score: 0.7552083333333334


### Multiple Hyperparameters Tuning - Decision Trees

In [40]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Define the parameter grid: 2 * 3 * 3 * 3 = 54 candidate combinations.
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 3, 5]
}

# Instantiate the decision tree (seeded so tie-breaking between equally good
# splits is repeatable).
dtree = DecisionTreeClassifier(random_state=42)

# Set up GridSearchCV: every combination is scored with 3-fold cross validation.
grid_search = GridSearchCV(dtree, param_grid, cv=3)

# Fit the model
grid_search.fit(X, y)

# Retrieve the best parameters
best_params = grid_search.best_params_
print("Best parameters:", best_params)

# Evaluate the model.
# CAUTION: with the default refit=True, best_estimator_ has been refit on ALL
# of X, y, so predicting on X measures training (resubstitution) accuracy and
# overstates how the tree will do on unseen data.
best_dtree = grid_search.best_estimator_
y_pred = best_dtree.predict(X)
accuracy = accuracy_score(y, y_pred)
print("Accuracy:", accuracy)

# The mean cross-validated score of the best combination is the estimate that
# was actually computed on held-out folds; report it alongside.
print("Cross-validated accuracy:", grid_search.best_score_)

Best parameters: {'criterion': 'gini', 'max_depth': 5, 'min_samples_leaf': 3, 'min_samples_split': 10}
Accuracy: 0.8294270833333334


## Save Model

In [41]:
import pickle

# Refit a tree with the hyperparameters reported by the grid search above.
# random_state matches the seed used for the tuned tree so the saved model is
# reproducible from run to run.
model = DecisionTreeClassifier(
    criterion='gini',
    max_depth=5,
    min_samples_leaf=3,
    min_samples_split=10,
    random_state=42,
)
model.fit(X, y)

# Save the model to disk. The context manager guarantees the file is flushed
# and closed before the next cell tries to read it back.
filename = 'model.pkl'
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)

In [42]:
# Load the model from disk. The context manager closes the file handle as soon
# as unpickling finishes.
# SECURITY: unpickling can execute arbitrary code — only load pickle files
# that you created yourself or otherwise trust.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

# Use the loaded model to make predictions
predictions = loaded_model.predict(X)

In [44]:
# One prediction per row of X passed to the reloaded model.
predictions.shape

(768,)