# Classify Raisins with Hyperparameter Tuning Project

- [View Solution Notebook](./solution.html)
- [View Project Page](https://www.codecademy.com/projects/practice/mle-hyperparameter-tuning-project)

### 1. Explore the Dataset

In [2]:
# 1. Setup
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

raisins = pd.read_csv('Raisin_Dataset.csv')
raisins.head()

Unnamed: 0,Area,MajorAxisLength,MinorAxisLength,Eccentricity,ConvexArea,Extent,Perimeter,Class
0,87524,442.246011,253.291155,0.819738,90546,0.758651,1184.04,0
1,75166,406.690687,243.032436,0.801805,78789,0.68413,1121.786,0
2,90856,442.267048,266.328318,0.798354,93717,0.637613,1208.575,0
3,45928,286.540559,208.760042,0.684989,47336,0.699599,844.162,0
4,79408,352.19077,290.827533,0.564011,81463,0.792772,1073.251,0


In [6]:
# 2. Create predictor and target variables, X and y
X = raisins.drop(columns=['Class'])
y = raisins['Class']

In [9]:
# 3. Examine the dataset
print(raisins.shape)
print(raisins['Class'].value_counts())

(900, 8)
0    450
1    450
Name: Class, dtype: int64


In [10]:
# 4. Split the data set into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=19)

### 2. Grid Search with Decision Tree Classifier

In [12]:
# 5. Create a Decision Tree model
tree = DecisionTreeClassifier()

In [13]:
# 6. Dictionary of parameters for GridSearchCV
parameters = {'max_depth': [3, 5, 7],
             'min_samples_split': [2, 3, 4]}

In [14]:
# 7. Create a GridSearchCV model
grid = GridSearchCV(tree, parameters)

# Fit the GridSearchCV model to the training data
grid.fit(X_train, y_train)


In [15]:
# 8. Print the model and hyperparameters obtained by GridSearchCV
print(grid.best_estimator_)

# Print best score
print(grid.best_score_)

# Print the accuracy of the final model on the test data
print(grid.score(X_test, y_test))


DecisionTreeClassifier(max_depth=3, min_samples_split=3)
0.8541666666666667
0.8555555555555555


In [18]:
# 9. Print a table summarizing the results of GridSearchCV
hyperparameter_grid = pd.DataFrame(grid.cv_results_['params'])
grid_scores = pd.DataFrame(grid.cv_results_['mean_test_score'])
results_df = pd.concat([hyperparameter_grid, grid_scores], axis=1)
results_df

Unnamed: 0,max_depth,min_samples_split,0
0,3,2,0.852778
1,3,3,0.854167
2,3,4,0.854167
3,5,2,0.843056
4,5,3,0.843056
5,5,4,0.843056
6,7,2,0.820833
7,7,3,0.823611
8,7,4,0.822222


### 2. Random Search with Logistic Regression

In [19]:
# 10. The logistic regression model
lr = LogisticRegression(solver='liblinear', max_iter=1000)

In [21]:
# 11. Define distributions to choose hyperparameters from
from scipy.stats import uniform
distributions = {'penalty': ['l1', 'l2'],
                'C': uniform(loc=0, scale=100)}

In [22]:
# 12. Create a RandomizedSearchCV model
clf = RandomizedSearchCV(lr, distributions, n_iter=8)

# Fit the random search model
clf.fit(X_train, y_train)

In [23]:
# 13. Print best esimator and best score
print(clf.best_estimator_)
# Print a table summarizing the results of RandomSearchCV
hyperparameter_grid = pd.DataFrame(clf.cv_results_['params'])
randomsearch_scores = pd.DataFrame(clf.cv_results_['mean_test_score'], columns=['score'])
results_df = pd.concat([hyperparameter_grid, randomsearch_scores], axis=1)
results_df

LogisticRegression(C=10.122731456228173, max_iter=1000, solver='liblinear')


Unnamed: 0,C,penalty,score
0,10.122731,l2,0.870833
1,28.373494,l2,0.870833
2,22.401364,l1,0.866667
3,77.838777,l1,0.8625
4,28.174797,l1,0.869444
5,49.955757,l1,0.8625
6,39.226672,l2,0.870833
7,90.271851,l2,0.869444
