### Exercise: Machine Learning Finding Optimal Model and Hyperparameters
For the digits dataset in `sklearn.datasets`, try the following classifiers and find the one that gives the best performance. Also find the optimal parameters for that classifier.

* from sklearn import svm
* from sklearn.ensemble import RandomForestClassifier
* from sklearn.linear_model import LogisticRegression
* from sklearn.naive_bayes import GaussianNB
* from sklearn.naive_bayes import MultinomialNB
* from sklearn.tree import DecisionTreeClassifier

---

In [4]:
import pandas as pd

from sklearn import datasets

# Load scikit-learn's bundled digits dataset; later cells read its
# .data and .target attributes.
digits = datasets.load_digits()

In [5]:
# Inspect which attributes the loaded dataset object exposes.
dir(digits)

['DESCR', 'data', 'feature_names', 'frame', 'images', 'target', 'target_names']

In [6]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the samples for testing; the fixed random_state keeps the
# split identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    digits.data,
    digits.target,
    test_size=0.3,
    random_state=42,
)

In [7]:
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier

In [10]:
from sklearn.model_selection import GridSearchCV

# Warm-up: exhaustive search over 2 kernels x 2 values of C for a single
# SVC, scored with 5-fold cross-validation on the training split.
classifier = GridSearchCV(
    estimator=svm.SVC(),
    param_grid={
        'kernel': ['linear', 'rbf'],
        'C': [1, 10],
    },
    cv=5,
    return_train_score=True,
)
classifier.fit(X_train, y_train)

# Displayed as the cell output: the best-scoring parameter combination.
classifier.best_params_

{'C': 10, 'kernel': 'rbf'}

In [15]:
# Candidate estimators and the hyperparameter values to try for each one.
# Every entry maps a short label to:
#   'model'  - an unfitted estimator instance handed to the search object
#   'params' - the values to search (an empty dict means "defaults only")
#
# The tree-based estimators are randomised internally. They are seeded here so
# that repeated searches over the same candidates report the same score;
# without a seed the recorded random_forest / decision_tree scores drift
# between otherwise identical runs. The output tables stored in this notebook
# came from unseeded runs, so their last digits may not match a fresh run.
model_params = {
    'svm': {
        # gamma='auto' is set explicitly here, unlike the plain svm.SVC()
        # searched in the earlier stand-alone cell, so the two SVC searches
        # are not directly comparable.
        'model': svm.SVC(gamma='auto'),
        'params': {
            'kernel': ['linear', 'rbf'],
            'C': [1, 10, 20],
        },
    },
    'random_forest': {
        'model': RandomForestClassifier(random_state=42),
        'params': {
            'n_estimators': [1, 5, 10],
        },
    },
    'logistic_regression': {
        'model': LogisticRegression(solver='liblinear'),
        'params': {
            'C': [1, 5, 10],
        },
    },
    'naivebayes': {
        'model': GaussianNB(),
        'params': {},
    },
    'naive_bayes_multinomial': {
        'model': MultinomialNB(),
        'params': {},
    },
    'decision_tree': {
        'model': DecisionTreeClassifier(random_state=42),
        'params': {
            'criterion': ['gini', 'entropy'],
        },
    },
}

In [16]:
# Run an exhaustive grid search (5-fold cross-validation) for every entry in
# model_params and record each model's best parameters and best score.
scores = []
for model_name, mp in model_params.items():
    clf = GridSearchCV(
        estimator=mp['model'],
        param_grid=mp['params'],
        cv=5,
        return_train_score=True,
    )
    clf.fit(X_train, y_train)
    scores.append(dict(
        model=model_name,
        best_params=clf.best_params_,
        best_score=clf.best_score_,
    ))

# One row per model, columns in a fixed order for display.
grid_search_res = pd.DataFrame(
    scores,
    columns=['model', 'best_params', 'best_score'],
)
grid_search_res

Unnamed: 0,model,best_params,best_score
0,svm,"{'C': 1, 'kernel': 'linear'}",0.968175
1,random_forest,{'n_estimators': 10},0.928413
2,logistic_regression,{'C': 1},0.944315
3,naivebayes,{},0.829811
4,naive_bayes_multinomial,{},0.902147
5,decision_tree,{'criterion': 'entropy'},0.844052



For me, the winner is SVM (C=1, kernel='linear') with a best cross-validation score of about 96.8%. Your result could differ, because I restricted the search to only a few values for each parameter.

### Now with Randomized Search CV

In [19]:
from sklearn.model_selection import ParameterGrid, RandomizedSearchCV

# Repeat the model comparison with RandomizedSearchCV.
#
# Every search space in model_params is a small list-based grid (or empty),
# so the number of distinct candidates is known up front. RandomizedSearchCV's
# default n_iter=10 is larger than any of these grids, which makes
# scikit-learn emit a warning on every fit; capping n_iter at the grid size
# evaluates the same candidates without the warning.
scores = []
for model_name, mp in model_params.items():
    # Number of distinct parameter combinations for this model; an empty
    # 'params' dict counts as a single (default-settings) candidate.
    n_candidates = len(ParameterGrid(mp['params']))
    clf = RandomizedSearchCV(
        mp['model'],
        mp['params'],
        n_iter=min(10, n_candidates),
        cv=5,
        return_train_score=True,
        random_state=42,  # fixed seed so candidates are drawn reproducibly
    )
    clf.fit(X_train, y_train)
    scores.append({
        'model': model_name,
        'best_params': clf.best_params_,
        'best_score': clf.best_score_,
    })

# One row per model, same column layout as the grid-search summary.
random_search_res = pd.DataFrame(scores, columns=['model', 'best_params', 'best_score'])
random_search_res



Unnamed: 0,model,best_params,best_score
0,svm,"{'kernel': 'linear', 'C': 1}",0.968175
1,random_forest,{'n_estimators': 10},0.926826
2,logistic_regression,{'C': 1},0.944315
3,naivebayes,{},0.829811
4,naive_bayes_multinomial,{},0.902147
5,decision_tree,{'criterion': 'entropy'},0.842471


For me, the winner is again SVM (C=1, kernel='linear') with a best cross-validation score of about 96.8%. Your result could differ, because I restricted the search to only a few values for each parameter.
