# Mohit Anand Srivastava
# GO_STP_3020

# Task:=>

Train SVM classifier using sklearn digits dataset (i.e. from sklearn.datasets import load_digits) and then,
1. Measure accuracy of your model using different kernels such as rbf and linear.
2. Tune your model further using the regularization and gamma parameters and try to come up with the highest accuracy score
3. Use 80% of samples as training data size


Task Link:-[GoEduHub Machine Learning With Python Summer Training](https://www.goeduhub.com/11639/support-vector-machine-classification-python-sklearn-dataset)

# SK-Learn Digits Datasets With SVM

## Importing Important Libraries

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found in it
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Data Preprocessing
import pandas as pd 

# Data Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Load sklearn digits dataset
from sklearn.datasets import load_digits

# import train test split
from sklearn.model_selection import train_test_split

# Feature Scaling
from sklearn.preprocessing import StandardScaler

# Machine Learning
from sklearn.svm import SVC

# Confusion Matrix and Classification Reports
from sklearn.metrics import confusion_matrix,classification_report


# Hyperparameter tuning
from sklearn.model_selection import GridSearchCV

### Avoid warnings

In [None]:
import warnings
# Install a catch-all filter so that no warning is shown for the rest of the session
warnings.filterwarnings(action='ignore')

## Load the dataset and collect information from it

In [None]:
# Load the scikit-learn digits dataset; its data, target, target_names, images and
# feature_names attributes are used in the cells below.
digits=load_digits() 

In [None]:
# Dimensions of the feature array
digits.data.shape

In [None]:
# Display the raw feature values
digits.data

In [None]:
# Display the label array (one entry per sample)
digits.target

In [None]:
# Display the names of the target classes
digits.target_names

In [None]:
# Label of sample 21 (the same sample is drawn as an image in the next cell)
digits.target[21]

In [None]:
# Draw sample 21 as a grayscale image.
# The colormap is passed straight to matshow: calling plt.gray() beforehand asks pyplot for
# the current figure, which opens an extra, empty figure before matshow creates its own.
plt.matshow(digits.images[21], cmap='gray')
plt.show()

In [None]:
# Draw the first ten samples, one grayscale figure per sample.
# The colormap is given to matshow directly instead of calling plt.gray() on every
# iteration: that call is loop-invariant and, with no figure open, it creates an empty
# figure each time before matshow opens the real one.
for image in digits.images[:10]:
    plt.matshow(image, cmap='gray')
    plt.show()

### Convert datasets into Pandas DataFrame

In [None]:
# Tabular view of the dataset: one column per feature name, plus the label as 'target'
df = pd.DataFrame(
    data=digits.data,
    columns=digits.feature_names,
).assign(target=digits.target)

## Data Preprocessing

In [None]:
# Preview the first rows of the frame (feature columns plus 'target')
df.head() 

In [None]:
# Print the frame summary: columns, non-null counts, dtypes and memory usage
df.info()

## Data Visualization

In [None]:
# Pairwise relationship grid over the columns from position 55 onward
# (this slice includes the 'target' column used for colouring)
trailing_columns = df.iloc[:, 55:]
sns.pairplot(data=trailing_columns, hue='target', palette='deep')

In [None]:
# Ten colours and ten marker styles; both lists are indexed below by a digit label.
colors = ['red','blue','green','yellow','black','skyblue','pink','orange','lime','#eb9605']
markers = ['p','D',',','v','^','<','h','d','H','+']
# Plot 'pixel_7_2' against each column found at positions 40-62 of the frame once
# 'pixel_7_2' itself has been removed; one joint figure is created per column.
for i,v in enumerate(df.drop(['pixel_7_2'],axis=1).columns[40:63]):
    sns.set (style="darkgrid")
    g=sns.jointplot(
        x=v,
        y='pixel_7_2',
        data=df,
        hue='target',
        # NOTE(review): i is the loop position of the column, so df.target[i] is the label
        # of ROW i - the colour/marker picked here is unrelated to the column being plotted.
        # With hue and palette also given, seaborn may not honour `color` at all - confirm.
        color=colors[df.target[i]],
        marker=markers[df.target[i]],
        s=100, 
        palette='husl'
    )
    # Add density contours behind the points (zorder=0), then violin plots on the margins.
    g.plot_joint(sns.kdeplot, color="r", zorder=0, levels=6)
    g.plot_marginals(sns.violinplot,palette='hls', clip_on=False)

In [None]:
# Number of samples per digit label
counts = df['target'].value_counts()
counts

In [None]:
# Bar chart of how many samples each digit label has
sns.set(style='darkgrid')
count_axes = sns.countplot(x=df['target'], saturation=1, palette='husl')
count_axes.set_title('CountPlot')
plt.show()

## Dividing Data into x and y & Split into train and test data

In [None]:
# Feature matrix: every column except 'target'. The label was appended to df earlier, so
# using df as-is would hand the classifier the very value it is asked to predict
# (label leakage) and make every accuracy figure below meaningless.
x=df.drop(['target'],axis=1)
# Labels: the class of each sample.
y=digits.target

In [None]:
# Hold out 20% of the samples for testing; the fixed seed makes the split reproducible.
xtrain,xtest,ytrain,ytest=train_test_split(x,y,test_size=.2,random_state=42)

In [None]:
# Check the sizes of the four pieces produced by the split
xtrain.shape, xtest.shape , ytrain.shape, ytest.shape

## Creating Model for SVC() and train the model

In [None]:
# Support-vector classifier with scikit-learn's default hyper-parameters
svm=SVC()

In [None]:
# Train on the training split, then report mean accuracy on the held-out split
# (fit returns the estimator itself, so the two calls can be chained)
svm.fit(xtrain, ytrain).score(xtest, ytest)

### Finding the best random state value for train_test_split

In [None]:
# Repeat the 80/20 split with seeds 1..49, fit a default SVC on each split and record its
# test accuracy, then report the seed that produced the highest score.
# NOTE(review): choosing the seed by test-set accuracy picks a favourable split, so the
# "best accuracy" printed here is optimistic; cross-validation gives a fairer estimate.
acc=[]
for i in range(1,50):
    xtrain,xtest,ytrain,ytest=train_test_split(x,y,test_size=.2,random_state=i)
    svm=SVC()
    svm.fit(xtrain,ytrain)
    # Score once and reuse the value instead of evaluating the test split twice.
    score=svm.score(xtest,ytest)
    print(f"value={i} and accuracy={score}")
    acc.append(score)
maxAcc=max(acc)
# Seeds start at 1 while list positions start at 0, hence the +1
# (index() returns the first seed that reached the maximum).
random_state=acc.index(maxAcc)+1
print("=============================================================")
print("=============================================================")
print(f"Best value={random_state} and Best accuracy={maxAcc}")
print("=============================================================")
print("=============================================================")

In [None]:
# Re-create the split with the selected seed. The test fraction stays at 0.2: the seed was
# chosen on 80/20 splits, and the task asks for 80% of the samples as training data.
xtrain,xtest,ytrain,ytest=train_test_split(x,y,test_size=.2,random_state=random_state)

In [None]:
# Fresh default classifier trained on the current training split
svm = SVC().fit(xtrain, ytrain)
# Mean accuracy on the held-out split
svm.score(xtest, ytest)

In [None]:
# Predict a label for every held-out sample
pred=svm.predict(xtest)
# First 20 predictions, for a side-by-side look with the true labels in the next cell
pred[:20]

In [None]:
# First 20 true labels of the held-out split
ytest[:20]

## Confusion Matrix and Classification Report

In [None]:
# Heading followed by the confusion matrix of the test predictions (true labels first)
baseline_matrix = confusion_matrix(ytest, pred)
print("Confusion matrix", baseline_matrix, sep="\n")

In [None]:
# Heading followed by the classification report for the test predictions
baseline_report = classification_report(ytest, pred)
print("EVALUATION ON TESTING DATA", baseline_report, sep="\n")

## Hyper Parameters Tuning

In [None]:
# Candidate values for the regularisation parameter C and the kernel coefficient gamma.
C= [0.1, 1, 10, 100, 1000]
gamma= [1, 0.1, 0.01, 0.001, 0.0001]
# The task asks to compare the rbf and linear kernels, so both are searched.
kernel= ['rbf', 'linear']
# defining parameter range
# One sub-grid per kernel: gamma has no effect on the linear kernel, so that kernel is
# searched over C alone instead of repeating identical fits for every gamma value.
param_grid = [
    {'kernel': ['rbf'], 'C': C, 'gamma': gamma},
    {'kernel': ['linear'], 'C': C},
]

# refit=True retrains the best configuration on the whole training split after the search,
# so `grid` can be used directly for prediction; verbose=3 logs every fit.
grid = GridSearchCV(SVC(), param_grid, refit = True, verbose = 3)

# fitting the model for grid search
grid.fit(xtrain, ytrain)

In [None]:
# Report the outcome of the search in order: winning parameter combination,
# the estimator built from it, and its mean cross-validated score
for attribute in ('best_params_', 'best_estimator_', 'best_score_'):
    print(getattr(grid, attribute))


# Predict the held-out samples with the best model found by the search
pred = grid.predict(xtest)

# print confusion matrix
print(confusion_matrix(ytest, pred))

In [None]:
# Print the classification report for the tuned model's predictions on the held-out split
print(classification_report(ytest, pred))