# KNeighborsClassifier With Train Test Split
-----------------

## Step 1: Import Required Modules

In [2]:
import os
import pandas as pd
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

## Step 2: Load `csv file` and Understand `X` and `y` Data

In [20]:
# Change Directory to location ,where we have csv file
os.chdir("C:\\Users\\Hi\\Google Drive\\01 DS ML DL NLP and AI With Python Lab Copy\\02 Lab Data\\Python")

In [21]:
# Load csv file into DataFrame
df = pd.read_csv("iris.csv")

In [22]:
# Get top 5 Rows
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [23]:
# Observe all the columns
df.columns

Index(['sepal_length', 'sepal_width', 'petal_length', 'petal_width',
       'species'],
      dtype='object')

In [24]:
# create a dataframe X with required input columns
X=df.loc[:,df.columns!="species"] 

In [25]:
type(X)

pandas.core.frame.DataFrame

In [26]:
X.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [27]:
y=df.species # create a series with target values

In [28]:
y.unique()

array(['setosa', 'versicolor', 'virginica'], dtype=object)

In [29]:
y.replace(['setosa', 'versicolor', 'virginica'],[0,1,2],inplace=True)

In [30]:
y.unique()

array([0, 1, 2], dtype=int64)

In [31]:
X=X.values # converting df to 2d Numpy Array

In [32]:
type(X)

numpy.ndarray

In [33]:
X.ndim

2

In [34]:
X[:5]

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2]])

In [35]:
y=y.values # converting Series to 1d Numpy Array

In [36]:
type(y)

numpy.ndarray

In [37]:
y.ndim

1

In [38]:
y[:5]

array([0, 0, 0, 0, 0], dtype=int64)

## Step 2: Load and understand Data(Not required as we took same data from csv)

In [0]:
iris = datasets.load_iris()

In [0]:
X=iris["data"]

In [0]:
y=iris["target"]

## Step 3: Split Data for training and testing

In [70]:
seed=42

In [71]:
train_test_split?

In [72]:
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y,
                                                    test_size=0.3, # train : 105 # test : 45
                                                    random_state=seed, # reproduce # seed
                                                    stratify=y) # input data ratio(50:50:50) = train data ratio(35:35:35) = test data ratio(15:15:15)

1. Did you understand stratify ? 

    `Must Read:`https://en.wikipedia.org/wiki/Stratified_sampling
2. Did you understand random_state?
3. What is Balanced Data ?

In [73]:
# Count of each class in orginal data
unique, counts = np.unique(y, return_counts=True)
dict(zip(unique, counts))

{0: 50, 1: 50, 2: 50}

In [74]:
# Count of each class in train sample data
unique, counts = np.unique(y_train, return_counts=True)
dict(zip(unique, counts))

{0: 35, 1: 35, 2: 35}

In [75]:
# Count of each class in test sample data
unique, counts = np.unique(y_test, return_counts=True)
dict(zip(unique, counts))

{0: 15, 1: 15, 2: 15}

## Step 4: Fit The Model

In [76]:
knn = KNeighborsClassifier(n_neighbors=8)

In [77]:
knn.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=8, p=2,
           weights='uniform')

## Step 5: Predict labels of test data

In [78]:
y_pred = knn.predict(X_test)

In [79]:
print("Test set predictions:\n {}".format(y_pred))

Test set predictions:
 [2 1 1 1 2 2 1 1 0 2 0 0 2 2 0 2 1 0 0 0 1 0 1 2 1 1 1 1 1 0 2 2 1 0 2 0 0
 0 0 1 1 0 1 2 1]


In [80]:
y_test

array([2, 1, 2, 1, 2, 2, 1, 1, 0, 2, 0, 0, 2, 2, 0, 2, 1, 0, 0, 0, 1, 0,
       1, 2, 2, 1, 1, 1, 1, 0, 2, 2, 1, 0, 2, 0, 0, 0, 0, 1, 1, 0, 2, 2,
       1], dtype=int64)

In [81]:
np.bincount(y_pred)

array([15, 18, 12], dtype=int64)

## Step 6: Accuracy

In [82]:
accuracy_score(y_test,y_pred)

0.9333333333333333

> **or**

In [83]:
print(knn.score(X_test, y_test))

0.9333333333333333


In [99]:
accuracy_score?

> In above exercise, How you know `n_neighbors=8`? do you have any way to find `Best Parameter` ???

## Step 7: Hyper Parameter Tuning 
1. Useing `our own code`
2. Useing `GridSearchCV`
3. Useing `RandomizedSearchCV`

### Method 1: Use your `own Code`

In [108]:
rs= []
for i in list(range(1,20)):
    knn = KNeighborsClassifier(n_neighbors=i)
    #print(knn)
    knn.fit(X, y)
    y_test_pred = knn.predict(X_test)
    rs.append((i,accuracy_score(y_test,y_test_pred)))    

In [105]:
rs[1]

[(1, 1.0),
 (2, 0.9555555555555556),
 (3, 0.9555555555555556),
 (4, 0.9555555555555556),
 (5, 0.9777777777777777),
 (6, 0.9777777777777777),
 (7, 0.9777777777777777),
 (8, 0.9777777777777777),
 (9, 0.9777777777777777),
 (10, 0.9555555555555556),
 (11, 0.9333333333333333),
 (12, 0.9555555555555556),
 (13, 0.9555555555555556),
 (14, 0.9777777777777777),
 (15, 0.9777777777777777),
 (16, 0.9777777777777777),
 (17, 0.9555555555555556),
 (18, 0.9333333333333333),
 (19, 0.9555555555555556)]

#### change multiple args

In [None]:
scores = [] # empty list
for i in range(1,11):
    for w in ['uniform','distance']:
        for a in ['ball_tree', 'kd_tree', 'brute']:
            knn_clf_obj=KNeighborsClassifier(n_neighbors=i,
                                             weights=w,
                                            algorithm=a)
            knn_clf_obj.fit(X_train,y_train)
            y_test_pred=knn_clf_obj.predict(X_test)
            scores.append((i,w,a,accuracy_score(y_test,y_test_pred)))
scores    

### Method 2: Use `GridSearchCV`

In [87]:
from sklearn.model_selection import GridSearchCV

In [88]:
GridSearchCV?

In [89]:
param_grid = {'n_neighbors': np.arange(1, 50)}

In [90]:
knn_cv = GridSearchCV(knn,param_grid,cv=5)

In [91]:
knn_cv.fit(X, 
           y)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=8, p=2,
           weights='uniform'),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'n_neighbors': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
       35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [92]:
knn_cv.best_params_

{'n_neighbors': 6}

In [93]:
knn_cv.best_score_

0.98

### Method 3: Use `RandomizedSearchCV`

In [94]:
from sklearn.model_selection import RandomizedSearchCV
RandomizedSearchCV?

In [95]:
knn_cv_rand = RandomizedSearchCV(knn,param_grid,random_state=seed,cv=5)

In [96]:
knn_cv_rand.fit(X,y)

RandomizedSearchCV(cv=5, error_score='raise-deprecating',
          estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=8, p=2,
           weights='uniform'),
          fit_params=None, iid='warn', n_iter=10, n_jobs=None,
          param_distributions={'n_neighbors': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
       35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49])},
          pre_dispatch='2*n_jobs', random_state=42, refit=True,
          return_train_score='warn', scoring=None, verbose=0)

In [97]:
knn_cv_rand.best_params_

{'n_neighbors': 14}

In [98]:
knn_cv_rand.best_score_

0.9666666666666667

`Home Work: `

**Exercise 1: In above code change param_grid code  and observe result**

```python 
param_grid = {'n_neighbors': np.arange(1, 50),
              'weights':['uniform','distance'],
              'algorithm':['ball_tree', 'kd_tree', 'brute'] 
             }
```

`Some Questions`
1. What is estimator ?
2. When to go for brute,kd tree and ball tree?
3. In KNN weights parameter has two values uniform and distance. What is the difference?
4. p=1 then what type of distance?
5. p=2 then what type of distance?

`Read Notes: `

https://www.kaggle.com/dkim1992/grid-search-vs-random-search