### Import all the necessary library

In [23]:
import numpy as np
import pandas  as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier


### Load the provided Dataset

In [24]:
df=pd.read_csv('Coded_Data.csv')


In [25]:
df

Unnamed: 0.1,Unnamed: 0,Cd_1,Cd_2,Cd_3,Cd_4,Cd_5,Cd_6,Cd_7,Cd_8,Cd_9,Cd_10,Result
0,1,53.1,63.4,33.0,46.2,47.3,21.2,44.3,36.1,46.6,65.0,Y
1,2,36.9,54.7,31.1,50.5,56.0,38.9,39.4,56.8,33.0,78.8,N
2,3,41.9,65.5,53.5,52.3,92.5,43.2,94.9,64.7,50.8,67.9,N
3,4,71.7,75.6,37.9,50.5,69.2,52.5,82.3,77.3,80.8,60.9,Y
4,5,74.3,51.8,36.4,40.9,74.7,42.2,65.1,36.2,77.6,74.9,Y
...,...,...,...,...,...,...,...,...,...,...,...,...
995,996,58.7,56.4,49.5,38.1,62.8,35.2,43.6,17.9,59.3,71.2,Y
996,997,33.4,52.1,54.7,48.5,85.7,76.2,61.6,40.0,50.8,87.8,N
997,998,66.0,53.6,45.4,56.1,54.6,53.1,22.6,21.8,48.8,73.2,Y
998,999,63.0,47.0,23.6,40.7,97.5,56.6,50.0,59.4,67.7,62.7,Y


### In our dataframe, the first column looks like the index, let's set `index_col = 0`

In [26]:
df=pd.read_csv('Coded_Data.csv',index_col=0)


In [27]:
df.head()

Unnamed: 0,Cd_1,Cd_2,Cd_3,Cd_4,Cd_5,Cd_6,Cd_7,Cd_8,Cd_9,Cd_10,Result
1,53.1,63.4,33.0,46.2,47.3,21.2,44.3,36.1,46.6,65.0,Y
2,36.9,54.7,31.1,50.5,56.0,38.9,39.4,56.8,33.0,78.8,N
3,41.9,65.5,53.5,52.3,92.5,43.2,94.9,64.7,50.8,67.9,N
4,71.7,75.6,37.9,50.5,69.2,52.5,82.3,77.3,80.8,60.9,Y
5,74.3,51.8,36.4,40.9,74.7,42.2,65.1,36.2,77.6,74.9,Y


Great, this looks fine now!<br>

Let's overview the data, we can use `info()` here!

In [28]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1000 entries, 1 to 1000
Data columns (total 11 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Cd_1    1000 non-null   float64
 1   Cd_2    1000 non-null   float64
 2   Cd_3    1000 non-null   float64
 3   Cd_4    1000 non-null   float64
 4   Cd_5    1000 non-null   float64
 5   Cd_6    1000 non-null   float64
 6   Cd_7    1000 non-null   float64
 7   Cd_8    1000 non-null   float64
 8   Cd_9    1000 non-null   float64
 9   Cd_10   1000 non-null   float64
 10  Result  1000 non-null   object 
dtypes: float64(10), object(1)
memory usage: 93.8+ KB


Let's split the data so that we can fit the `scaler` to the features only!

In [29]:
x=df.drop('Result',axis=1)
y=df['Result']

Let's fit `scaler` to the features now!

In [30]:
model=StandardScaler()
model.fit(x)

So, we have the fitted features to the `scaler` object. We will use this `scaler` object to transform all the features using `.transform()` method in `Scikit-learn` to do the `standardization` job by centering and scaling.<br>
Let's pass the features to `scaler.transform()` to get standardized features in `scaled_features`!

In [31]:
scaled_features=model.transform(x)
scaled_features

array([[-0.12252539,  0.1875694 , -0.91183199, ..., -1.48006982,
        -0.95256187, -0.64536551],
       [-1.08602779, -0.43340316, -1.02415132, ..., -0.20055606,
        -1.82621843,  0.6351032 ],
       [-0.78865051,  0.33745933,  0.30003449, ...,  0.28776079,
        -0.68275617, -0.3762815 ],
       ...,
       [ 0.64470801, -0.51191693, -0.17880055, ..., -2.36398512,
        -0.81123508,  0.11549271],
       [ 0.46628164, -0.98299956, -1.46751711, ..., -0.03984418,
         0.40289057, -0.85877696],
       [-0.39016495, -0.59756832, -1.43204784, ..., -0.56524838,
         0.33865112,  0.01342636]])

In [32]:
scaled_features

array([[-0.12252539,  0.1875694 , -0.91183199, ..., -1.48006982,
        -0.95256187, -0.64536551],
       [-1.08602779, -0.43340316, -1.02415132, ..., -0.20055606,
        -1.82621843,  0.6351032 ],
       [-0.78865051,  0.33745933,  0.30003449, ...,  0.28776079,
        -0.68275617, -0.3762815 ],
       ...,
       [ 0.64470801, -0.51191693, -0.17880055, ..., -2.36398512,
        -0.81123508,  0.11549271],
       [ 0.46628164, -0.98299956, -1.46751711, ..., -0.03984418,
         0.40289057, -0.85877696],
       [-0.39016495, -0.59756832, -1.43204784, ..., -0.56524838,
         0.33865112,  0.01342636]])

In [33]:
scaled_features.shape

(1000, 10)

`scaled_features` is a NumPy array, Let's convert this into the pandas `DataFrame`!<br>
We can use our `df.columns` to get the columns name and pass that to `DatFrame()` along with `scaled_features`, note we don't need `Results` column, [:-1] will work *(everything but the last one)*!

In [34]:
#Before Scaling
df

Unnamed: 0,Cd_1,Cd_2,Cd_3,Cd_4,Cd_5,Cd_6,Cd_7,Cd_8,Cd_9,Cd_10,Result
1,53.1,63.4,33.0,46.2,47.3,21.2,44.3,36.1,46.6,65.0,Y
2,36.9,54.7,31.1,50.5,56.0,38.9,39.4,56.8,33.0,78.8,N
3,41.9,65.5,53.5,52.3,92.5,43.2,94.9,64.7,50.8,67.9,N
4,71.7,75.6,37.9,50.5,69.2,52.5,82.3,77.3,80.8,60.9,Y
5,74.3,51.8,36.4,40.9,74.7,42.2,65.1,36.2,77.6,74.9,Y
...,...,...,...,...,...,...,...,...,...,...,...
996,58.7,56.4,49.5,38.1,62.8,35.2,43.6,17.9,59.3,71.2,Y
997,33.4,52.1,54.7,48.5,85.7,76.2,61.6,40.0,50.8,87.8,N
998,66.0,53.6,45.4,56.1,54.6,53.1,22.6,21.8,48.8,73.2,Y
999,63.0,47.0,23.6,40.7,97.5,56.6,50.0,59.4,67.7,62.7,Y


In [35]:
x.columns

Index(['Cd_1', 'Cd_2', 'Cd_3', 'Cd_4', 'Cd_5', 'Cd_6', 'Cd_7', 'Cd_8', 'Cd_9',
       'Cd_10'],
      dtype='object')

### Create the below output

In [36]:
df_scaled=pd.DataFrame(scaled_features,columns=x.columns,index=x.index)
df_scaled

Unnamed: 0,Cd_1,Cd_2,Cd_3,Cd_4,Cd_5,Cd_6,Cd_7,Cd_8,Cd_9,Cd_10
1,-0.122525,0.187569,-0.911832,0.318653,-1.035516,-2.305940,-0.801865,-1.480070,-0.952562,-0.645366
2,-1.086028,-0.433403,-1.024151,0.624941,-0.445471,-1.153296,-1.131088,-0.200556,-1.826218,0.635103
3,-0.788651,0.337459,0.300034,0.753154,2.030007,-0.873275,2.597862,0.287761,-0.682756,-0.376282
4,0.983718,1.058359,-0.622166,0.624941,0.449771,-0.267648,1.751290,1.066595,1.244427,-1.025795
5,1.138354,-0.640394,-0.710839,-0.058864,0.822788,-0.938396,0.595651,-1.473889,1.038861,0.273232
...,...,...,...,...,...,...,...,...,...,...
996,0.210537,-0.312064,0.063573,-0.258307,0.015714,-1.394244,-0.848897,-2.605053,-0.136721,-0.070082
997,-1.294192,-0.618981,0.370973,0.482481,1.568822,1.275723,0.360492,-1.239002,-0.682756,1.470191
998,0.644708,-0.511917,-0.178801,1.023827,-0.540420,-0.228575,-2.259851,-2.363985,-0.811235,0.115493
999,0.466282,-0.983000,-1.467517,-0.073110,2.369113,-0.000651,-0.418892,-0.039844,0.402891,-0.858777


In [37]:
#After Scaling

<br>Our data is ready for the Machine Learning part now!
## Let's do the train_test split
I am sure, you are very comfortable with this now!

In [38]:
x_train,x_test,y_train,y_test=train_test_split(df_scaled,y,test_size=0.3,random_state=42)

In [39]:
y_train

Unnamed: 0,Result
542,Y
441,N
483,N
423,Y
779,N
...,...
107,N
271,Y
861,N
436,Y


In [40]:
x_test

Unnamed: 0,Cd_1,Cd_2,Cd_3,Cd_4,Cd_5,Cd_6,Cd_7,Cd_8,Cd_9,Cd_10
522,-0.842178,1.586542,1.015331,-1.212785,-0.553985,-0.111357,0.420962,0.497922,-0.406527,0.403134
738,1.382204,0.415973,-0.438908,0.575080,-1.489919,-1.062126,-0.848897,-1.838581,0.929654,-0.933007
741,1.269200,0.130468,0.719754,0.959720,-0.364085,0.644048,-2.548761,-0.905216,-0.419374,1.089762
661,0.912348,-0.433403,-1.313817,1.038073,-0.940566,-1.081662,-1.809690,0.572097,0.312955,-0.014410
412,-0.015470,0.737166,0.323681,-0.094479,-0.242007,0.689633,0.508306,-0.206737,-0.181688,2.463019
...,...,...,...,...,...,...,...,...,...,...
469,-0.681595,-1.518321,0.087219,-0.165709,-0.676063,-0.423939,-1.278902,-2.431979,0.640577,-1.053631
936,0.038058,-0.419128,-1.041886,-0.664316,-1.279673,-0.860250,-0.129982,-0.849585,0.839719,0.430971
429,-0.842178,0.837092,0.057661,0.981089,-0.228442,0.670097,2.261921,0.683359,-0.027514,1.219665
8,-0.461535,-0.097935,0.211361,-1.910835,-0.364085,0.396588,0.716590,0.936790,0.229444,0.310347


## KNN
Our focus is to come up with a model that can predict the class in `Result` for the new data point. We don't know what k number will work best, let's start with k = 1 at the moment. <br>
We need to import the `KNeighborsClassifier` from `sklearn.neighbors`.

In [41]:
knn=KNeighborsClassifier(n_neighbors=1)
knn.fit(x_train,y_train)

Let's show the score now!

In [42]:
knn.score(x_test,y_test)

0.9333333333333333