#### 1. Importing Libraries

In [1]:
import wikipedia
import pandas as pd
import numpy as np

#### 2. Collecting Data

In [2]:
# Creating a dataset of 5 people each from 6 different Professional Fields

names = ["Virat Kohli", "Rohit Sharma", "MS Dhoni", "Hardik Pandya", "KL Rahul", "Shikhar Dhawan", 
         "Ravindra Jadeja", "Gundappa Viswanath", "Irfan Pathan", "Javagal Srinath", "Anil Kumble",
         "Sachin Tendulkar", "Kapil Dev", "Ravi Shastri", "Surinder Amarnath",
         "Narendra Modi", "Arun Jaitley", "Sonia Gandhi", "Mamata Banerjee", "Akhilesh Yadav", "Yogi Adityanath",
         "Naveen Patnaik", "N. Biren Singh", "Conrad Sangma", "Nitish Kumar", "Pema Khandu", 
         "Srinivasa Ramanujan", "Brahmagupta", "D. R. Kaprekar", "C. R. Rao", "Aryabhata", "Kiran Kedlaya", 
         "Kannan Soundararajan", "Umesh Vazirani", "Vinod Johri", "Pranab K. Sen", "C. P. Ramanujam", 
         "K. S. Chandrasekharan", 
         "Prasanta Chandra Mahalanobis", "Debabrata Basu", "Jayanta Kumar Ghosh", "Kantilal Mardia", 
         "K. C. Sreedharan Pillai", "Gopinath Kallianpur", "Nairanjana Dasgupta", "Susmita Datta", "Nalini Ravishanker", 
         "A. M. Mathai", "Amarjot Kaur", 
         "Sania Mirza", "Vijay Amritraj", "Rohan Bopanna", "Mahesh Bhupathi", "Leander Paes", "P. V. Sindhu",
         "Jyoti Randhawa", "Dhanraj Pillay", "Pullela Gopichand", "Mary Kom", "Karnam Malleswari", "Saina Nehwal", 
         "Abhinav Bindra", "Viswanathan Anand", 
         "Amitabh Bachchan", "Shah Rukh Khan", "Aishwarya Rai", "Deepika Padukone", "Ranveer Singh", "Zeenat Aman",
         "Neetu Singh", "Hema Malini", "Sharmila Tagore", "Varun Dhawan", "Vicky Kaushal", "Ali Fazal", "Ashutosh Rana",
         "Ajay Devgn", "Ayushmann Khurrana", "Sonakshi Sinha", "Parineeti Chopra", "Taapsee Pannu", "Kriti Sanon",
         "Bhumi Pednekar", "Sanya Malhotra"]

# Group Number by Field
# 1 : 'Cricket', 2 : 'Politics', 3 : 'Mathematics', 4 : 'Statistics', 5 : 'Sports', 6 : 'Acting'

group = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
         3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
         4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
         5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
         6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6]

#### 3. Retrieve info from Wikipedia

In [3]:
# Wikipedia Links for eack celebrity
wiki = []
for i in names:
    wiki.append(wikipedia.page(i))

In [4]:
# Populating column:Field, based on the Group Number

# Create a dictionary for Group and Field mapping
dict_Prof = {1 : 'Cricket', 2 : 'Politics', 3 : 'Mathematics', 4 : 'Statistics', 5 : 'Sports', 6 : 'Acting', '' : 'NaN'} 

# Define a function to map the values 
def set_value(grp, dict_Prof):
    return dict_Prof[grp] 


In [5]:
# new datafranme
df_Indians = pd.DataFrame()    

# pass info into single dataframe
df_Indians['Names'] = names
df_Indians['Wiki Page'] = wiki
df_Indians['Group'] = group

# populate the Field of Profession based on Group Numner
df_Indians['Field'] = df_Indians['Group'].apply(set_value, args = (dict_Prof, )) 

# display
df_Indians

Unnamed: 0,Names,Wiki Page,Group,Field
0,Virat Kohli,<WikipediaPage 'Virat Kohli'>,1,Cricket
1,Rohit Sharma,<WikipediaPage 'Rohit Sharma'>,1,Cricket
2,MS Dhoni,<WikipediaPage 'MS Dhoni'>,1,Cricket
3,Hardik Pandya,<WikipediaPage 'Hardik Pandya'>,1,Cricket
4,KL Rahul,<WikipediaPage 'K. L. Rahul'>,1,Cricket
5,Shikhar Dhawan,<WikipediaPage 'Shikhar Dhawan'>,1,Cricket
6,Ravindra Jadeja,<WikipediaPage 'Ravindra Jadeja'>,1,Cricket
7,Gundappa Viswanath,<WikipediaPage 'Gundappa Viswanath'>,1,Cricket
8,Irfan Pathan,<WikipediaPage 'Irfan Pathan'>,1,Cricket
9,Javagal Srinath,<WikipediaPage 'Javagal Srinath'>,1,Cricket


#### 4. Creating Counting Vectors and a New Dataframe for storing them

In [6]:
# Counting Vectors

from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()

X = []
for i in df_Indians['Wiki Page']:
    X.append(i.content)
    
# a matrix of token counts
X = np.array(cv.fit_transform(X).toarray())
X

array([[ 4, 31,  0, ...,  0,  0,  0],
       [ 0,  3,  0, ...,  0,  0,  0],
       [ 1,  9,  0, ...,  0,  0,  0],
       ...,
       [ 0,  0,  0, ...,  0,  0,  0],
       [ 0,  0,  0, ...,  0,  0,  0],
       [ 0,  1,  0, ...,  0,  0,  0]], dtype=int64)

In [7]:
X.shape

(84, 16640)

In [8]:
# Create dataframe of the counting vectors
df_X = pd.DataFrame(X)

In [9]:
# copy Names & Groups to the X dataframe
df_X['Names'] = df_Indians['Names']
df_X['Group'] = df_Indians['Group']

df_X

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16632,16633,16634,16635,16636,16637,16638,16639,Names,Group
0,4,31,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,Virat Kohli,1
1,0,3,0,0,3,0,0,0,0,0,...,0,0,0,0,0,0,0,0,Rohit Sharma,1
2,1,9,0,0,0,0,0,0,0,2,...,0,0,0,0,0,0,0,0,MS Dhoni,1
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,Hardik Pandya,1
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,KL Rahul,1
5,1,2,0,0,0,0,0,0,0,2,...,0,0,0,0,0,0,0,0,Shikhar Dhawan,1
6,0,2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,Ravindra Jadeja,1
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,Gundappa Viswanath,1
8,34,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,1,1,Irfan Pathan,1
9,8,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,Javagal Srinath,1


In [10]:
# sort the dataframe by column: Names
sorted_data = df_X.sort_values('Names')

sorted_data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16632,16633,16634,16635,16636,16637,16638,16639,Names,Group
47,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,A. M. Mathai,4
61,0,14,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,Abhinav Bindra,5
65,0,2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,Aishwarya Rai,6
76,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,Ajay Devgn,6
19,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,Akhilesh Yadav,2
74,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,Ali Fazal,6
48,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,Amarjot Kaur,4
63,0,3,0,1,0,1,1,0,3,0,...,0,0,0,0,0,0,0,0,Amitabh Bachchan,6
10,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,Anil Kumble,1
16,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,Arun Jaitley,2


#### 5. Define Control and Response Variables

In [11]:
# define predicted variable
y = sorted_data['Group']


# define predictor variable
x = sorted_data.iloc[:, :-2]

#### 6. Train-Test Split

In [12]:
# stratified sampling to include observations from each group in the test partition

from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y,            # predictor and predicted variables
                                                    test_size=0.40,  # 60:40 train-test split
                                                    random_state=0,  # seed =0
                                                    stratify=y)      # startified sampling = yes

In [13]:
x_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16630,16631,16632,16633,16634,16635,16636,16637,16638,16639
36,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
65,0,2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,3,0,0,3,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
56,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
67,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
x_test.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16630,16631,16632,16633,16634,16635,16636,16637,16638,16639
60,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
24,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
53,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
49,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
69,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
y_train.head()

36    3
65    6
1     1
56    5
67    6
Name: Group, dtype: int64

In [16]:
y_test.head()

60    5
24    2
53    5
49    5
69    6
Name: Group, dtype: int64

#### 7. Defining the Clustering Algos

In [17]:
def cosine(x, y):
    return np.dot(x, y) / (np.sqrt(np.dot(x, x)) * np.sqrt(np.dot(y, y)))

def euclidean(x, y):
    return np.sqrt(np.sum((x - y) ** 2))

####  8. Implementing a 5-NN Clustering Algorithm on Train Data

In [18]:
from sklearn.neighbors import KNeighborsClassifier

# n_neighbors : number of neighbors that will vote for the class of the target point; default = 5. 
# weights = 'distance': points nearest to the target point have greater influence than those who are farther away. 
# algorithm = 'auto': leaves it to algorithm to make best choice among algos for selecting the indexing data structure.
# metric : how distances are calculated in space i.e euclidean distance / cosine similarity

In [19]:
def Eucl_Classifier(k, x_train, y_train):
    # define KNN Classifier for euclidean metric
    Eucld_classifier = KNeighborsClassifier(n_neighbors = k, algorithm = 'auto', metric = 'euclidean')
    
    # fit model on train data
    Eucld_classifier.fit(x_train, y_train)
    
    # predict for train data    
    eucl_y_pred = Eucld_classifier.predict(x_train)
    return(eucl_y_pred)

In [20]:
def Cos_Classifier(k, x_train, y_train):
    # define KNN Classifier for euclidean metric
    cosine_classifier = KNeighborsClassifier(n_neighbors = k, algorithm = 'auto', metric = 'cosine')
    
    # fit model on train data
    cosine_classifier.fit(x_train, y_train)
    
    # predict for train data
    cosine_y_pred = cosine_classifier.predict(x_train)
    return(cosine_y_pred)

#### 9. Predictions on Train Data

In [21]:
# dataframe for Train Predictions
df_Pred = pd.DataFrame()

df_Pred["Act_Groups"] = y_train
df_Pred["Names"] = df_X.Names

#  reorder columns
cols = ["Names", "Act_Groups"]
df_Pred = df_Pred[cols]    
df_Pred['Act_Prof'] = df_Pred['Act_Groups'].apply(set_value, args = (dict_Prof, )) 

eucl_y_pred3 = Eucl_Classifier(3, x_train, y_train)
df_Pred["Eucld_Pred_3"] = eucl_y_pred3
df_Pred['Eucld_Prof_3'] = df_Pred['Eucld_Pred_3'].apply(set_value, args = (dict_Prof, )) 

cosine_y_pred3 = Cos_Classifier(3, x_train, y_train)
df_Pred["Cosine_Pred_3"] = cosine_y_pred3
df_Pred['Cosine_Prof_3'] = df_Pred['Cosine_Pred_3'].apply(set_value, args = (dict_Prof, )) 

eucl_y_pred4 = Eucl_Classifier(4, x_train, y_train)
df_Pred["Eucld_Pred_4"] = eucl_y_pred4
df_Pred['Eucld_Prof_4'] = df_Pred['Eucld_Pred_4'].apply(set_value, args = (dict_Prof, )) 

cosine_y_pred4 = Cos_Classifier(4, x_train, y_train)
df_Pred["Cosine_Pred_4"] = cosine_y_pred4
df_Pred['Cosine_Prof_4'] = df_Pred['Cosine_Pred_4'].apply(set_value, args = (dict_Prof, )) 


eucl_y_pred5 = Eucl_Classifier(5, x_train, y_train)
df_Pred["Eucld_Pred_5"] = eucl_y_pred5
df_Pred['Eucld_Prof_5'] = df_Pred['Eucld_Pred_5'].apply(set_value, args = (dict_Prof, )) 

cosine_y_pred5 = Cos_Classifier(5, x_train, y_train)
df_Pred["Cosine_Pred_5"] = cosine_y_pred5
df_Pred['Cosine_Prof_5'] = df_Pred['Cosine_Pred_5'].apply(set_value, args = (dict_Prof, )) 


eucl_y_pred6 = Eucl_Classifier(6, x_train, y_train)
df_Pred["Eucld_Pred_6"] = eucl_y_pred6
df_Pred['Eucld_Prof_6'] = df_Pred['Eucld_Pred_6'].apply(set_value, args = (dict_Prof, )) 

cosine_y_pred6 = Cos_Classifier(6, x_train, y_train)
df_Pred["Cosine_Pred_6"] = cosine_y_pred6
df_Pred['Cosine_Prof_6'] = df_Pred['Cosine_Pred_6'].apply(set_value, args = (dict_Prof, )) 

  
# Print the DataFrame 
df_Pred

Unnamed: 0,Names,Act_Groups,Act_Prof,Eucld_Pred_3,Eucld_Prof_3,Cosine_Pred_3,Cosine_Prof_3,Eucld_Pred_4,Eucld_Prof_4,Cosine_Pred_4,Cosine_Prof_4,Eucld_Pred_5,Eucld_Prof_5,Cosine_Pred_5,Cosine_Prof_5,Eucld_Pred_6,Eucld_Prof_6,Cosine_Pred_6,Cosine_Prof_6
36,C. P. Ramanujam,3,Mathematics,1,Cricket,6,Acting,1,Cricket,6,Acting,1,Cricket,1,Cricket,1,Cricket,1,Cricket
65,Aishwarya Rai,6,Acting,6,Acting,6,Acting,6,Acting,6,Acting,6,Acting,6,Acting,6,Acting,6,Acting
1,Rohit Sharma,1,Cricket,1,Cricket,1,Cricket,1,Cricket,1,Cricket,1,Cricket,1,Cricket,1,Cricket,1,Cricket
56,Dhanraj Pillay,5,Tennis,1,Cricket,1,Cricket,5,Tennis,1,Cricket,5,Tennis,1,Cricket,2,Politics,1,Cricket
67,Ranveer Singh,6,Acting,6,Acting,6,Acting,6,Acting,6,Acting,6,Acting,6,Acting,6,Acting,6,Acting
12,Kapil Dev,1,Cricket,1,Cricket,1,Cricket,1,Cricket,1,Cricket,1,Cricket,1,Cricket,1,Cricket,1,Cricket
63,Amitabh Bachchan,6,Acting,6,Acting,6,Acting,6,Acting,6,Acting,6,Acting,6,Acting,6,Acting,6,Acting
34,Vinod Johri,3,Mathematics,3,Mathematics,3,Mathematics,3,Mathematics,3,Mathematics,2,Politics,3,Mathematics,2,Politics,3,Mathematics
52,Mahesh Bhupathi,5,Tennis,5,Tennis,5,Tennis,5,Tennis,5,Tennis,5,Tennis,1,Cricket,5,Tennis,1,Cricket
77,Ayushmann Khurrana,6,Acting,6,Acting,6,Acting,6,Acting,6,Acting,6,Acting,6,Acting,6,Acting,6,Acting


In [27]:
from sklearn.metrics import accuracy_score

print("For 3-NN : ")
print("Accuracy Score of Euclidean Classification : ", round(accuracy_score(y_train, eucl_y_pred3)*100, 4))
print("Accuracy Score of Cosine Classification : ", round(accuracy_score(y_train, cosine_y_pred3)*100, 4))



print("For 4-NN : ")
print("Accuracy Score of Euclidean Classification : ", round(accuracy_score(y_train, eucl_y_pred4)*100, 4))
print("Accuracy Score of Cosine Classification : ", round(accuracy_score(y_train, cosine_y_pred4)*100, 4))



print("For 5-NN : ")
print("Accuracy Score of Euclidean Classification : ", round(accuracy_score(y_train, eucl_y_pred5)*100, 4))
print("Accuracy Score of Cosine Classification : ", round(accuracy_score(y_train, cosine_y_pred5)*100, 4))



print("For 6-NN : ")
print("Accuracy Score of Euclidean Classification : ", round(accuracy_score(y_train, eucl_y_pred6)*100, 4))
print("Accuracy Score of Cosine Classification : ", round(accuracy_score(y_train, cosine_y_pred6)*100, 4))

For 3-NN : 
Accuracy Score of Euclidean Classification :  76.0
Accuracy Score of Cosine Classification :  76.0
For 4-NN : 
Accuracy Score of Euclidean Classification :  78.0
Accuracy Score of Cosine Classification :  72.0
For 5-NN : 
Accuracy Score of Euclidean Classification :  78.0
Accuracy Score of Cosine Classification :  68.0
For 6-NN : 
Accuracy Score of Euclidean Classification :  74.0
Accuracy Score of Cosine Classification :  68.0


###### Conclusion
<br> Hence, for Train data : <br>
    3-NN-Classification gives the best prediction, while **Cosine Similarity give a Higher Accuracy** than Euclidean Distance

####  Implementing a 5-NN Clustering Algorithm on Test Data

#### 10. Predictions on Test Data

In [23]:
# datagrame for Test Predictions
test_Predicted = pd.DataFrame()

test_Predicted["Act_Groups"] = y_test
test_Predicted["Names"] = df_X.Names

#  reorder columns
cols = ["Names", "Act_Groups"]
test_Predicted = test_Predicted[cols]    
test_Predicted['Act_Prof'] = test_Predicted['Act_Groups'].apply(set_value, args = (dict_Prof, )) 

eucl_test_pred3 = Eucl_Classifier(3, x_test, y_test)
test_Predicted["Eucld_Pred_3"] = eucl_test_pred3
test_Predicted['Eucld_Prof_3'] = test_Predicted['Eucld_Pred_3'].apply(set_value, args = (dict_Prof, )) 

cosine_test_pred3 = Cos_Classifier(3, x_test, y_test)
test_Predicted["Cosine_Pred_3"] = cosine_test_pred3
test_Predicted['Cosine_Prof_3'] = test_Predicted['Cosine_Pred_3'].apply(set_value, args = (dict_Prof, )) 

eucl_test_pred4 = Eucl_Classifier(4, x_test, y_test)
test_Predicted["Eucld_Pred_4"] = eucl_test_pred4
test_Predicted['Eucld_Prof_4'] = test_Predicted['Eucld_Pred_4'].apply(set_value, args = (dict_Prof, )) 

cosine_test_pred4 = Cos_Classifier(4, x_test, y_test)
test_Predicted["Cosine_Pred_4"] = cosine_test_pred4
test_Predicted['Cosine_Prof_4'] = test_Predicted['Cosine_Pred_4'].apply(set_value, args = (dict_Prof, ))

eucl_test_pred5 = Eucl_Classifier(5, x_test, y_test)
test_Predicted["Eucld_Pred_5"] = eucl_test_pred5
test_Predicted['Eucld_Prof_5'] = test_Predicted['Eucld_Pred_5'].apply(set_value, args = (dict_Prof, )) 

cosine_test_pred5 = Cos_Classifier(5, x_test, y_test)
test_Predicted["Cosine_Pred_5"] = cosine_test_pred5
test_Predicted['Cosine_Prof_5'] = test_Predicted['Cosine_Pred_5'].apply(set_value, args = (dict_Prof, ))

eucl_test_pred6 = Eucl_Classifier(6, x_test, y_test)
test_Predicted["Eucld_Pred_6"] = eucl_test_pred6
test_Predicted['Eucld_Prof_6'] = test_Predicted['Eucld_Pred_6'].apply(set_value, args = (dict_Prof, )) 

cosine_test_pred6 = Cos_Classifier(6, x_test, y_test)
test_Predicted["Cosine_Pred_6"] = cosine_test_pred6
test_Predicted['Cosine_Prof_6'] = test_Predicted['Cosine_Pred_6'].apply(set_value, args = (dict_Prof, ))
  
# Print the DataFrame 
test_Predicted

Unnamed: 0,Names,Act_Groups,Act_Prof,Eucld_Pred_3,Eucld_Prof_3,Cosine_Pred_3,Cosine_Prof_3,Eucld_Pred_4,Eucld_Prof_4,Cosine_Pred_4,Cosine_Prof_4,Eucld_Pred_5,Eucld_Prof_5,Cosine_Pred_5,Cosine_Prof_5,Eucld_Pred_6,Eucld_Prof_6,Cosine_Pred_6,Cosine_Prof_6
60,Saina Nehwal,5,Tennis,5,Tennis,5,Tennis,5,Tennis,5,Tennis,5,Tennis,5,Tennis,5,Tennis,5,Tennis
24,Nitish Kumar,2,Politics,2,Politics,2,Politics,2,Politics,2,Politics,2,Politics,2,Politics,2,Politics,2,Politics
53,Leander Paes,5,Tennis,2,Politics,5,Tennis,2,Politics,1,Cricket,1,Cricket,1,Cricket,5,Tennis,1,Cricket
49,Sania Mirza,5,Tennis,1,Cricket,5,Tennis,1,Cricket,5,Tennis,1,Cricket,5,Tennis,5,Tennis,5,Tennis
69,Neetu Singh,6,Acting,6,Acting,6,Acting,6,Acting,6,Acting,6,Acting,6,Acting,6,Acting,6,Acting
64,Shah Rukh Khan,6,Acting,1,Cricket,6,Acting,1,Cricket,6,Acting,1,Cricket,6,Acting,1,Cricket,6,Acting
26,Srinivasa Ramanujan,3,Mathematics,1,Cricket,3,Mathematics,3,Mathematics,3,Mathematics,3,Mathematics,3,Mathematics,3,Mathematics,3,Mathematics
4,KL Rahul,1,Cricket,2,Politics,1,Cricket,2,Politics,1,Cricket,2,Politics,1,Cricket,2,Politics,1,Cricket
73,Vicky Kaushal,6,Acting,2,Politics,6,Acting,2,Politics,6,Acting,2,Politics,6,Acting,2,Politics,6,Acting
75,Ashutosh Rana,6,Acting,4,Statistics,6,Acting,5,Tennis,6,Acting,5,Tennis,6,Acting,4,Statistics,1,Cricket


In [28]:
from sklearn.metrics import accuracy_score

print("For 3-NN : ")
print("Accuracy Score of Euclidean Classification : ", round(accuracy_score(y_test, eucl_test_pred3)*100, 4))
print("Accuracy Score of Cosine Classification : ", round(accuracy_score(y_test, cosine_test_pred3)*100, 4))



print("For 4-NN : ")
print("Accuracy Score of Euclidean Classification : ", round(accuracy_score(y_test, eucl_test_pred4)*100, 4))
print("Accuracy Score of Cosine Classification : ", round(accuracy_score(y_test, cosine_test_pred4)*100, 4))



print("For 5-NN : ")
print("Accuracy Score of Euclidean Classification : ", round(accuracy_score(y_test, eucl_test_pred5)*100, 4))
print("Accuracy Score of Cosine Classification : ", round(accuracy_score(y_test, cosine_test_pred5)*100, 4))



print("For 6-NN : ")
print("Accuracy Score of Euclidean Classification : ", round(accuracy_score(y_test, eucl_test_pred6)*100, 4))
print("Accuracy Score of Cosine Classification : ", round(accuracy_score(y_test, cosine_test_pred6)*100, 4))

For 3-NN : 
Accuracy Score of Euclidean Classification :  67.6471
Accuracy Score of Cosine Classification :  88.2353
For 4-NN : 
Accuracy Score of Euclidean Classification :  64.7059
Accuracy Score of Cosine Classification :  82.3529
For 5-NN : 
Accuracy Score of Euclidean Classification :  58.8235
Accuracy Score of Cosine Classification :  82.3529
For 6-NN : 
Accuracy Score of Euclidean Classification :  55.8824
Accuracy Score of Cosine Classification :  79.4118


##### Conclusion
<br> Hence, for Test data too: <br>
    3-NN-Classification gives the best prediction, while **Cosine Similarity give a Higher Accuracy** than Euclidean Distance