In [128]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

# Load the dataset

In [129]:
# MovieLens 100K download page: https://grouplens.org/datasets/movielens/100k/
# u.user is parsed as '|'-separated text; column names are passed explicitly.
user_col = ['User ID', 'Age', 'Sex', 'Occupation', 'Zip Code']
user_df = pd.read_csv(
    'ml-100k/u.user',
    sep='|',
    names=user_col,
    encoding='latin-1',
)

user_df.head()

Unnamed: 0,User ID,Age,Sex,Occupation,Zip Code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


In [130]:
# (rows, columns) of the users table.
user_df.shape

(943, 5)

In [131]:
# Load u.item: movie metadata columns followed by one column per genre.
item_col = [
    'Movie ID', 'Title', 'Release Date', 'Video Release Date', 'IMDb URL',
    'Unknown', 'Action', 'Adventure', 'Animation', 'Children\'s', 'Comedy',
    'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
    'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]

movie_df = pd.read_csv(
    'ml-100k/u.item',
    sep='|',
    names=item_col,
    encoding='latin-1',
)

movie_df.head()


Unnamed: 0,Movie ID,Title,Release Date,Video Release Date,IMDb URL,Unknown,Action,Adventure,Animation,Children's,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [132]:
# Keep only the movie ID and title columns.
movie_df = movie_df.loc[:, ['Movie ID', 'Title']]
print(movie_df)

      Movie ID                                      Title
0            1                           Toy Story (1995)
1            2                           GoldenEye (1995)
2            3                          Four Rooms (1995)
3            4                          Get Shorty (1995)
4            5                             Copycat (1995)
...        ...                                        ...
1677      1678                          Mat' i syn (1997)
1678      1679                           B. Monkey (1998)
1679      1680                       Sliding Doors (1998)
1680      1681                        You So Crazy (1994)
1681      1682  Scream of Stone (Schrei aus Stein) (1991)

[1682 rows x 2 columns]


In [133]:
# Load u.data: tab-separated (user, movie, rating, timestamp) records.
rating_col = ['User ID', 'Movie ID', 'Rating', 'Timestamp']

rating_df = pd.read_csv(
    'ml-100k/u.data',
    sep='\t',
    names=rating_col,
    encoding='latin-1',
)

rating_df.head()


Unnamed: 0,User ID,Movie ID,Rating,Timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [134]:
# Drop the timestamp column.
# This cell rebinds rating_df, so a second run would see a frame that no
# longer has 'Timestamp'; errors='ignore' turns that re-run into a no-op
# instead of a KeyError.
rating_df = rating_df.drop(columns='Timestamp', errors='ignore')
print(rating_df)

       User ID  Movie ID  Rating
0          196       242       3
1          186       302       3
2           22       377       1
3          244        51       2
4          166       346       1
...        ...       ...     ...
99995      880       476       3
99996      716       204       5
99997      276      1090       1
99998       13       225       2
99999       12       203       3

[100000 rows x 3 columns]


In [135]:
# Features are the whole ratings table; the label used for stratification is
# the user ID, so every user's ratings are divided between train and test.
# NOTE(review): X still contains the 'User ID' column that is also used as y.
X = rating_df
y = rating_df['User ID']
print(X, y)

# Hold out 25% of the ratings, stratified by user, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=42,
)

       User ID  Movie ID  Rating
0          196       242       3
1          186       302       3
2           22       377       1
3          244        51       2
4          166       346       1
...        ...       ...     ...
99995      880       476       3
99996      716       204       5
99997      276      1090       1
99998       13       225       2
99999       12       203       3

[100000 rows x 3 columns] 0        196
1        186
2         22
3        244
4        166
        ... 
99995    880
99996    716
99997    276
99998     13
99999     12
Name: User ID, Length: 100000, dtype: int64


# Compute RMSE

In [136]:
from sklearn.metrics import mean_squared_error
def rmse(y_true, y_pred):
    """Return the root-mean-squared error between true and predicted values.

    Computed as the square root of ``mean_squared_error(y_true, y_pred)``.
    """
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

In [137]:
#Define the baseline model to always return 3.
def baseline(user_ID, movie_ID):
    """Baseline predictor: ignore both arguments and always predict 3.0."""
    constant_rating = 3.0
    return constant_rating

In [138]:
#Function to compute the RMSE score obtained on the testing set by a model
def score(coll_filter):
    """Return the RMSE of ``coll_filter`` on the held-out ratings in ``X_test``.

    ``coll_filter`` is called as ``coll_filter(user_id, movie_id)`` once for
    each row of ``X_test``, in row order, and must return a predicted rating.
    The predictions are compared with the 'Rating' column via ``rmse``.
    """
    # One prediction per (user, movie) pair of the test split
    predicted = np.array(
        list(map(coll_filter, X_test['User ID'], X_test['Movie ID']))
    )

    # Ratings the users actually gave
    actual = np.array(X_test['Rating'])

    return rmse(actual, predicted)

In [139]:
# Test-set RMSE of the constant-3.0 baseline; the reference point for the models below.
score(baseline)

1.2470926188539486

In [140]:
# Ratings matrix for user-based collaborative filtering, built from the
# training split only: one row per 'User ID', one column per 'Movie ID',
# cells hold the 'Rating' (NaN where the user has no training rating).
rating_mat = X_train.pivot_table(
    index='User ID',
    columns='Movie ID',
    values='Rating',
)

rating_mat.head()

Movie ID,1,2,3,4,5,6,7,8,9,10,...,1669,1670,1671,1673,1674,1675,1676,1679,1681,1682
User ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0,...,,,,,,,,,,
2,,,,,,,,,,2.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,3.0,,,,,,,,,...,,,,,,,,,,


In [141]:
#User Based Collaborative Filter using Mean Ratings
def coll_filter_uMean(user_ID, movie_ID):
    """Predict a rating as the plain mean of the movie's training ratings.

    ``user_ID`` is accepted for interface compatibility with ``score`` but is
    not used. Movies with no column in ``rating_mat`` get the default 3.0.
    """
    # No training information about this movie: fall back to 3.0
    if movie_ID not in rating_mat:
        return 3.0

    # Mean over the users who rated the movie (NaN entries are skipped)
    return rating_mat[movie_ID].mean()


In [142]:
#Calculate RMSE for the Mean model
# Test-set RMSE of the per-movie mean model.
score(coll_filter_uMean)

1.0234701463131335

In [143]:
# Weighted Mean
# Zero-filled copy of the ratings matrix (fillna returns a new frame, so
# rating_mat itself keeps its NaNs).
rm_temp = rating_mat.fillna(0)

# Cosine Similarity

In [144]:
# Pairwise cosine similarity between the user rows of the zero-filled matrix.
# With a single argument, cosine_similarity compares the rows of that matrix
# with each other.
from sklearn.metrics.pairwise import cosine_similarity
cos_sim = cosine_similarity(rm_temp)


In [145]:
# Wrap the similarity array in a DataFrame labelled by user ID on both axes,
# so a pair of users can be looked up by their IDs.
cos_sim = pd.DataFrame(
    cos_sim,
    index=rating_mat.index,
    columns=rating_mat.index,
)

cos_sim.head(10)

User ID,1,2,3,4,5,6,7,8,9,10,...,934,935,936,937,938,939,940,941,942,943
User ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.118076,0.029097,0.011628,0.264677,0.312419,0.308729,0.224269,0.026017,0.286411,...,0.308475,0.055872,0.197862,0.131367,0.152449,0.084456,0.293293,0.056765,0.103536,0.326491
2,0.118076,1.0,0.099097,0.10768,0.034279,0.152789,0.086705,0.078864,0.06894,0.092399,...,0.086927,0.259636,0.289092,0.318824,0.149105,0.186347,0.168034,0.106748,0.136796,0.080358
3,0.029097,0.099097,1.0,0.252131,0.026893,0.062539,0.039767,0.089474,0.078162,0.03767,...,0.040918,0.019031,0.065417,0.055373,0.086503,0.018418,0.096993,0.109631,0.092574,0.018987
4,0.011628,0.10768,0.252131,1.0,0.0,0.045543,0.078812,0.095354,0.059498,0.053879,...,0.024226,0.050703,0.056561,0.107294,0.098892,0.0,0.1329,0.142798,0.097066,0.015176
5,0.264677,0.034279,0.026893,0.0,1.0,0.202843,0.299619,0.163724,0.038474,0.153021,...,0.262547,0.048524,0.048312,0.022202,0.09191,0.066,0.156172,0.115842,0.124297,0.267574
6,0.312419,0.152789,0.062539,0.045543,0.202843,1.0,0.375963,0.131795,0.110944,0.400758,...,0.287549,0.080312,0.162988,0.182856,0.114262,0.09209,0.261859,0.097606,0.206104,0.187637
7,0.308729,0.086705,0.039767,0.078812,0.299619,0.375963,1.0,0.211282,0.107795,0.328923,...,0.290002,0.07417,0.094619,0.084235,0.11562,0.100625,0.233843,0.039199,0.224227,0.296332
8,0.224269,0.078864,0.089474,0.095354,0.163724,0.131795,0.211282,1.0,0.03704,0.183375,...,0.165008,0.066843,0.058766,0.068759,0.087159,0.129381,0.188662,0.121223,0.08391,0.273238
9,0.026017,0.06894,0.078162,0.059498,0.038474,0.110944,0.107795,0.03704,1.0,0.155435,...,0.011708,0.0,0.10171,0.034568,0.045002,0.052699,0.107486,0.055766,0.070065,0.088281
10,0.286411,0.092399,0.03767,0.053879,0.153021,0.400758,0.328923,0.183375,0.155435,1.0,...,0.278558,0.04931,0.153506,0.065471,0.060088,0.033686,0.197107,0.085402,0.118945,0.162538


In [149]:
#User Based Collaborative Filter using Weighted Mean Ratings
def coll_filter_wMean(user_ID, movie_ID):
    """Predict a rating as a similarity-weighted mean of the movie's ratings.

    Each training rating of ``movie_ID`` is weighted by the cosine similarity
    (from ``cos_sim``) between ``user_ID`` and the user who gave the rating.

    Fallbacks:
    * 3.0 when the movie has no column in ``rating_mat`` or the user has no
      column in ``cos_sim`` (nothing is known about one side of the pair).
    * the unweighted mean of the movie's ratings when the similarities to all
      of its raters sum to zero, which would otherwise give 0/0 = NaN.
    """
    default_rating = 3.0

    # Without the movie or the user in the training data there is nothing to weight
    if movie_ID not in rating_mat or user_ID not in cos_sim:
        return default_rating

    # Ratings actually given to this movie (users who did not rate it are dropped)
    mov_rating = rating_mat[movie_ID].dropna()

    # Similarity of the target user to exactly those raters, in the same order
    ss = cos_sim[user_ID].loc[mov_rating.index]

    total_weight = ss.sum()
    if total_weight == 0:
        # No rater shares any rated movie with this user: use the plain mean
        return mov_rating.mean()

    # Similarity-weighted mean of the observed ratings
    return np.dot(ss, mov_rating) / total_weight

In [150]:
# Test-set RMSE of the similarity-weighted mean model.
score(coll_filter_wMean)

1.0174483808407588

In [151]:
# Demographics
# Attach each user's demographic columns to the training ratings. No key is
# given, so the merge joins on the columns the two frames have in common.
comb_df = X_train.merge(user_df)
comb_df.head()

Unnamed: 0,User ID,Movie ID,Rating,Age,Sex,Occupation,Zip Code
0,889,684,2,24,M,technician,78704
1,889,279,2,24,M,technician,78704
2,889,29,3,24,M,technician,78704
3,889,190,3,24,M,technician,78704
4,889,232,3,24,M,technician,78704


In [152]:
# Mean training rating of every movie, split by the rater's sex.
# The result is a Series indexed by ('Movie ID', 'Sex').
gMean = comb_df.groupby(['Movie ID', 'Sex'])['Rating'].mean()
print(gMean)

Movie ID  Sex
1         F      3.827586
          M      3.918919
2         F      3.230769
          M      3.228916
3         F      2.785714
                   ...   
1675      M      3.000000
1676      M      2.000000
1679      M      3.000000
1681      M      3.000000
1682      M      3.000000
Name: Rating, Length: 3048, dtype: float64


In [153]:
# Index the users table by 'User ID' so a user's row can be fetched with
# user_df.loc[user_ID].
# The cell rebinds user_df, so on a second run 'User ID' is already the index
# and set_index would raise KeyError; the guard makes the re-run a no-op.
if 'User ID' in user_df.columns:
    user_df = user_df.set_index('User ID')

In [154]:
#Gender Based Collaborative Filter using Mean Ratings
def coll_filter_gMean(user_ID, movie_ID):
    """Predict a rating as the mean rating given to the movie by the user's sex.

    Looks the user's 'Sex' up in ``user_df`` (indexed by user ID) and reads the
    matching entry of ``gMean``. Returns 3.0 when the movie has no column in
    ``rating_mat`` or nobody of that sex rated it in the training data.
    """
    fallback = 3.0

    # Movie never seen in the training set
    if movie_ID not in rating_mat:
        return fallback

    user_sex = user_df.loc[user_ID, 'Sex']

    # Per-sex means for this movie, indexed by 'Sex'
    means_by_sex = gMean[movie_ID]
    if user_sex not in means_by_sex:
        return fallback

    return means_by_sex[user_sex]


In [155]:
# Test-set RMSE of the per-sex mean model.
score(coll_filter_gMean)

1.0330308800874282

In [158]:
# Mean training rating of every movie for each (occupation, sex) group:
# rows are 'Movie ID', columns are a two-level ('Occupation', 'Sex') index.
goMean = comb_df.pivot_table(
    index='Movie ID',
    columns=['Occupation', 'Sex'],
    values='Rating',
    aggfunc='mean',
)

goMean.head()


Occupation,administrator,administrator,artist,artist,doctor,educator,educator,engineer,engineer,entertainment,...,salesman,salesman,scientist,scientist,student,student,technician,technician,writer,writer
Sex,F,M,F,M,M,F,M,F,M,F,...,F,M,F,M,F,M,F,M,F,M
Movie ID,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
1,4.0,4.222222,4.25,3.5,3.666667,3.5,3.923077,4.0,3.970588,5.0,...,4.0,4.0,3.5,3.888889,3.833333,3.709091,4.0,4.2,4.166667,3.142857
2,3.0,3.75,,,,,3.25,,3.363636,,...,,,,,2.333333,3.333333,,2.714286,5.0,2.666667
3,3.5,2.5,,,,4.0,2.5,,3.625,,...,,1.0,,,2.0,3.217391,,4.0,,1.0
4,3.0,3.888889,,4.666667,3.0,2.75,3.636364,,3.555556,,...,4.0,3.666667,,3.6,3.285714,3.724138,,3.2,4.25,3.5
5,4.0,2.333333,,,,4.0,1.5,,2.666667,,...,,,,3.5,4.333333,3.272727,,3.333333,4.0,2.666667


In [159]:
#Gender and Occupation Based Collaborative Filter using Mean Ratings
def coll_filter_goMean(user_ID, movie_ID):
    """Predict a rating as the mean given by the user's (occupation, sex) group.

    Looks the user's 'Occupation' and 'Sex' up in ``user_df`` (indexed by user
    ID) and reads the matching cell of ``goMean`` (rows: movie ID, columns:
    (occupation, sex)). Returns the default 3.0 when the movie is not a row of
    ``goMean``, the group is not one of its columns, or the cell is NaN
    (the group has no training rating for the movie).
    """
    default_rating = 3.0

    # Check the table this function actually reads. The membership test must
    # use goMean; no other gender/occupation table exists in the cells shown.
    if movie_ID not in goMean.index:
        return default_rating

    # Demographic group of the user
    user = user_df.loc[user_ID]
    group = (user['Occupation'], user['Sex'])

    # The (occupation, sex) combination never appears in the training data
    if group not in goMean.columns:
        return default_rating

    req_rating = goMean.loc[movie_ID, group]

    # The group exists but has not rated this particular movie
    if np.isnan(req_rating):
        return default_rating

    return req_rating


In [160]:
# Test-set RMSE of the per-(occupation, sex) mean model.
score(coll_filter_goMean)

1.1391976012043645

# Recommending Movies

In [185]:
# Attach each movie's title to every rating, joining on 'Movie ID'.
data = rating_df.merge(movie_df, on='Movie ID')
data.head()

Unnamed: 0,User ID,Movie ID,Rating,Title
0,196,242,3,Kolya (1996)
1,63,242,3,Kolya (1996)
2,226,242,5,Kolya (1996)
3,154,242,3,Kolya (1996)
4,306,242,5,Kolya (1996)


In [186]:
# Ratings that have a title, and the number of ratings each title received.
merged_mov_rating = data.dropna(subset=['Title'])
movie_rc = (
    merged_mov_rating
    .groupby(by=['Title'])['Rating']
    .count()
    .rename('rating_count')
    .reset_index()
    [['Title', 'rating_count']]
)
movie_rc.head()

Unnamed: 0,Title,rating_count
0,'Til There Was You (1997),9
1,1-900 (1994),5
2,101 Dalmatians (1996),109
3,12 Angry Men (1957),125
4,187 (1997),41


In [187]:
# Add the per-title rating count to every individual rating row.
rat_totalRC = merged_mov_rating.merge(movie_rc, on='Title', how='left')
rat_totalRC.head()

Unnamed: 0,User ID,Movie ID,Rating,Title,rating_count
0,196,242,3,Kolya (1996),117
1,63,242,3,Kolya (1996),117
2,226,242,5,Kolya (1996),117
3,154,242,3,Kolya (1996),117
4,306,242,5,Kolya (1996),117


In [188]:
# Format floats with three decimals in pandas output (a pandas-wide display option).
pd.set_option('display.float_format', lambda x: '%.3f' % x)
# Summary statistics of the per-title rating counts.
print(movie_rc['rating_count'].describe())

count   1664.000
mean      60.096
std       80.956
min        1.000
25%        7.000
50%       27.000
75%       80.250
max      583.000
Name: rating_count, dtype: float64


In [190]:
# Keep only ratings of titles that were rated at least pop_threshold times.
pop_threshold = 50
popular_mov_rating = rat_totalRC[rat_totalRC['rating_count'] >= pop_threshold]
popular_mov_rating.head()

Unnamed: 0,User ID,Movie ID,Rating,Title,rating_count
0,196,242,3,Kolya (1996),117
1,63,242,3,Kolya (1996),117
2,226,242,5,Kolya (1996),117
3,154,242,3,Kolya (1996),117
4,306,242,5,Kolya (1996),117


In [200]:
# (rows, columns) remaining after the popularity filter.
popular_mov_rating.shape

(84069, 5)

In [201]:
# Title x user matrix of ratings for the popular movies; pairs without a
# rating are filled with 0.
movF = (
    popular_mov_rating
    .pivot_table(index='Title', columns='User ID', values='Rating')
    .fillna(0)
)
movF.head()

User ID,1,2,3,4,5,6,7,8,9,10,...,934,935,936,937,938,939,940,941,942,943
Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
101 Dalmatians (1996),2.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,...,2.0,0.0,0.0,2.0,4.0,0.0,0.0,0.0,0.0,0.0
12 Angry Men (1957),5.0,0.0,0.0,0.0,0.0,4.0,4.0,0.0,0.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2 Days in the Valley (1996),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
"20,000 Leagues Under the Sea (1954)",3.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2001: A Space Odyssey (1968),4.0,0.0,0.0,0.0,4.0,5.0,5.0,0.0,0.0,5.0,...,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0


In [202]:
from scipy.sparse import csr_matrix

# Sparse (CSR) version of the title x user matrix; each row is one movie.
movF_df = csr_matrix(movF.values)

from sklearn.neighbors import NearestNeighbors


# Brute-force nearest-neighbour index over the movie rows using cosine distance.
model_knn = NearestNeighbors(metric = 'cosine', algorithm = 'brute')
model_knn.fit(movF_df)

NearestNeighbors(algorithm='brute', leaf_size=30, metric='cosine',
                 metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                 radius=1.0)

In [203]:
# (number of titles, number of users) in the title x user matrix.
movF.shape

(605, 943)

In [206]:
# Pick the query movie with a locally seeded generator so that
# Restart & Run All always selects the same row and the recommendations
# printed below are reproducible. Change the seed to try another movie.
q_ind = np.random.RandomState(42).choice(movF.shape[0])
print("Query index:", q_ind)
# Ask for 6 neighbours: the print loop below shows the first result as the
# header and lists the remaining five.
dist, idx = model_knn.kneighbors(movF.iloc[q_ind,:].values.reshape(1, -1), n_neighbors = 6)

Query index: 464


In [207]:
# Preview of the title x user matrix (same frame as displayed when movF was built).
movF.head()

User ID,1,2,3,4,5,6,7,8,9,10,...,934,935,936,937,938,939,940,941,942,943
Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
101 Dalmatians (1996),2.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,...,2.0,0.0,0.0,2.0,4.0,0.0,0.0,0.0,0.0,0.0
12 Angry Men (1957),5.0,0.0,0.0,0.0,0.0,4.0,4.0,0.0,0.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2 Days in the Valley (1996),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
"20,000 Leagues Under the Sea (1954)",3.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2001: A Space Odyssey (1968),4.0,0.0,0.0,0.0,4.0,5.0,5.0,0.0,0.0,5.0,...,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0


In [208]:
# Position 0 of the neighbour list is used for the header (the query title);
# every later position is printed as a numbered recommendation.
for rank, (neighbor_pos, neighbor_dist) in enumerate(zip(idx.flatten(), dist.flatten())):
    if rank:
        print('{0}: {1}, with distance of {2}:'.format(rank, movF.index[neighbor_pos], neighbor_dist))
    else:
        print('Recommendations for {0}:\n'.format(movF.index[q_ind]))


Recommendations for Rosewood (1997):

1: Hoodlum (1997), with distance of 0.6337342018757264:
2: Air Force One (1997), with distance of 0.638853586573177:
3: Soul Food (1997), with distance of 0.639169273997501:
4: Murder at 1600 (1997), with distance of 0.6422303006460974:
5: G.I. Jane (1997), with distance of 0.6423477175159895:


In [209]:
#pip install surprise

# Accuracy

In [210]:
#Splitting is already done
# Fit a 1-nearest-neighbour classifier on the training split; y_train holds user IDs.
# NOTE(review): X_train comes from rating_df, which still contains the
# 'User ID' column, so the label is also one of the features here - confirm
# that this is intended before interpreting the accuracy below.
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=1, p=2,
                     weights='uniform')

In [211]:
#testing the dataset
y_pred = knn.predict(X_test)
y_pred

array([239, 536, 177, ..., 198, 871, 233], dtype=int64)

In [213]:
from sklearn.metrics import accuracy_score
# Share of test rows whose predicted user ID equals the true one, printed as a percentage.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy score:",accuracy*100)

Accuracy score: 43.88
