# **Dataset acquisition and cleaning**

In [0]:
#import packages required
import pandas as pd
import numpy as np
from sklearn import linear_model
import sklearn.metrics as metrics
import matplotlib.pyplot as plt

In [2]:
#location of the movie recommendation dataset (a CSV file hosted on GitHub)
movies_csv_url='https://github.com/ArinB/CA05-kNN/raw/master/movies_recommendation_data.csv'

#read the CSV into a dataframe
movies=pd.read_csv(movies_csv_url)

#preview the first rows
movies.head()

Unnamed: 0,Movie ID,Movie Name,IMDB Rating,Biography,Drama,Thriller,Comedy,Crime,Mystery,History,Label
0,58,The Imitation Game,8.0,1,1,1,0,0,0,0,0
1,8,Ex Machina,7.7,0,1,0,0,0,1,0,0
2,46,A Beautiful Mind,8.2,1,1,0,0,0,0,0,0
3,62,Good Will Hunting,8.3,0,1,0,0,0,0,0,0
4,97,Forrest Gump,8.8,0,1,0,0,0,0,0,0


In [3]:
#Perform a Data Quality Analysis to find missing values, outliers, NaNs etc.
#number of missing values in each column
print(movies.isnull().sum())
#info() prints its own report and returns None, so it must not be wrapped
#in print() (that would add a stray "None" line to the output)
movies.info()

#summary statistics of the numeric columns, displayed as the cell result
movies.describe()
#no missing values (NaN) are detected

Movie ID       0
Movie Name     0
IMDB Rating    0
Biography      0
Drama          0
Thriller       0
Comedy         0
Crime          0
Mystery        0
History        0
Label          0
dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30 entries, 0 to 29
Data columns (total 11 columns):
Movie ID       30 non-null int64
Movie Name     30 non-null object
IMDB Rating    30 non-null float64
Biography      30 non-null int64
Drama          30 non-null int64
Thriller       30 non-null int64
Comedy         30 non-null int64
Crime          30 non-null int64
Mystery        30 non-null int64
History        30 non-null int64
Label          30 non-null int64
dtypes: float64(1), int64(9), object(1)
memory usage: 2.7+ KB
None


Unnamed: 0,Movie ID,IMDB Rating,Biography,Drama,Thriller,Comedy,Crime,Mystery,History,Label
count,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0
mean,48.133333,7.696667,0.233333,0.6,0.1,0.1,0.133333,0.1,0.1,0.0
std,29.288969,0.666169,0.430183,0.498273,0.305129,0.305129,0.345746,0.305129,0.305129,0.0
min,1.0,5.9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,27.75,7.3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,48.5,7.75,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,64.25,8.175,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
max,98.0,8.8,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0


In [4]:
#The dataset is tiny (n<=30, D<20), so a brute-force neighbour search is enough.
#Euclidean distance is used, so the range of every column matters:
#  - IMDB rating varies from 5.9 to 8.8, a range of 2.9
#  - each genre flag is either 0 or 1, a range of 1
#Genres are closer to what a "similar movies" recommendation means, so the
#rating is divided by 10 (range 0.29) to give each genre more weight than it.

#The Label column is left out as well: it is 0 for every movie.
genre_columns=['Biography','Drama','Thriller','Comedy','Crime','Mystery','History']
movies_new=movies[genre_columns].assign(rating_modified=movies['IMDB Rating']/10)
movies_new.head()

Unnamed: 0,Biography,Drama,Thriller,Comedy,Crime,Mystery,History,rating_modified
0,1,1,1,0,0,0,0,0.8
1,0,1,0,0,0,1,0,0.77
2,1,1,0,0,0,0,0,0.82
3,0,1,0,0,0,0,0,0.83
4,0,1,0,0,0,0,0,0.88


# **Building model, finding movies**

In [0]:
#we only need to find the 5 movies most similar to the target movie
#no classifier or regressor is needed
#define euclidean distance as the metric for finding the most similar movies

def euclidean_distance(point_1, point_2):
  """Return the Euclidean (straight-line) distance between two points.

  Parameters
  ----------
  point_1, point_2 : sized iterables of numbers
      The coordinates of each point (tuple, list, numpy array, pandas
      Series, ...). Both must have the same number of coordinates.

  Returns
  -------
  The square root of the summed squared coordinate differences
  (0.0 when both points are empty).

  Raises
  ------
  ValueError
      If the two points do not have the same number of coordinates.
  """
  #a length mismatch would otherwise drop the extra coordinates silently
  if len(point_1) != len(point_2):
    raise ValueError(
        "points must have the same number of coordinates, got %d and %d"
        % (len(point_1), len(point_2)))
  #pair the coordinates by iteration instead of point[i]: positional integer
  #[] on a pandas Series with non-integer labels is deprecated
  sum_squared_distance = 0
  for coord_1, coord_2 in zip(point_1, point_2):
    sum_squared_distance += (coord_1-coord_2)**2
  return sum_squared_distance**0.5

#target movie: one value per movies_new column, in the same column order
the_post = (
  1,     #Biography
  1,     #Drama
  0,     #Thriller
  0,     #Comedy
  0,     #Crime
  0,     #Mystery
  1,     #History
  0.72,  #rating_modified (IMDB rating divided by 10)
)


In [6]:
#find out the euclidean_distance for the target movie

#how many of the closest movies to recommend
n_recommendations=5

#calculate the distance from the target movie to every movie in movies_new
#- every row is visited (no hard-coded row count), so the cell keeps working
#  if the dataset grows or shrinks
#- rows are handed over as plain numpy arrays, which euclidean_distance can
#  index by position without relying on pandas label lookup
distances=[euclidean_distance(the_post, row) for row in movies_new.to_numpy()]
#row positions matching the distances, used as the index of the sorted series
index=list(range(len(distances)))

#sort the distances, small to large, and keep the closest ones
similar_movies=pd.Series(distances,index=index).sort_values(ascending=True).iloc[:n_recommendations]
#trace back to movies dataframe to find movie names
#(rows by position, column by name instead of by its position)
movies['Movie Name'].iloc[similar_movies.index]

28    12 Years a Slave
27       Hacksaw Ridge
29      Queen of Katwe
16      The Wind Rises
2     A Beautiful Mind
Name: Movie Name, dtype: object