## Importing Modules

In [15]:
import pandas as pd
import matplotlib.pyplot as plt
from scipy import spatial
from prepareDF import PrepareDF
import operator
import warnings
warnings.filterwarnings('ignore')

# 1) Data Preperation
---

## Functions to be used


In [16]:
def valuesindexer(series, topN=0):
    isList = isinstance(series.iloc[0],list)    #Check if series values are lists
    valueList = []
    for index, values in series.items():
        if(isList):
            if(topN): values = values[:topN-1]  #Limit to only top N Entries
            for value in values:            
                if value not in valueList:
                    valueList.append(value)     #Iterate trough all uniqe enties to build indexed list
        else: 
            value = values
            if value not in valueList:
                valueList.append(value)         #Iterate trough all  enties to build indexed list

    #Reference Series for Genre Indexer
    IndexRef = {}
    for i, value in enumerate(valueList): IndexRef[i] = value
    IndexRef = pd.Series(IndexRef)
    return valueList, IndexRef

def binarylistFormatter(check_list, list2):
    binaryList = []                                                                    #Indexed (valuesindexer()[1] for keys) Binary-Formatted list of state if entry is present across columns
    for value in list2:                                                                
        if value in check_list:
            binaryList.append(1)
        else:
            binaryList.append(0)
    return binaryList

def binarylistApplier(dataFrame, colReplaced, colName, topN=0):
    valueslist = valuesindexer(dataFrame[colReplaced], topN)[0]                                #Get indexed list of uniqe values
    dataFrame[colName] = dataFrame[colReplaced].apply(lambda x: binarylistFormatter(x, valueslist))   #Apply format as new column
    dataFrame.drop(columns=colReplaced, inplace=True)                                          #Replace old column

## Importing Cleaned and Prepared Dataset

In [17]:
MovieData = PrepareDF()
MovieData.head()

Unnamed: 0,title,director,cast,rating,genres
0,Ganglands,Julien Leclercq,"[Sami Bouajila, Tracy Gotoas, Samuel Jouy, ...",7.2,"[Crime TV Shows, International TV Shows, TV ..."
1,Midnight Mass,Mike Flanagan,"[Kate Siegel, Zach Gilford, Hamish Linklater...",7.7,"[TV Dramas, TV Horror, TV Mysteries]"
2,My Little Pony: A New Generation,"Robert Cullen, José Luis Ucha","[Vanessa Hudgens, Kimiko Glenn, James Marsde...",6.8,[Children & Family Movies]
3,Sankofa,Haile Gerima,"[Kofi Ghanaba, Oyafunmike Ogunlano, Alexandr...",7.0,"[Dramas, Independent Movies, International M..."
4,The Great British Baking Show,Andy Devonshire,"[Mel Giedroyc, Sue Perkins, Mary Berry, Pau...",8.6,"[British TV Shows, Reality TV]"


Formatting Data to Binary List for KNN

In [18]:
binarylistApplier(MovieData, 'genres', 'Genre')
binarylistApplier(MovieData, 'cast', 'Cast', 5)    # We only need to consider the actors that contributed the most in the movie, The dataset already contains the actors in order of their contribution
#MovieData[MovieData['director']!='']
binarylistApplier(MovieData, 'director', 'Director')
MovieData.head()

Unnamed: 0,title,rating,Genre,Cast,Director
0,Ganglands,7.2,"[1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,Midnight Mass,7.7,"[0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,My Little Pony: A New Generation,6.8,"[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, ...","[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,Sankofa,7.0,"[0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, ...","[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,The Great British Baking Show,8.6,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


# 1) KNN Exploration
---

## Functions to be used


In [19]:
def Similarity(dataFrame, movieId1, movieId2):
    a = dataFrame.iloc[movieId1]
    b = dataFrame.iloc[movieId2]
    
    genresA = a['Genre']
    genresB = b['Genre']

    genreDistance = spatial.distance.cosine(genresA, genresB)
    scoreA = a['Cast']
    scoreB = b['Cast']
    scoreDistance = spatial.distance.cosine(scoreA, scoreB)
    
    directA = a['Director']
    directB = b['Director']
    directDistance = spatial.distance.cosine(directA, directB)

    return genreDistance + directDistance + scoreDistance

def predict_score(dataFrame, show):
    name = show
    new_movie = dataFrame[dataFrame['title'].str.contains(name)].iloc[0].to_frame().T

    ind = dataFrame.index[dataFrame['title'] == name].tolist()[0]

    def getNeighbors(baseMovie, K):
        distances = []
    
        for index, movie in dataFrame.iterrows():
            index = dataFrame.index[dataFrame['title'] == movie['title']].tolist()[0]
            if movie['title'] != baseMovie['title'].values[0]:
                dist = Similarity(dataFrame, index, ind)
                distances.append((movie['title'], dist))
    
        distances.sort(key=operator.itemgetter(1))
        neighbors = []
    
        for x in range(K):
            neighbors.append(distances[x])
        return neighbors
    
    K = 10
    avgRating = 0
    neighbors = getNeighbors(new_movie, K)
    
    for neighbor in neighbors:
        neighbor_index = dataFrame.index[dataFrame['title'] == neighbor[0]].tolist()[0]
        avgRating = avgRating+dataFrame.iloc[neighbor_index]['rating']

    avgRating = avgRating/K
    print('The predicted rating for %s is: %f' %(new_movie['title'].values[0],avgRating))
    print('The actual rating for %s is %f' %(new_movie['title'].values[0],new_movie['rating']))

    error = (abs(float(avgRating)-float(new_movie['rating']))/float(new_movie['rating']))*100

    return error

## Exploration

In [20]:
predict_score(MovieData, 'Ganglands')

The predicted rating for Ganglands is: 6.380000
The actual rating for Ganglands is 7.200000


11.388888888888893

Now we know that cast, director and genre can be used to predict the ratings quite accurately

Now we will test different ML models to test which one is the best for our use case