1) Data Preparation
---

1a) Importing Modules

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb

from scipy import spatial
import operator, math

In [2]:
import warnings
warnings.filterwarnings('ignore')

1b) Importing and Preparing Dataset from files

In [3]:
# Attach the scraped ratings to the Netflix titles dataframe
file_name = "netflix_titles.csv"
rating_file = "./Scrapper/rating.txt"

# Parse the rating text file.
# Each line is split on ":"; the first field, prefixed with 's', becomes the
# show_id key and the last field is the rating. Lines whose last field is
# 'NONE', '[ERROR]' or empty carry no usable rating and are skipped.
data_dict = {}
with open(rating_file) as file:
    for line in file.read().split('\n'):
        fields = line.split(':')
        if fields[-1] not in ['NONE', '[ERROR]', '']:
            data_dict['s' + fields[0]] = float(fields[-1])

# Open the csv file as a pandas dataframe
MovieData = pd.read_csv(file_name)

# Keep only the titles that have a scraped rating. One boolean mask replaces
# dropping rows one at a time (each drop copied the whole frame).
MovieData = MovieData[MovieData['show_id'].isin(list(data_dict))].copy()

# Look every rating up by show_id. Assigning list(data_dict.values()) relied on
# the rating file and the csv having exactly the same ids in the same order;
# any mismatch silently attached ratings to the wrong titles or raised a
# length error.
MovieData['rating'] = MovieData['show_id'].map(data_dict)

# Preview
MovieData.head()

FileNotFoundError: [Errno 2] No such file or directory: './Scrapper/rating.txt'

In [None]:
# Rating against release year; the points are additionally coloured by rating
sb.scatterplot(
    data=MovieData,
    x='release_year',
    y='rating',
    hue='rating',
)

In [None]:
# Rating per title type; the points are additionally coloured by rating
sb.catplot(
    data=MovieData,
    x='type',
    y='rating',
    hue='rating',
)

From the two plots above, ratings show no apparent correlation with release year or type, so these two columns can be ignored.

1c) Cleanup of Dataset

In [None]:
# Remove the non-predictor columns (not needed for the prediction later), then
# the rows in which director, cast or genres are missing, and renumber the rows
# 0..n-1 because later cells address rows by position.
MovieData = (
    MovieData
    .drop(columns=['show_id', 'description', 'country', 'duration', 'date_added', 'release_year', 'type'])
    .dropna()
    .reset_index(drop=True)
)

# Convert the comma-separated strings in 'cast' and 'listed_in' to lists of
# strings. Every name is stripped: a plain split(',') keeps the space that
# follows each comma, so the same value with and without a leading space would
# be treated as two different values.
for column in ['cast', 'listed_in']:
    MovieData[column] = MovieData[column].str.split(',').apply(
        lambda names: [name.strip() for name in names]
    )

# Preview
MovieData.head()

In [None]:
# Working copy for the exploration cells, which add and drop columns on it
ExploratoryData = MovieData.copy(deep=True)

2) Prep of Dataset
---

2a) Processing Genres

Visualizer for top Genre values

In [None]:
def topVisualizer(series, Title, topN):
    """Draw a horizontal bar chart of the ``topN`` most frequent values.

    Parameters
    ----------
    series : pandas.Series
        Holds either scalar values or lists of values. The type of the first
        element decides which; list entries are flattened before counting.
    Title : str
        Title of the plot.
    topN : int
        Number of most frequent values to show.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    if isinstance(series.iloc[0], list):
        # Flatten the per-row lists into one long list of entries
        flattened = []
        for entries in series:
            flattened.extend(entries)
        counts = pd.Series(flattened).value_counts()
    else:
        counts = series.value_counts()

    # value_counts() is sorted descending, so head(topN) picks the most frequent
    # values. The ascending re-sort must be assigned back (sort_values returns a
    # new Series); barh draws the first row at the bottom, so the most frequent
    # value ends up at the top of the chart.
    toplist = counts.head(topN).sort_values(ascending=True)

    fig, ax = plt.subplots(figsize=(12, 10))
    ax.set_title(Title)
    toplist.plot.barh(width=0.9, ax=ax)
    for i, value in enumerate(toplist.values):
        # Write the count inside each bar
        ax.text(.8, i, value, fontsize=12, color='white', weight='bold')
    plt.show()

In [None]:
# Ten most frequent genres
topVisualizer(series=ExploratoryData['listed_in'], Title='Top Genres', topN=10)

Indexer function for top Genre values

In [None]:
def valuesindexer(series, topN=0):
    """Collect the distinct values of ``series`` in order of first appearance.

    Parameters
    ----------
    series : pandas.Series
        Holds either scalar values or lists of values. The type of the first
        element decides which. Values must be hashable.
    topN : int, default 0
        For list entries, only the first ``topN`` items of every list are
        considered. ``0`` means no limit. Ignored for scalar entries.

    Returns
    -------
    valueList : list
        Distinct values, ordered by first appearance.
    IndexRef : pandas.Series
        The same values keyed by their position in ``valueList``.
    """
    # An empty series has no first element to inspect
    isList = len(series) > 0 and isinstance(series.iloc[0], list)

    # Dict keys keep insertion order and give constant-time membership tests
    seen = {}
    for values in series:
        if isList:
            if topN:
                values = values[:topN]      # first topN entries (was topN-1, one short)
            for value in values:
                seen.setdefault(value, None)
        else:
            seen.setdefault(values, None)
    valueList = list(seen)

    # Reference Series: position -> value
    IndexRef = pd.Series(dict(enumerate(valueList)))
    return valueList, IndexRef

In [None]:
# Distinct genre values, in order of first appearance
valuesindexer(series=ExploratoryData["listed_in"])[0]

Formatting Dataframe Column with binary list containing indexed entries for Genre values

In [None]:
def binarylistFormatter(check_list, list2):
    """Flag which entries of ``list2`` are present in ``check_list``.

    Parameters
    ----------
    check_list : list or str
        The value(s) of one row: a list of entries, or a single string that
        counts as one entry.
    list2 : list
        Indexed list of all known values (the first item returned by
        ``valuesindexer``).

    Returns
    -------
    list of int
        One flag per entry of ``list2``, in the same order: 1 when the entry is
        present in ``check_list``, otherwise 0.
    """
    # A bare string is a single value, not a collection. Without wrapping it,
    # `value in check_list` is a substring test and shorter names match inside
    # longer ones.
    members = [check_list] if isinstance(check_list, str) else check_list
    return [1 if value in members else 0 for value in list2]

def binarylistApplier(dataFrame, colReplaced, colName, topN=0):
    """Replace a column with its binary-list encoding, modifying ``dataFrame`` in place.

    The distinct values of ``dataFrame[colReplaced]`` are collected with
    ``valuesindexer`` (``topN`` is passed through to it). Every row is then
    encoded against that list with ``binarylistFormatter`` and stored in the new
    column ``colName``. Finally ``colReplaced`` is dropped. Returns ``None``.
    """
    vocabulary = valuesindexer(dataFrame[colReplaced], topN)[0]

    def encode(entry):
        # One 0/1 flag per vocabulary value
        return binarylistFormatter(entry, vocabulary)

    dataFrame[colName] = dataFrame[colReplaced].apply(encode)
    dataFrame.drop(columns=colReplaced, inplace=True)


In [None]:
# Swap 'listed_in' for its binary-list encoding, stored in the new 'Genre' column
binarylistApplier(dataFrame=ExploratoryData, colReplaced='listed_in', colName='Genre')
ExploratoryData.head()

2b) Processing Cast

Visualizer for top Cast values

In [None]:
# Fifteen most frequent cast members
topVisualizer(series=ExploratoryData['cast'], Title='Top Actor', topN=15)

Formatting Dataframe Column with binary list containing indexed entries for Cast

In [None]:
# Only the leading cast members of each title are indexed: we only need the
# actors that contributed the most to the movie, and the dataset already lists
# the actors in order of their contribution.
binarylistApplier(dataFrame=ExploratoryData, colReplaced='cast', colName='Cast', topN=5)
ExploratoryData.head()

In [None]:
# Remove the rows that do not have a director. The filtered frame must be
# assigned back (a bare expression only displays it and removes nothing), and
# the rows are renumbered because later cells address rows by position.
ExploratoryData = ExploratoryData[ExploratoryData['director'] != ''].reset_index(drop=True)
ExploratoryData.head()

2c) Processing Director

Visualizer for top Director values

In [None]:
# Ten most frequent directors
topVisualizer(series=ExploratoryData['director'], Title='Top Director', topN=10)

Formatting Dataframe Column with binary list containing indexed entries for Director

In [None]:
# Swap 'director' for its binary-list encoding, stored in the new 'Director' column
binarylistApplier(dataFrame=ExploratoryData, colReplaced='director', colName='Director')
ExploratoryData.head()

3) KNN Exploration
---

In [None]:
def Similarity(dataFrame, movieId1, movieId2):
    """Return the summed cosine distance between two rows of ``dataFrame``.

    The rows are selected by position (``iloc``). The cosine distance is taken
    separately over the 'Genre', 'Director' and 'Cast' vectors of the two rows
    and the three distances are added, so a smaller result means more similar
    titles.
    """
    first = dataFrame.iloc[movieId1]
    second = dataFrame.iloc[movieId2]

    def column_distance(column):
        # Cosine distance between the two rows' vectors for one column
        return spatial.distance.cosine(first[column], second[column])

    return column_distance('Genre') + column_distance('Director') + column_distance('Cast')

In [None]:
def predict_score(dataFrame):
    """Interactively predict a title's rating from its most similar titles.

    Prompts for a (partial) title and selects the first row whose title contains
    it. Every other title is ranked by ``Similarity`` to that row, and the
    ratings of the ``K = int(sqrt(number of rows))`` closest ones are averaged.
    The predicted rating, the actual rating and the percentage error are printed.

    Parameters
    ----------
    dataFrame : pandas.DataFrame
        Needs the columns 'title' and 'rating' plus the columns read by
        ``Similarity``. Rows are addressed by position throughout.

    Returns
    -------
    None
        Results, or a message when nothing can be predicted, are printed.
    """
    name = input('Enter a movie title: ').strip()

    # Literal, case-insensitive substring match. The input is not treated as a
    # regular expression, and titles whose capitalisation differs from
    # str.title() can still be found.
    mask = dataFrame['title'].str.contains(name, case=False, regex=False, na=False)
    hits = [pos for pos, hit in enumerate(mask) if hit]
    if not hits:
        print('No movie title contains:', name)
        return

    # The selected row and its position come from the same match. Looking the
    # position up by exact title equality failed for every partial title.
    ind = hits[0]
    base = dataFrame.iloc[ind]
    base_title = base['title']
    actual = float(base['rating'])

    print('Selected Movie:', base_title)

    def getNeighbors(K):
        """Return up to K (row position, distance) pairs, closest first."""
        distances = []
        for pos, title in enumerate(dataFrame['title']):
            if title == base_title:
                continue                    # skip the selected title itself
            dist = Similarity(dataFrame, pos, ind)
            if not math.isnan(dist):        # a NaN distance cannot be ranked
                distances.append((pos, dist))

        distances.sort(key=operator.itemgetter(1))
        return distances[:K]

    K = int(math.sqrt(dataFrame.shape[0]))
    neighbors = getNeighbors(K)
    if not neighbors:
        print('Not enough other titles to make a prediction.')
        return

    ratings = dataFrame['rating']
    # Divide by the number of neighbours actually found (equal to K whenever
    # enough titles are available)
    avgRating = sum(float(ratings.iloc[pos]) for pos, _ in neighbors) / len(neighbors)

    print('The predicted rating for %s is: %f' %(base_title, avgRating))
    print('The actual rating for %s is %f' %(base_title, actual))

    if actual == 0:
        # The relative error is undefined for a rating of zero
        print('calculated error : undefined (actual rating is 0)')
        return

    error = (abs(avgRating - actual) / actual) * 100

    print("calculated error : %0.2f" %(error))

In [None]:
##predict_score(ExploratoryData)

In [None]:
#saved_file_name = 'Table_with_rating.csv'
#df.to_csv(saved_file_name, index = False)

Now we know that cast, director and genre can be used to predict the ratings quite accurately

Now we will try different ML models to determine which one is the best for our use case

# Machine Learning

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR 
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor

In [None]:
# Separate working copy of the cleaned data for the machine-learning cells
TestDF = MovieData.copy(deep=True)

Processing the movie dataframe for machine learning

In [None]:
from sklearn import preprocessing


# Encode every director string as an integer class label. This encoder is kept
# for the director column only, so its mapping can still be inverted later.
labelEncoder = preprocessing.LabelEncoder()
TestDF['director'] = labelEncoder.fit_transform(TestDF['director'].values)

# The cast lists hold too many people, so only the first four entries of every
# list are kept, one column each. .str.get(i) returns NaN when a list has fewer
# than i+1 entries; those cells become ''. This replaces the element-wise
# chained assignment TestDF[col][ind] = value, which pandas does not guarantee
# to write through to the frame.
cast_columns = [f'cast_{i+1}' for i in range(4)]
for position, column in enumerate(cast_columns):
    TestDF[column] = TestDF['cast'].str.get(position).fillna('')

TestDF = TestDF.drop(columns=['cast'])

# One encoder is fitted on the values of all four cast columns together, so the
# same actor gets the same integer code whichever position they are billed in.
# Refitting an encoder per column gave the same actor unrelated codes.
castEncoder = preprocessing.LabelEncoder()
castEncoder.fit(pd.unique(TestDF[cast_columns].values.ravel()))
for column in cast_columns:
    TestDF[column] = castEncoder.transform(TestDF[column].values)

TestDF.head()

In [None]:
import torch

In [None]:
# Distinct genres over all titles, kept in order of first appearance
# (dict keys preserve insertion order and drop repeats)
unique_genre = list(dict.fromkeys(
    genre
    for genre_list in TestDF['listed_in']
    for genre in genre_list
))



In [None]:
# Map every genre to a 1-based integer code (its position in unique_genre + 1);
# 0 is left free to act as the padding value.
genre_code = {genre: code for code, genre in enumerate(unique_genre, start=1)}

encoded_genres = [
    [genre_code[genre] for genre in genre_list]
    for genre_list in TestDF['listed_in']
]

# Length of the longest genre list, but never below 3 (the pad width that used
# to be hard-coded), so longer lists can no longer produce ragged rows.
max_no_listed_in = max([3] + [len(codes) for codes in encoded_genres])

# Write the whole column in one assignment, right-padded with 0 to a common
# length. The element-wise TestDF['listed_in'][ind] = ... was chained
# assignment, which pandas does not guarantee to write through to the frame.
TestDF['listed_in'] = pd.Series(
    [codes + [0] * (max_no_listed_in - len(codes)) for codes in encoded_genres],
    index=TestDF.index,
)

In [None]:
# Preview of the encoded frame (the CSV export below is left disabled)
#TestDF.to_csv('ModifiedDF.csv')
TestDF.head(n=5)

In [None]:
TestDF = TestDF.drop('title', axis = 1)

# Spread the per-row genre code lists over one numeric column per position
# (listed_in_1, listed_in_2, ...). A column whose cells are lists or tensors
# keeps the frame non-numeric, so TestDF.values cannot be converted to a float
# tensor afterwards.
# NOTE(review): this changes the number of feature columns; the model's input
# width has to equal the number of columns that remain after dropping 'rating'.
genre_columns = pd.DataFrame(TestDF.pop('listed_in').tolist(), index=TestDF.index)
genre_columns = genre_columns.fillna(0)     # shorter lists are padded with 0
genre_columns.columns = [f'listed_in_{i+1}' for i in range(genre_columns.shape[1])]
TestDF = TestDF.join(genre_columns)

In [None]:
from sklearn.model_selection import train_test_split

# Fixed seed so the train/test partition, and every number derived from it,
# is the same on each run
SPLIT_SEED = 42

NNtrain_set, NNtest_set = train_test_split(TestDF, test_size = 0.2, random_state = SPLIT_SEED)

# Features are all columns except 'rating'; 'rating' is the target
X_train = NNtrain_set.drop('rating', axis = 1)
Y_train = NNtrain_set['rating']

X_test = NNtest_set.drop('rating', axis = 1)
Y_test = NNtest_set['rating']

# Training tensors with explicit dtypes: float32 features, and int64 targets
# (the conversion to long truncates each rating to its integer part).
X = torch.tensor(X_train.to_numpy(dtype='float32'))
Y = torch.tensor(Y_train.to_numpy(), dtype=torch.long)

#X

In [None]:
#import torch
import torch.nn as nn
import torch.nn.functional as F

# Fully connected network built on nn.Module: six hidden layers (fc1..fc6),
# each followed by ReLU, and a linear output layer.
class Rate(nn.Module):
    """Feed-forward network mapping ``predictor`` inputs to ``predict`` output scores."""

    def __init__(self, predictor = 6, h1 = 36, h2 = 96, h3 = 192, h4=95, h5=80, h6 =20 , predict = 10):
        """Create the layers; ``h1``..``h6`` are the widths of the hidden layers."""
        super().__init__()
        # Layers are created in network order, input side first
        self.fc1 = nn.Linear(in_features=predictor, out_features=h1)
        self.fc2 = nn.Linear(in_features=h1, out_features=h2)
        self.fc3 = nn.Linear(in_features=h2, out_features=h3)
        self.fc4 = nn.Linear(in_features=h3, out_features=h4)
        self.fc5 = nn.Linear(in_features=h4, out_features=h5)
        self.fc6 = nn.Linear(in_features=h5, out_features=h6)
        self.out = nn.Linear(in_features=h6, out_features=predict)

    def forward(self, x):
        """Run ``x`` through the hidden layers (ReLU after each) and the output layer."""
        hidden_layers = (self.fc1, self.fc2, self.fc3, self.fc4, self.fc5, self.fc6)
        for layer in hidden_layers:
            x = F.relu(layer(x))
        # No activation on the output layer: the raw scores are returned
        return self.out(x)
    

# Seed the weight initialisation so training starts from the same point on every run
torch.manual_seed(42)

# The input width is taken from the training tensor rather than the class
# default, so the network always matches the number of feature columns
rate = Rate(predictor=X.shape[1])

print(rate)

In [None]:
# Loss function: cross-entropy between the network's output scores and the targets
criterion = nn.CrossEntropyLoss()

# Optimiser: Adam over all parameters of the network, learning rate 0.01
optimizer = torch.optim.Adam(params=rate.parameters(), lr=0.01)

In [None]:
epoch = 2500
losses = []

rate.train()    # make sure the network is in training mode

for i in range(epoch):
    # Pass the inputs through the network. Calling the module, rather than
    # .forward() directly, also runs any registered hooks.
    y_pred = rate(X)

    # Measure the loss at each iteration
    loss = criterion(y_pred, Y)

    # Keep track of progress as plain Python floats
    losses.append(loss.item())

    # Print every selected interval
    if i%10 == 0:
        print(f'epoch: {i} and loss {loss}')

    # Optimise the model: reset gradients, back-propagate, update the weights
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

In [None]:
# Loss curve: one point per training epoch
fig, ax = plt.subplots()
ax.plot(range(epoch), losses)
ax.set_ylabel('loss/error')
ax.set_xlabel('Epoch')

In [None]:
# Convert the held-out features and targets from pandas objects to tensors
X_test = torch.FloatTensor(X_test.values)
Y_test = torch.LongTensor(Y_test.values)

In [None]:
# Evaluate on the held-out set: evaluation mode, no gradient tracking
rate.eval()
with torch.no_grad():
    y_eval = rate(X_test)               # call the module instead of .forward()
    loss = criterion(y_eval, Y_test)

In [None]:
# Display the current value of `loss`
loss

In [None]:
# Number of sample rows to print; one line per test row floods the output
PREVIEW_ROWS = 10

# Predict every test row in a single forward pass; the predicted class is the
# index of the largest output score
with torch.no_grad():
    predicted = rate(X_test).argmax(dim=1)

# Preview: row number, actual label, predicted label
for i in range(min(PREVIEW_ROWS, len(Y_test))):
    print(f'{i+1}.) {Y_test[i]}\t{predicted[i].item()}')

correct = int((predicted == Y_test).sum().item())

print(f'We got {correct} correct')
if len(Y_test):
    # A raw count is hard to read without the size of the test set
    print(f'Accuracy: {correct / len(Y_test):.2%}')