In [1]:
!pip install seaborn 
!pip install imdbpy
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import time
from numpy import array
from os.path import exists
import imdb
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from itertools import combinations
from sklearn.metrics.pairwise import linear_kernel
from scipy.sparse.linalg import svds
import math
!pip install nltk
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords



In [2]:
# sorting out arguments
userId = 3223         # user to recommend for; ids are 0-based once makeRatings has subtracted 1
size = 1000000        # number of rows of ratings.csv that makeRatings keeps
test = True           # True: hold back part of the user's ratings and log evaluation metrics
personalised = False  # not read by any of the cells shown below

# Hand-written ratings that can be injected for the chosen user. They use
# userId + 1 because makeRatings shifts every userId down by one.
# The list is built and then immediately discarded by the reset underneath;
# remove the `entry = []` line to actually inject these ratings.
entry = [
    {'userId': userId + 1, 'movieId': sampleMovie, 'rating': sampleRating, 'timestamp': 1147880044}
    for sampleMovie, sampleRating in ((4993, 4.5), (7153, 5.0), (98809, 4.0), (106489, 5.0))
]
entry = []

## Get Data

In [3]:
def calcMean(ratings, minCount=10):
    """Return the mean rating and the number of ratings for each movie.

    Parameters
    ----------
    ratings : pandas.DataFrame
        Must contain the columns 'movieId' and 'rating'.
    minCount : int, default 10
        Movies with fewer than this many rating rows are left out.

    Returns
    -------
    pandas.DataFrame
        Columns 'movieId', 'rating' (mean rating) and 'count' (number of
        rating rows), ordered by movieId. Row labels of the removed movies
        are left as gaps, i.e. the index is not renumbered.
    """
    # A single named aggregation fixes the output column names explicitly.
    # The previous concat + rename({"index": ..., "movieId": "count"}) relied on
    # how value_counts() names its result, which changed between pandas versions.
    # 'size' counts rows (like value_counts did), not non-null ratings.
    df = (
        ratings.groupby('movieId')['rating']
        .agg(rating='mean', count='size')
        .reset_index()
    )

    # keep only movies rated at least `minCount` times
    return df[df['count'] >= minCount]

def makeRatings(ratings,entry,size):
    """Truncate the ratings table, optionally add extra ratings, and shift userIds.

    Parameters
    ----------
    ratings : pandas.DataFrame
        Raw ratings with at least 'userId', 'movieId' and 'rating' columns.
        The frame passed in is not modified.
    entry : list of dict
        Extra rating rows to add after truncation (same keys as the columns
        of `ratings`). An empty list adds nothing.
    size : int
        Number of leading rows of `ratings` to keep.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        The processed ratings (userId reduced by 1) and the per-movie
        statistics produced by calcMean for them.
    """
    # Keep the first `size` rows whatever the length of the input; the old
    # slice end was hard-coded to the row count of one particular file.
    ratings = ratings.iloc[:size].copy()

    if len(entry) > 0:
        # new entry for testing; DataFrame.append no longer exists in pandas 2.x
        ratings = pd.concat([ratings, pd.DataFrame(entry)], ignore_index=True)

    # make all userIds start at 0
    ratings['userId'] = ratings['userId'] - 1

    df = calcMean(ratings)

    return ratings, df


# Read the raw MovieLens ratings and pass them through makeRatings. The two
# results become notebook-wide globals: `ratings` (processed ratings) and
# `df` (per-movie mean/count table, read later by getMapping and the cache cell).
ratings, df = makeRatings(pd.read_csv('ml-25m/ratings.csv'),entry,size)

In [4]:
def makeMovies(movies):
    """Turn the pipe-separated 'genres' text of every movie into a list.

    The frame is changed in place and the same object is returned.
    """
    genre_lists = pd.Series(
        [genre_text.split('|') for genre_text in movies['genres']],
        index=movies.index,
        dtype=object,
    )
    movies['genres'] = genre_lists
    return movies

def getMapping(movies, ratedMovies=None):
    """Keep only the rated movies and give them consecutive ids starting at 0.

    Parameters
    ----------
    movies : pandas.DataFrame
        Movie table with a 'movieId' column. It is not modified.
    ratedMovies : pandas.DataFrame, optional
        Table whose 'movieId' column lists the movies to keep. Defaults to
        the notebook global `df`.

    Returns
    -------
    (pandas.DataFrame, dict)
        The filtered table with a new leading 'movieId' column (0..n-1) and
        the original id kept as 'oldMovieId', plus a dict mapping each
        original id to its new id.
    """
    if ratedMovies is None:
        ratedMovies = df

    # remove movies that are not in ratedMovies
    keep = movies['movieId'].isin(ratedMovies['movieId'])
    movies = movies.loc[keep].rename(columns={'movieId': 'oldMovieId'})

    # Number the rows that actually survived the filter. Using the length of
    # ratedMovies instead fails with a length mismatch whenever one of its
    # ids is missing from the movie table.
    movies.insert(0, 'movieId', np.arange(len(movies)))

    mapping = dict(zip(movies['oldMovieId'], movies['movieId']))
    return movies, mapping
    
# Build the movie table: split the genres, then keep only the movies listed in
# `df` and renumber them. `mapping` (old movieId -> new movieId) is reused by
# later cells to renumber the other tables.
movies = makeMovies(pd.read_csv('ml-25m/movies.csv'))
movies, mapping = getMapping(movies)

In [5]:
# reindex all the tables store as csv for fast completion
# The cache file name depends only on `size`, so a cached file is reused even
# when `entry` or the source data changed; delete the CSV to force a rebuild.
if not exists('s' + str(size)+'ratings.csv'):
    # drop ratings of movies that are not in `df`, then renumber movieId via `mapping`
    ratings.drop(ratings[~ratings.movieId.isin(df.movieId)].index, inplace = True)
    ratings.replace({'movieId': mapping}, inplace = True)
    ratings.to_csv('s' + str(size)+'ratings.csv')
else:
    # the first CSV column holds the row labels written by to_csv above
    ratings = pd.read_csv('s' + str(size)+'ratings.csv', index_col=[0])

# NOTE: despite its name, `links` holds the genome tag table (tagId -> tag);
# it is read by find() further down.
links = pd.read_csv('ml-25m/genome-tags.csv')

In [6]:
# Summary statistics of the numeric columns of the filtered movie table.
movies.describe()

Unnamed: 0,movieId,oldMovieId
count,7440.0,7440.0
mean,3719.5,34125.117742
std,2147.887334,48601.672617
min,0.0,1.0
25%,1859.75,2508.5
50%,3719.5,5668.5
75%,5579.25,57969.5
max,7439.0,204698.0


In [7]:
# Number of ratings per user, largest first; shows the 800 most active users.
ratings.groupby(['userId']).count().rating.sort_values(ascending = False).head(800)


# 2176 - 3221
# 4740 - 1224
# 3402 - 919
# 5440 - 689
# 356 - 578
# 1721 - 419
# 2964 - 248
# 4895 - 158 
# 1140 - 130
# 20 - 103
# 3691 - 97
# 4157 - 84
# 1451 - 70
# 2048 - 65
# 3737 - 59
# 3223 - 52
# 2792 - 25

# 5235 62
# 5173 65
# 1286 67
# 977 73
# 5774 76
# 2608 84

# 5595 117
# 5077 106
# 5757 145
# 1971 162
# 5791 173
# 6327 212
# 4480 198
# 279 300


userId
2176    3221
1747    2862
2981    2376
547     2348
3149    2309
        ... 
279      300
4060     299
4903     299
5119     298
2690     298
Name: rating, Length: 800, dtype: int64

In [8]:
# Largest userId present in the processed ratings.
ratings.userId.max()

6746

## SVD

In [9]:
#https://github.com/nikitaa30/Recommender-Systems/blob/master/matrix_factorisation_svd.py

# User x movie rating matrix; unrated cells become 0.
# this is why all the movies are reindexed
# NOTE: pivot raises if the same (userId, movieId) pair occurs more than once.
ratings_mat = ratings.pivot(
    index='userId',
    columns='movieId',
    values='rating'
).fillna(0)

# Centre every user's row on that row's mean. The mean is taken over all
# columns, so the zero-filled unrated movies are included in it.
R = ratings_mat.values
user_ratings_mean = np.mean(R, axis = 1)
R_demeaned = R - user_ratings_mean.reshape(-1, 1)

# Truncated SVD with 50 singular values, then rebuild the full matrix.
U, sigma, Vt = svds(R_demeaned, k = 50)

sigma = np.diag(sigma)
all_user_predicted_ratings = np.dot(np.dot(U, sigma), Vt) + user_ratings_mean.reshape(-1, 1)

# Keep the userId labels on the rows as well as the movieId labels on the
# columns, so a user's predictions can be looked up by id rather than by
# position (positions and ids differ as soon as a userId is missing).
preds_df = pd.DataFrame(
    all_user_predicted_ratings,
    index = ratings_mat.index,
    columns = ratings_mat.columns
)

In [10]:
# need to use the prediction score
def recommend_movies(preds_df, userID, movies_df, original_ratings_df, num_recommendations=5, holdout=None):
    """Pick the highest-scored movies the user is not already known to have rated.

    Parameters
    ----------
    preds_df : pandas.DataFrame
        Predicted ratings, one row per user (row label = userId) and one
        column per movieId; the column axis must be named 'movieId'.
    userID : int
        Id of the user, in the same id space as `original_ratings_df.userId`.
    movies_df : pandas.DataFrame
        Movie table with a 'movieId' column (plus e.g. 'title').
    original_ratings_df : pandas.DataFrame
        Ratings with 'userId', 'movieId' and 'rating' columns.
    num_recommendations : int, default 5
        Number of movies to return.
    holdout : bool, optional
        When true, only the higher-rated half of the user's ratings counts as
        "already rated"; the rest may be recommended again so it can be used
        for evaluation. Defaults to the notebook global `test`.

    Returns
    -------
    (user_full, recommendations, sorted_user_predictions)
        The user's (possibly halved) ratings joined with movie info, the
        recommended rows of `movies_df`, and all predicted scores for the
        user sorted from highest to lowest.

    Raises
    ------
    KeyError
        If `userID` is not a row label of `preds_df`.
    """
    if holdout is None:
        holdout = test

    # Look the row up by the user's id. The ratings are filtered with
    # `userId == userID` below, so the predictions must come from the same id;
    # the earlier `userID - 1` row belonged to a different user because the
    # ids are already 0-based.
    sorted_user_predictions = preds_df.loc[userID].sort_values(ascending=False)

    user_data = original_ratings_df[original_ratings_df.userId == userID]
    user_full = user_data.merge(movies_df, how='left', on='movieId').sort_values(['rating'], ascending=False)

    if holdout:
        # keep the floor(n / 2) highest-rated rows (same rows as the old
        # `[:-int(n)//2]` slice, written without the precedence puzzle)
        user_full = user_full.iloc[:len(user_full) // 2]

    scored = sorted_user_predictions.rename('Predictions').reset_index()
    unseen = movies_df[~movies_df['movieId'].isin(user_full['movieId'])]
    recommendations = (
        unseen.merge(scored, how='left', on='movieId')
        .sort_values('Predictions', ascending=False)
        .head(num_recommendations)
        .drop(columns='Predictions')
    )

    return user_full, recommendations, sorted_user_predictions

# Ask for 20 recommendations for `userId`; only the movieId and title columns
# of the movie table are passed on.
already_rated, predictions, sorted_user_predictions = recommend_movies(preds_df, userId, movies[['movieId','title']], ratings, 20)


In [11]:
# Turn the score Series into a table with movieId as a regular column.
# NOTE: this cell rebinds `sorted_user_predictions`, so it is meant to be run
# once after the cell above.
sorted_user_predictions = pd.DataFrame(sorted_user_predictions).reset_index()
# look up each movie's title (one scan of `movies` per row)
sorted_user_predictions['title'] = [movies[movies.movieId == x].title.values[0] for x in sorted_user_predictions['movieId']]
# the second column holds the scores; its original name is the label of the selected prediction row
sorted_user_predictions.rename(columns={sorted_user_predictions.columns[1]: "predictedEvaluation"}, inplace = True)
# copy the predicted score of every recommended movie into `predictions`
predictions['predictedEvaluation'] = [sorted_user_predictions[sorted_user_predictions.movieId == x].predictedEvaluation.values[0] for x in predictions['movieId']]

In [12]:
# these are the movies used for CBF
# i.e. every movie that is either recommended or in the user's already_rated table
testmovies = movies[movies.movieId.isin(predictions.movieId) | movies.movieId.isin(already_rated.movieId)]

## Do TF-IDF

In [13]:
# natural language processing initialisation

# Download every NLTK resource used below, in one place.
for _resource in ('punkt', 'averaged_perceptron_tagger', 'wordnet', 'omw-1.4', 'stopwords'):
    nltk.download(_resource)

# Part-of-speech tags that getDescription lemmatises as verbs.
VERB_CODES = {'VBZ', 'VBP', 'VBN', 'VBG', 'VBD', 'VB'}

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\reggy\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\reggy\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\reggy\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\reggy\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\reggy\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [12]:
# this gets the plot outline and keeps its first 3 content words
def getDescription(row):
    """Replace the third field of a links row with keywords from the plot outline.

    The plot outline is fetched through the global IMDbPY client `ia`,
    lower-cased, tokenised and lemmatised (verbs as verbs). Stop words and
    tokens that are not purely alphabetic are dropped, and the first three
    remaining words are kept. Note that these are the first words in reading
    order, not a relevance ranking.

    Parameters
    ----------
    row : pandas.Series
        One row of the links table. The third field is overwritten with the
        keyword list (the caller renames that column to 'description').

    Returns
    -------
    pandas.Series
        The same row, whose third field is a list of at most three words, or
        an empty list when the lookup or the text processing failed.
    """
    start = time.time()

    # Address the third field by its label; integer keys on a labelled Series
    # are deprecated as positional access.
    target = row.index[2]
    lookup_id = row[target]
    keywords = []
    try:
        # IMDbPY identifies films by IMDb id, not by the TMDb id stored in the
        # third column. links.csv appears to store the IMDb id as a number
        # without leading zeros, so pad it to 7 digits. TODO confirm against
        # the data set README.
        if 'imdbId' in row.index:
            lookup_id = str(int(row['imdbId'])).zfill(7)

        outline = ia.get_movie(lookup_id).get('plot outline')
        if outline:
            words = nltk.word_tokenize(outline.lower())
            for word, tag in nltk.pos_tag(words):
                if tag in VERB_CODES:
                    lemma = lemmatizer.lemmatize(word, 'v')
                else:
                    lemma = lemmatizer.lemmatize(word)
                # isalpha() also removes contraction pieces such as "n't",
                # so no contraction expansion is needed afterwards
                if lemma not in stop_words and lemma.isalpha():
                    keywords.append(lemma)
            keywords = keywords[:3]
    except Exception as error:
        # best effort: a failed lookup leaves the movie without keywords
        print('description lookup failed for', lookup_id, '-', error)
        keywords = []

    row[target] = keywords
    print(time.time()-start)
    return row

# takes very long to compute when theres lots of predictions so we store as csv
# NOTE: `movieLinks` and `ia` are only created when the cache file is missing;
# the cache itself is written by a later cell.
if not exists(str(test) +'u' + str(userId) + 's' + str(size)+'testmovies.csv'):
    # only get info for rated and predicted movies
    movieLinks = pd.read_csv('ml-25m/links.csv')
    movieLinks = movieLinks[movieLinks.movieId.isin(testmovies.oldMovieId)]
    # renumber movieId with the same mapping used for the other tables
    movieLinks.replace({'movieId': mapping}, inplace = True)
    ia = imdb.IMDb()
    # tmdbId is cast to text because getDescription overwrites that column
    movieLinks = movieLinks.astype({'tmdbId':str})
    # one remote lookup per row
    movieLinks = movieLinks.apply(getDescription, axis = 1)
    movieLinks.rename(columns={"tmdbId": "description"},inplace = True)

In [13]:
# turns the tagIds into tags
def find(row, tagTable=None):
    """Replace the tag ids in the second field of `row` with the tag texts.

    Parameters
    ----------
    row : pandas.Series
        Row whose second field is a list of tag ids. The list is modified in
        place.
    tagTable : pandas.DataFrame, optional
        Table with 'tagId' and 'tag' columns. Defaults to the notebook global
        `links`.

    Returns
    -------
    pandas.Series
        The same row.

    Raises
    ------
    KeyError
        If a tag id is not present in `tagTable`.
    """
    if tagTable is None:
        tagTable = links

    # one dictionary instead of one table scan per tag
    name_by_id = dict(zip(tagTable['tagId'], tagTable['tag']))

    # explicit positional access; integer keys on a labelled Series are deprecated
    tag_ids = row.iloc[1]
    tag_ids[:] = [name_by_id[tag_id] for tag_id in tag_ids]
    return row

def _as_token_list(value):
    """Return `value` as a list; parses the "['a', 'b']" text form of a list."""
    if isinstance(value, str):
        return value.strip('][').replace('\'', '').split(', ')
    return list(value)

# collects all the tags, genres and descriptive key words
def combine(row, tagTable=None, descTable=None):
    """Merge a movie's tags, genres and description words into one string.

    Parameters
    ----------
    row : pandas.Series
        Movie row; the first field is the movie id and the fourth field is
        the list of genres. The fourth field is overwritten with the result.
    tagTable : pandas.DataFrame, optional
        Table with 'movieId' and 'tag' (list of tags) columns. Defaults to
        the notebook global `taglist`.
    descTable : pandas.DataFrame, optional
        Table with 'movieId' and 'description' columns, where a description
        is a list of words or the text form of such a list. Defaults to the
        notebook global `movieLinks`.

    Returns
    -------
    pandas.Series
        The same row, whose fourth field is a lower-case, space-separated
        string of the distinct tokens in the order tags, genres, description.
    """
    if tagTable is None:
        tagTable = taglist
    if descTable is None:
        descTable = movieLinks

    # explicit positional access; integer keys on a labelled Series are deprecated
    movie_id = row.iloc[0]
    tags = tagTable.loc[tagTable['movieId'] == movie_id, 'tag'].values
    desc = descTable.loc[descTable['movieId'] == movie_id, 'description'].values

    # a movie may have no tags and/or no description
    tokens = []
    if len(tags) > 0:
        tokens += list(tags[0])
    tokens += list(row.iloc[3])
    if len(desc) > 0:
        tokens += _as_token_list(desc[0])

    # Lower-case before removing duplicates so "Dark" and "dark" collapse into
    # one token; empty tokens (e.g. from parsing "[]") are skipped.
    unique_tokens = list(dict.fromkeys(token.lower() for token in tokens if token))

    # turns back into a string
    row.iloc[3] = ' '.join(unique_tokens)
    return row

# gets the 3 most relevant tags
def getTop3Tags(tags, tagTable=None):
    """Return, for every movie, the texts of its three most relevant tags.

    Parameters
    ----------
    tags : pandas.DataFrame
        Relevance scores with 'movieId', 'tagId' and 'relevance' columns.
    tagTable : pandas.DataFrame, optional
        Table with 'tagId' and 'tag' columns used to translate ids into
        texts. Defaults to the notebook global `links`.

    Returns
    -------
    pandas.DataFrame
        One row per movie with the columns 'movieId' and 'tag'; 'tag' is a
        list of up to three tag texts ordered from most to least relevant.
        A tag id missing from `tagTable` shows up as NaN in the list.
    """
    if tagTable is None:
        tagTable = links

    # Sort once, then take the first three rows of every movie. This avoids
    # groupby.apply, whose handling of the grouping column is deprecated.
    top = (
        tags.sort_values('relevance', ascending=False, kind='stable')
        .groupby('movieId')
        .head(3)
    )

    # translate tag ids to texts in one vectorised lookup
    name_by_id = dict(zip(tagTable['tagId'], tagTable['tag']))
    top = top.assign(tag=top['tagId'].map(name_by_id))

    # groupby keeps the row order inside each group, i.e. the relevance order
    taglist = top.groupby('movieId')['tag'].apply(list).reset_index()
    return taglist

# only get the tags for rated and predicted movies and reindex movieId
# The cache file name combines `test`, `userId` and `size`.
if not exists(str(test) + 'u' + str(userId) + 's' + str(size)+'testmovies.csv'):
    tags = pd.read_csv('ml-25m/genome-scores.csv')
    tags = tags[tags.movieId.isin(testmovies.oldMovieId)]
    tags.replace({'movieId': mapping}, inplace = True)
    taglist = getTop3Tags(tags)
    # combine() overwrites the 'genres' field of every row with one string of
    # tags, genres and description words
    testmovies = testmovies.apply(combine, axis =1)
    testmovies.to_csv(str(test) + 'u' + str(userId) + 's' + str(size)+'testmovies.csv')
    
else:
    # NOTE(review): no index_col is given here, so the row labels written by
    # to_csv come back as an extra first column - confirm this is harmless
    # for the cells below.
    testmovies = pd.read_csv(str(test) + 'u' + str(userId) + 's' + str(size)+'testmovies.csv')



In [14]:
# do TFIDF
# 'genres' holds the combined tag/genre/description string at this point
tf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tf.fit_transform(testmovies['genres'])

# create the cosine similarity matrix
# (a plain dot product; equals cosine similarity provided the TF-IDF rows are
# unit length, which is TfidfVectorizer's default normalisation)
sim_matrix = linear_kernel(tfidf_matrix,tfidf_matrix)

# Done for reference to sim_matrix
# After the reset, the row labels of testmovies are the row positions of sim_matrix.
testmovies = testmovies.reset_index()
testmovies.drop(testmovies.columns[0], axis =1,inplace = True)

simMetric = []
simMovie = []
# NOTE: `i` is incremented in the loop below but never read
i = 0

# calculates similarity from predictions and already rated
for index, row in predictions.iterrows():
    # finds the position of the movie in the sim_matrix
    movieIndex = testmovies[testmovies.movieId == row.movieId].movieId.index.values[0]
    # `temp` is a row of sim_matrix, so the assignment below also changes sim_matrix
    temp = sim_matrix[movieIndex]
    
    # makes the diagonal = 0 so doesnt compare with itself
    temp[movieIndex] = 0
    
    # highest similarity to any other test movie, and the movie that achieves it
    simMetric.append(np.max(temp))
    basedOn = testmovies.iloc[np.argmax(temp)]
    
    # if the movie is most similar to an already rated movie we store it
    # (otherwise the lookup fails and NaN is stored; the bare except also
    # hides any other error raised by this line)
    try:
        simMovie.append(already_rated[already_rated.movieId == basedOn.movieId].title.values[0])
    except:
        simMovie.append(np.nan)
    i = i + 1 
    
predictions['similarity'] = simMetric
predictions['basedOn'] = simMovie

## Expert System

In [15]:
# puts the average rating and count in the prediction table
# calcMean is recomputed on the current `ratings`; it only returns movies that
# reach its minimum rating count, so a recommended movie below that count
# would make the `.values[0]` lookups fail.
averageRating = calcMean(ratings)
predictions['averageRatings'] = [averageRating[averageRating.movieId == x].rating.values[0] for x in predictions['movieId']]
predictions['count'] = [averageRating[averageRating.movieId == x]['count'].values[0] for x in predictions['movieId']]

In [16]:
# calculates the importance by product of the normalised values
def importance(row, table=None):
    """Return the product of a row's similarity, average rating and rating
    count, each divided by the largest value of that column in `table`.

    Parameters
    ----------
    row : pandas.Series
        Row with 'similarity', 'averageRatings' and 'count' fields.
    table : pandas.DataFrame, optional
        Table that supplies the column maxima. Defaults to the notebook
        global `predictions`.
    """
    if table is None:
        table = predictions

    # Column labels replace the positions 3, 5 and 6, which only worked for
    # one particular column order (and integer keys on a labelled Series are
    # deprecated as positional access).
    score = 1.0
    for column in ('similarity', 'averageRatings', 'count'):
        score = score * row[column] / table[column].max()
    return score
# Weight every predicted score by its importance and order the recommendations
# by the weighted score, best first.
predictions['importance'] = predictions.apply(importance, axis=1)
predictions['finalEval'] = predictions['importance']*predictions['predictedEvaluation']
predictions = predictions.sort_values('finalEval',ascending=False)

In [17]:
# makes the actual predictedEvaluation represent a score /5
# meanPred: mean predicted score of the recommended movies
# meanUserRating: mean of all ratings stored for the chosen user
meanPred = predictions['predictedEvaluation'].mean()
meanUserRating = ratings[ratings.userId == userId].rating.mean()

# helps to normalise the rows closer to the average user rating
def normalise(row, userMean=None, predMean=None):
    """Rescale one predicted score towards the user's rating scale.

    The score is multiplied by userMean / predMean, nudged towards userMean,
    capped at 5 and rounded to the nearest half.

    Parameters
    ----------
    row : pandas.Series
        Row with a 'predictedEvaluation' field. The row is not modified.
    userMean : float, optional
        Mean rating of the user. Defaults to the notebook global
        `meanUserRating`.
    predMean : float, optional
        Mean predicted score. Defaults to the notebook global `meanPred`.

    Returns
    -------
    float
        The rescaled score.
    """
    if userMean is None:
        userMean = meanUserRating
    if predMean is None:
        predMean = meanPred

    # mean of our predictions should be the average user rating
    ratio = userMean / predMean
    # read the score by label instead of by position 2
    score = row['predictedEvaluation'] * ratio

    # brings it closer to the mean user rating
    if score > userMean:
        score = score - score * ratio * 0.1

    # Deliberately a second `if`, not `elif`: a score that the step above
    # pushed below the mean is raised again, exactly as before.
    # NOTE(review): confirm whether that double adjustment is intended.
    if score < userMean:
        score = score + score * ratio * 0.1

    if score > 5:
        score = 5

    # round to nearest half
    return round(score * 2) / 2

# Overwrite the raw predicted scores with their normalised values.
# NOTE: not idempotent - running this cell again normalises the already
# normalised scores.
predictions['predictedEvaluation'] = predictions.apply(normalise, axis = 1)

In [18]:
# Evaluation: compare the recommendations with the user's known ratings and
# add one row of metrics to the results CSV for this `size`.
if test == True:
    
    # gets the test table so can add a new result
    if not exists('s' + str(size)+'testTable.csv'):
        testTable = pd.DataFrame()
    else:
        testTable = pd.read_csv('s' + str(size)+'testTable.csv',index_col=0, names =['userId','numPredictions', 'accuracy', 'meanAverageError','explainability','diversity'] )
        # duplicates the column names for some reason
        # NOTE(review): most likely because `names=` is passed without
        # `header=0`, so the header line of the file is read as a data row.
        testTable = testTable.iloc[1: , :]
        
    # all ratings stored for the user (including the ones held back from the
    # "already rated" list)
    ogRatings = ratings[ratings.userId == (userId)]
    evalCount = 0
    mae = 0
    diversity = 0
    # number of ratings stored for the user
    numPredictions = ratings.groupby(['userId']).count()[ratings.groupby(['userId']).count().index == userId].rating.values[0]
    # share of recommendations that have a non-missing 'basedOn' movie
    explan = predictions["basedOn"].count() / len(predictions)
    avSim = predictions['similarity'].mean()
    
    # for calculation of mae and the amount of correctly guessed movies
    # row[0], row[2] and row[3] are read by position; with the column order
    # built in the earlier cells these are movieId, predictedEvaluation and
    # similarity - TODO confirm if columns are ever added or reordered.
    for index, row in predictions.iterrows():
        if row[0] in ogRatings.movieId.unique():
            evalCount +=1
            mae += abs(row[2] - ogRatings[ogRatings.movieId == row[0]].rating.values[0])
        diversity += abs(avSim - row[3])
        
    # averaging some values    
    if evalCount ==0:
        mae = np.nan
        
    else:  
        mae /= evalCount 
    
    # 20 is the number of recommendations requested from recommend_movies;
    # these two divisions must be kept in step with that value.
    evalCount /= 20
    diversity /= 20*avSim
    
    # store to the test table
    row = pd.DataFrame({'userId':userId,'numPredictions':numPredictions,'accuracy':evalCount, 'meanAverageError':mae, 'explainability':explan,'diversity':diversity}, index = [0])
    testTable = pd.concat([testTable,row])
    testTable.to_csv('s' + str(size)+'testTable.csv')
    
'''print('number of correctly predicted movies =' ,evalCount*20)
print('accuracy =',evalCount*100, "%")
print('mean average error of rating predictions = ', mae)
print('percentage of explainable recommendations =',explan *100,'%')
print('diversity score =',diversity*100, "%")
'''

'print(\'number of correctly predicted movies =\' ,evalCount*20)\nprint(\'accuracy =\',evalCount*100, "%")\nprint(\'mean average error of rating predictions = \', mae)\nprint(\'percentage of explainable recommendations =\',explan *100,\'%\')\nprint(\'diversity score =\',diversity*100, "%")\n'

In [19]:
# make non personalised version

# find a user with a lot of ratings 
# get rid of some of their ratings 
# precision = correctly recommended content/ total content
# diversity = sum of(similarity - averagesimilarity) / number of predictions

In [20]:
# diagrams graphs
# accuracy of the rating predictions done by multiplying the average rating with (final eval * something that gives it a max of 5)
# diversity measured by how many different genres
# diversity also can be measured by the similarity metric


# 10 users
# 2176 - 3221
# 4740 - 1224
# 3402 - 919
# 5440 - 689
# 356 - 578
# 1721 - 419
# 2964 - 248
# 1140 - 130 
# 3223 - 52
# 2792 - 25
