In [38]:
import numpy as np
import pandas as pd
import math
from sklearn.metrics import mean_squared_error
import random
from math import sqrt
import pymongo
from pymongo import MongoClient
import json

In [39]:
connection = MongoClient('localhost', 27017)
db = connection.recsys

In [40]:
ratings = db.ratings
movies = db.movies
recommendations = db.recommendations

In [41]:
df = pd.DataFrame(list(ratings.find()))
movies = pd.DataFrame(list(movies.find()))

In [42]:
new_df = pd.concat([movies.drop('genres', 1), movies['genres'].str.get_dummies(sep="|")], 1)

In [43]:
new_df.drop(new_df[new_df['(no genres listed)']==1].index,inplace=True)

In [44]:
new_df = new_df.drop(['(no genres listed)'], axis = 1) 

In [45]:
df = pd.merge(df, new_df, on='movieId')

In [46]:
df.columns

Index(['_id_x', 'userId', 'movieId', 'rating', 'timestamp', '_id_y', 'title',
       'Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime',
       'Documentary', 'Drama', 'Fantasy', 'FilmNoir', 'Horror', 'IMAX',
       'Musical', 'Mystery', 'Romance', 'SciFi', 'Thriller', 'War', 'Western'],
      dtype='object')

In [47]:
genre_cols = ['Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime', 
       'Documentary', 'Drama', 'Fantasy', 'FilmNoir', 'Horror', 'IMAX', 
       'Musical', 'Mystery', 'Romance', 'SciFi', 'Thriller', 'War', 'Western']

In [48]:
NO_OF_GENRES = 19
NO_OF_FEATURES = 20

In [49]:
gr = np.zeros(NO_OF_GENRES)
gf = np.zeros(NO_OF_GENRES)
rgr = np.zeros(NO_OF_GENRES)
rgf = np.zeros(NO_OF_GENRES)
mrgf = np.zeros(NO_OF_GENRES)

# tr - total_ratings
# gr - genre_rating
# gf - genre_frequency
# rgr - realtive_genre_ratings
# rgf - relative_genre_frequency
# mrgf - modified_relative_genre_frequency
# gim - genre interestingness measure

In [50]:
def total_rating(user):
    tr = 0
    for k in user['rating']:
        tr = tr+k
    return tr

In [51]:
def genre_rating(user):
    """Get genre rating for movies"""
    for i in range(0, NO_OF_GENRES):
        gr[i] = np.dot(user['rating'], user[genre_cols[i]])
    return gr

In [52]:
genre_rating(df[df['userId']==1])

array([389., 373., 136., 191., 355., 196.,   0., 308., 202.,   5.,  59.,
         0., 103.,  75., 112., 169., 228.,  99.,  30.])

In [53]:
def genre_frequency(user):
    """Get genre frequency for all the movies i.e. how many movies have a particular genre."""
    for i in range(0, NO_OF_GENRES):
        gf[i] = user[genre_cols[i]].sum()
    return gf

In [54]:
def relative_genre_rating(gr, tr):
    """Relative Genre Rating= Genre Ratings/ Total Rating"""
    for i in range(0, NO_OF_GENRES):
        rgr[i] = gr[i] / tr
    return rgr

In [55]:
def relative_genre_frequency(gf, tf):
    """Relative Genre Frequency=Genre Frequency/Total Frequency"""
    for i in range(0, NO_OF_GENRES):
        rgf[i] = gf[i] / tf
    return rgf

In [56]:
def add_for_mrgf(movies):
    total = np.zeros(NO_OF_GENRES)
    for i in range(0, NO_OF_GENRES):
        m_t = movies.loc[movies[genre_cols[i]] == 1]
        for j in m_t['rating']:
            total[i] = total[i] + (j - 2)
    return total

In [57]:
def modified_relative_genre_frequency(movies, tf):
    """Modified Relative Genre Frequency = add_for_mrgf/(3*Total Frequency)"""
    added = add_for_mrgf(movies)
    for i in range(0, NO_OF_GENRES):
        mrgf[i] = (added[i]) / (3.0 * tf)
    return mrgf

In [58]:
def gim_final(user_movies):
    """Get GIM of movies of particular user"""

    gim_list = [0] * NO_OF_FEATURES
    tf = user_movies.shape[0]
    # print tf
    tr = 0
    for k in user_movies['rating']:
        tr = tr + k
    movies = user_movies.loc[user_movies['rating'] >= 3.0]

    gr = genre_rating(movies)
    # gf = genre_frequency(movies)
    rgr = relative_genre_rating(gr, tr)
    # rgf = relative_genre_frequency(gf, tf)
    mrgf = modified_relative_genre_frequency(movies, tf)

    nf = 5.0
    for i in range(0, NO_OF_GENRES):
        if((rgr[i] + mrgf[i])==0):
            gim_list[i]=0
        else:
            gim_list[i] = (2 * nf * mrgf[i] * rgr[i]) / (rgr[i] + mrgf[i])
    gim_list = np.nan_to_num(gim_list)
    return gim_list


In [59]:
class GIM:
    """GIM- Genre Interestingness Measure"""

    def __init__(self):
        pass

    def gim_a(self, gim, i):
        """Method to get fuzzy set value for very_bad, bad, average, good."""
        if gim <= i - 2 or gim > i:
            return 0.0
        elif i - 2 < gim <= i - 1:
            return gim - i + 2.0
        elif i - 1 < gim <= i:
            return float(i - gim)

    def very_bad(self, gim):
        if gim <= 1.0:
            return 1.0
        else:
            return 0.0

    def bad(self, gim):
        return self.gim_a(gim, 2.0)

    def average(self, gim):
        return self.gim_a(gim, 3.0)

    def good(self, gim):
        return self.gim_a(gim, 4.0)

    def very_good(self, gim):
        return self.gim_a(gim, 5.0)

    def excellent(self, gim):
        if gim <=4.0:
            return 0.0
        else:
            return (gim-4.0)

    def get_fuzzy_set(self, gim_value):
        """Get fuzzy set of gim(list of values) based on given gim value."""
        return [self.very_bad(gim_value),
                self.bad(gim_value),
                self.average(gim_value),
                self.good(gim_value),
                self.very_good(gim_value),
                self.excellent(gim_value)]

In [60]:
def euclidean_dist(list_a, list_b):
    """Return the Euclidean distance between two array elements."""
    return np.linalg.norm(np.array(list_a) - np.array(list_b))


def fuzzy_dist(first_point, second_point, fuzzy_set_first_point, fuzzy_set_second_point):
    """Returns fuzzy distance between two values and their fuzzy sets."""
    return abs(first_point - second_point) * euclidean_dist(fuzzy_set_first_point, fuzzy_set_second_point)

In [61]:
gim_obj = GIM()
def fuzzy_distance(ui, uj):
    """Returns fuzzy distance between given points."""

    fuzzy_dis = [0] * NO_OF_FEATURES

    # Get fuzzy set values for movie genres
    for i in range(0, NO_OF_GENRES):
        ui_gim = gim_obj.get_fuzzy_set(ui[i])
        uj_gim = gim_obj.get_fuzzy_set(uj[i])
        fuzzy_dis[i] = fuzzy_dist(ui[i], uj[i], ui_gim, uj_gim)

    # adding user_id of second user
#     fuzzy_dis[NO_OF_FEATURES-1] = uj['user_id']
    return fuzzy_dis

In [62]:
def get_neighbours(u1,WEIGHTS):

    # Save active users and its neighbours in a data-frame with active users' id as column name
    user_neighbours= pd.DataFrame(columns = ['userId'])
    NO_OF_NEIGHBOURS=25
    fuzzy_vec = []
    dist = []
    # Iterate over active users model and save neighbours of each active users in user_neighbours
    us1 = df[df['userId']==u1]
    us1 = gim_final(us1)
    j=0
    for u2 in range(1,150):
        if(u1==u2):
            continue
        else:
            us2 = df[df['userId']==u2]
            us2 = gim_final(us2)
            fuzzy_vec = fuzzy_distance(us1,us2)
            fuzzy_gen_dist = np.sum(np.multiply(WEIGHTS, fuzzy_vec)) ** 0.5
            fuzzy_gen_dist = 1/(1+fuzzy_gen_dist)
            dist.append([fuzzy_gen_dist, u2])
            j=j+1
    user_neighbours = [n for n in sorted(dist, reverse=True)][:NO_OF_NEIGHBOURS]
    
    return user_neighbours

In [63]:
def recommend(nearest_neighbours, test_users_data):
    """Recommend rating for given movies i.e. test_examples based on nearest neighbours.

    Also return actual and predicated ratings for testing users
    """
    predicated_rat=[]
    actual_rat = []
    for key, item in test_users_data.iterrows():
        m_id = item['movie_id']
        n_ratings = []
        for i in nearest_neighbours:

            # Get items or movie details reviewed by neighbour i with given m_id
            temp = items_merged.loc[items_merged['user_id'] == i[0]].loc[items_merged['movie_id'] == m_id]
            for k, it in temp.iterrows():
                n_ratings.append(it['rating'])
        predicated_rat.append(float(sum(n_ratings)) / len(n_ratings) if len(n_ratings) else 0)
        actual_rat.append(item['rating'])
    return actual_rat, predicated_rat

In [64]:
def predictRating(user,movie,similar_user):
    n = 0
    predRating = 0
    k = 25
    count=0
    #s = similarUsers(user)
    for i in similar_user:
        rating = df[(df['userId']==i[1]) & (df['movieId'] == movie)]['rating'].values
        rating_average =np.average(df[df['userId']==i[1]]['rating'].values)
        sim = i[0]
        if(len(rating)!=0):
            predRating = predRating + (sim*(rating - rating_average))
            count = count+sim
    if(count!=0):
        predRating = predRating/count
    rating = df[df['userId']==user]['rating'].values
    average = np.average(rating)
    predRating = average+predRating
    if(predRating>5):
        predRating = 5
    return predRating

In [65]:
def getTopMovies1(user,similar_user):
    top_movies = []
#     s = similarUsers(testUser)
    movies = df[(df['userId']==user) & (df['rating']>3)]['movieId'].values
    for j in movies:
        if j not in top_movies:
            top_movies.append(j)
#     for i in similar_user[1]:
#         movies = df[(df['userId']==i) & (df['rating']>4)]['movieId'].values
#         for j in movies:
#             if j not in top_movies:
#                 top_movies.append(j)
    return top_movies

In [66]:
def recommendation1(testUser,similar_user):
    mo = []
    p = []
    u = []
    m = getTopMovies1(testUser,similar_user)
    pr = 50
    if(len(m)<50):
        pr = len(m)
    for j in range(0,pr):
        predRating = predictRating(testUser,m[j],similar_user)
        mo.append(m[j])
        u.append(testUser)
        if np.isscalar(predRating):
            p.append(predRating)
        else:
            p.append(predRating[0])
    d = {'userId':u,'movieId':mo, 'predicted rating': p}
    df_u = pd.DataFrame(d)
    df_u = df_u.nlargest(50, 'predicted rating') 
    return df_u

In [67]:
def fitness_func(weights,testUser):
    w1 = weights[0]
    w2 = weights[1]
    w3 = weights[2]
    w4 = weights[3]
    w5 = weights[4]
    w6 = weights[5]
    w7 = weights[6]
    w8 = weights[7]
    w9 = weights[8]
    w10 = weights[9]
    w11 = weights[10]
    w12 = weights[11]
    w13 = weights[12]
    w14 = weights[13]
    w15 = weights[14]
    w16 = weights[15]
    w17 = weights[16]
    w18 = weights[17]
    w19 = weights[18]
    w20 = weights[19]
    
    w_sum = w1+w2+w3+w4+w5+w6+w7+w8+w9+w10+w11+w12+w13+w14+w15+w16+w17+w18+w19+w20
    
    w1 = w1/w_sum
    w2 = w2/w_sum
    w3 = w3/w_sum
    w4 = w4/w_sum
    w5 = w5/w_sum
    w6 = w6/w_sum
    w7 = w7/w_sum
    w8 = w8/w_sum
    w9 = w9/w_sum
    w10 = w10/w_sum
    w11 = w11/w_sum
    w12 = w12/w_sum
    w13 = w13/w_sum
    w14 = w14/w_sum
    w15 = w15/w_sum
    w16 = w16/w_sum
    w17 = w17/w_sum
    w18 = w18/w_sum
    w19 = w19/w_sum
    w20 = w20/w_sum
    
    w = [w1,w2,w3,w4,w5,w6,w7,w8,w9,w10,w11,w12,w13,w14,w15,w16,w17,w18,w19,w20]
    similar_user = get_neighbours(testUser,w)
    df_u = recommendation1(testUser,similar_user)
    df_err = pd.merge(df,df_u,on=['movieId','userId'])
    if(df_err.shape[0]<1):
        return 100000
    rmse = sqrt(mean_squared_error(df_err['rating'], df_err['predicted rating']))
    return rmse

In [68]:
lb = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
ub = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]

In [69]:
# implementation of PSO
# checking and defining weight inside the function; a random number between 0.5 and 1
def pso(fitness_func, lbound, ubound, u, swarm_size=10, max_iter=10, dimensions=20, c1=1.494, c2=1.494):
    x=[]            # positon of each particle
    x_pbest=[]      # best position of each particle 
    x_gbest=[]      # best global position of population
    v=[]            # velocity of each particle
    v_min=[]
    v_max=[]
    
    # assign initial random positions to the particles
    for i in range(swarm_size):
        x.append([])
        x_pbest.append([])
        for j in range(dimensions):
            x[i].append(random.uniform(lbound[j],ubound[j]))
            x_pbest[i].append(x[i][j])
            
    #calculate initial group best of the population
    for i in range(swarm_size):
        if i==0:
            x_gbest=x[i]
        elif fitness_func(x[i],u)<fitness_func(x_gbest,u):
            x_gbest=x[i]

    
    # calculate minimum and maximum boundaries of velocity vector
    for i in range(dimensions):
        v_min.append(-(ubound[i]-lbound[i])/100)
        v_max.append((ubound[i]-lbound[i])/100)
 
    # assign initial random velocities to the particles
    for i in range(swarm_size):
        v.append([])
        for j in range(dimensions):
            v[i].append(random.uniform(v_min[j],v_max[j]))
            
    for iter in range(max_iter):
        for i in range(swarm_size):
            for j in range(dimensions):
                r1=random.uniform(0,1)
                r2=random.uniform(0,1)
                weight = random.uniform(0.5, 1)
                # calculate new velocity for each particle
                v[i][j] = weight*(v[i][j]) + r1*c1*(x_pbest[i][j]-x[i][j]) + r2*c2*(x_gbest[j]-x[i][j])
                
                if v[i][j] > v_max[j]:
                    v[i][j] = v_max[j]
                
                if v[i][j] < v_min[j]:
                    v[i][j] = v_min[j]
                                 
                # calculate new position for each particle
                x[i][j] = x[i][j] + v[i][j]
                
                if x[i][j] > ubound[j]:
                    x[i][j] = ubound[j]
                    
                if x[i][j] < lbound[j]:
                    x[i][j] = lbound[j]
            
            if fitness_func(x[i],u) < fitness_func(x_pbest[i],u):
                x_pbest[i] = x[i]
                
            if fitness_func(x[i],u) < fitness_func(x_gbest,u):
                x_gbest = x[i]
                
    return x_gbest,fitness_func(x_gbest,u)

In [70]:
def getTopMovies(user,similar_user):
    top_movies = []
    for i in similar_user[1]:
        movies = df[(df['userId']==i) & (df['rating']>4)]['movieId'].values
        for j in movies:
            if j not in top_movies:
                top_movies.append(j)
    return top_movies

In [71]:
def recommendation(testUser,similar_user):
    mo = []
    p = []
    u = []
    m = getTopMovies(testUser,similar_user)
    pr = 50
    if(len(m)<50):
        pr = len(m)
    for j in range(0,pr):
        predRating = predictRating(testUser,m[j],similar_user)
        mo.append(m[j])
        u.append(testUser)
        if np.isscalar(predRating):
            p.append(predRating)
        else:
            p.append(predRating[0])
    d = {'userId':u,'movieId':mo, 'predicted rating': p}
    df_u = pd.DataFrame(d)
    df_u = df_u.nlargest(50, 'predicted rating') 
    return df_u

In [None]:
df_c = pd.DataFrame(columns=['userId','movieId','predicted rating'])
rec = []
r = {}
for testUser in df.userId.unique():
    xopt, fopt = pso(fitness_func, lb, ub,testUser)
    print('error: ',fopt)
    print('pso done')
    w = []
    for i in xopt:
        w.append(i/sum(xopt))
    similar_user = get_neighbours(testUser,w)
    df_u = recommendation(testUser,similar_user)
    df_c = pd.concat([df_u, df_c])
    print('recommendations for user', testUser, ':\n')
    l = 10
    if(len(df_u)<10):
        l = len(df_u['movieId'])
    for i in range(0,l):
        m = movies[movies['movieId']==df_u['movieId'][i]]['title'].values
        print(m[0])
        rec.append(m[0])
        #u.append(testUser)
    r = {'user':int(testUser), 'rec':rec}
    print('\n\n')
    result = recommendations.insert_one(r)
    print('One post: {0}'.format(result.inserted_id))
    df_err = pd.merge(df,df_c,on=['movieId','userId'])
    rmse = sqrt(mean_squared_error(df_err['rating'], df_err['predicted rating']))
    mse = mean_squared_error(df_err['rating'], df_err['predicted rating'])
    print('mse: ',mse)
    print('rmse: ',rmse)
    rec.clear()

error:  0.41792035280395845
pso done
recommendations for user 1 :

Green Mile, The (1999)
Gladiator (2000)
Shawshank Redemption, The (1994)
Dark Knight, The (2008)
Dark Knight Rises, The (2012)
Girl with the Dragon Tattoo, The (2011)
Wolf of Wall Street, The (2013)
Whiplash (2014)
Life Is Beautiful (La Vita Ã¨ bella) (1997)
Lion King, The (1994)



One post: 5e6f3d75b17fa08edcd76856
mse:  0.008146152459366032
rmse:  0.09025603835404052
error:  0.6013526704707901
pso done
recommendations for user 5 :

Braveheart (1995)
Pulp Fiction (1994)
Silence of the Lambs, The (1991)
Pinocchio (1940)
Alice in Wonderland (1951)
Star Wars: Episode V - The Empire Strikes Back (1980)
Clockwork Orange, A (1971)
Goodfellas (1990)
Reservoir Dogs (1992)
Young Frankenstein (1974)



One post: 5e6f4618b17fa08edcd76857
mse:  0.6984808664733076
rmse:  0.8357516775174955
error:  0.7611922697314758
pso done
recommendations for user 7 :

Braveheart (1995)
Shawshank Redemption, The (1994)
Twelve Monkeys (a.k.a. 12 

error:  0.4826694750316008
pso done
recommendations for user 66 :

Heat (1995)
Seven (a.k.a. Se7en) (1995)
Dumb & Dumber (Dumb and Dumber) (1994)
Pulp Fiction (1994)
Tommy Boy (1995)
Schindler's List (1993)
Dances with Wolves (1990)
Silence of the Lambs, The (1991)
Basic Instinct (1992)
Platoon (1986)



One post: 5e7060d7b17fa08edcd7686c
mse:  0.4063302098705661
rmse:  0.6374403578928511
error:  0.6198090437553333
pso done
recommendations for user 68 :

Rob Roy (1995)
Braveheart (1995)
Seven (a.k.a. Se7en) (1995)
Billy Madison (1995)
Tommy Boy (1995)
Fugitive, The (1993)
Clear and Present Danger (1994)
Lion King, The (1994)
GoldenEye (1995)
Cutthroat Island (1995)



One post: 5e7071b9b17fa08edcd7686d
mse:  0.40935843742089995
rmse:  0.6398112514022397
error:  0.7513290601425324
pso done
recommendations for user 71 :

Silence of the Lambs, The (1991)
Star Wars: Episode VI - Return of the Jedi (1983)
Saving Private Ryan (1998)
Matrix, The (1999)
X-Men (2000)
Crouching Tiger, Hidden Dra