In [210]:
import numpy as np
from numpy import nan
import pandas as pd

In [40]:
def loadData(path='small-dataset.csv'):
    """Load a comma-separated ratings file into a 2-D float array.

    The first column of every row is discarded and the remaining columns are
    copied into the result, one row of the file per row of the array.

    Parameters
    ----------
    path : str, optional
        Location of the CSV file. Defaults to 'small-dataset.csv' in the
        current working directory, which is what the original hard-coded.

    Returns
    -------
    numpy.ndarray
        Float array with one row per data row of the file and one column per
        CSV column after the first. The array is also printed.
    """
    data = np.genfromtxt(path, delimiter=',', dtype=None)
    # With mixed column types genfromtxt returns a 0-d structured array when
    # the file has a single data row; promote it so it can be iterated.
    data = np.atleast_1d(data)
    # list(row) unpacks either a structured record or a plain numeric row;
    # slicing from 1 drops the leading column.
    dt = np.array([list(row)[1:] for row in data], dtype=float)
    print(dt)
    return dt

In [227]:
# Load the small ratings matrix; loadData() prints the array it returns.
data = loadData()

[[ 7.  6.  7.  4.  5.  4.]
 [ 6.  7.  0.  4.  3.  4.]
 [ 0.  3.  3.  1.  1.  0.]
 [ 1.  2.  2.  3.  3.  4.]
 [ 1.  0.  1.  2.  3.  3.]]


In [221]:
def user_sim_cosine_sim(person1, person2):
    """Cosine similarity of two rating vectors over their co-rated items.

    A value of 0 in a vector means "not rated"; only positions where both
    vectors are non-zero take part in the computation.

    Parameters
    ----------
    person1, person2 : array_like
        1-D rating vectors of equal length.

    Returns
    -------
    float
        The cosine of the angle between the two co-rated sub-vectors, or 0.0
        when the users have no co-rated item or the result is undefined.
    """
    p1 = np.asarray(person1, dtype=float)
    p2 = np.asarray(person2, dtype=float)
    both_rated = (p1 != 0) & (p2 != 0)
    # Without overlap both norms are 0; return early instead of dividing 0/0
    # (which would emit a RuntimeWarning and produce NaN).
    if not both_rated.any():
        return 0.0
    a = p1[both_rated]
    b = p2[both_rated]
    similarity = np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
    # NaN can still appear if the inputs themselves contain NaN.
    if np.isnan(similarity):
        return 0.0
    return similarity

In [76]:
# Cosine similarity between user 2 and user 4 (row indices into `data`)
user_sim_cosine_sim(data[2], data[4])

0.64465837122030423

In [220]:
def user_sim_pearson_corr(person1, person2):
    """Pearson-style correlation of two rating vectors over co-rated items.

    A value of 0 means "not rated". Each vector is centred on the mean of
    *all* of that user's non-zero ratings (not only the co-rated ones), and
    the correlation is then taken over the positions both users rated.

    Parameters
    ----------
    person1, person2 : array_like
        1-D rating vectors of equal length.

    Returns
    -------
    float
        Correlation of the centred co-rated ratings, or 0.0 when it is
        undefined: no co-rated item, or one user's centred co-rated ratings
        are all zero.
    """
    p1 = np.asarray(person1, dtype=float)
    p2 = np.asarray(person2, dtype=float)
    rated1 = p1 != 0
    rated2 = p2 != 0
    both_rated = rated1 & rated2
    # No overlap (this also covers a user without any rating): undefined.
    if not both_rated.any():
        return 0.0
    centred1 = p1[both_rated] - p1[rated1].mean()
    centred2 = p2[both_rated] - p2[rated2].mean()
    denominator = np.linalg.norm(centred1) * np.linalg.norm(centred2)
    # Zero spread on either side would be 0/0; report "no correlation"
    # explicitly rather than relying on NaN plus a RuntimeWarning.
    if denominator == 0 or np.isnan(denominator):
        return 0.0
    corr = np.dot(centred1, centred2) / denominator
    if np.isnan(corr):
        return 0.0
    return corr

In [228]:
# Pearson correlation between user 0 and user 2 (row indices into `data`)
user_sim_pearson_corr(data[0], data[2])

0.89442719099991586

In [223]:
def most_similar_users(person, db, number_of_users):
    """Rank the other users by their similarity to `person`.

    The score for each user is the average of the cosine similarity and the
    Pearson correlation between that user's row and `person`'s row.

    Parameters
    ----------
    person : int
        Row index of the reference user in `db` (negative indices count from
        the end, as with normal array indexing).
    db : numpy.ndarray
        2-D rating matrix, one row per user; 0 means "not rated".
    number_of_users : int
        Maximum number of neighbours to return.

    Returns
    -------
    list of int
        Row indices of the most similar users, best first, never including
        `person` itself.
    """
    n_users = db.shape[0]
    # Normalise a negative index so the self-exclusion below matches it.
    self_idx = person + n_users if person < 0 else person
    target = db[self_idx, :]
    similarities = np.array([
        (user_sim_cosine_sim(target, db[i, :])
         + user_sim_pearson_corr(target, db[i, :])) / 2
        for i in range(n_users)
    ])
    # argsort is ascending; reverse it so the best match comes first.
    ranked = np.argsort(similarities)[::-1]
    return [int(i) for i in ranked if i != self_idx][:number_of_users]

In [229]:
# The 3 users most similar to user 1
most_similar_users(person=1, db=data, number_of_users=3)

[2, 0, 3]

In [230]:
# Recommend an unrated item only when its estimated rating is at least the given threshold.

def user_recommendations(person, db, thres):
    """Recommend items `person` has not rated, using the two nearest users.

    For every item `person` has not rated (value 0) but both of the two most
    similar users have, the rating is estimated as the average of the two
    neighbours' ratings weighted by their Pearson correlation with `person`.
    Items whose estimate is at least `thres` are recommended.

    Parameters
    ----------
    person : int
        Row index of the user in `db`.
    db : numpy.ndarray
        2-D rating matrix, one row per user; 0 means "not rated".
    thres : float
        Minimum estimated rating for an item to be recommended.

    Returns
    -------
    list of int
        Column indices of the recommended items in ascending order. Empty
        when fewer than two neighbours exist or the two weights sum to zero
        (the weighted average is then undefined).
    """
    msu = most_similar_users(person, db, 2)
    if len(msu) < 2:
        return []
    first, second = msu

    # The weights depend only on the users, so compute them once rather than
    # for every candidate item.
    weight_first = user_sim_pearson_corr(db[first, :], db[person, :])
    weight_second = user_sim_pearson_corr(db[second, :], db[person, :])
    total_weight = weight_first + weight_second
    # A zero denominator would turn estimates into +/-inf, and +inf would
    # wrongly pass the threshold check.
    if total_weight == 0:
        return []

    ratings_first = db[first, :]
    ratings_second = db[second, :]
    candidates = (db[person, :] == 0) & (ratings_first != 0) & (ratings_second != 0)
    estimates = (ratings_first * weight_first
                 + ratings_second * weight_second) / total_weight
    return [int(i) for i in np.flatnonzero(candidates & (estimates >= thres))]

In [231]:
# Recommendations for user 1, requiring an estimated rating of at least 4
user_recommendations(person=1, db=data, thres=4)

[2]

In [139]:
# Same pipeline on the full-size ratings dataset (ml-latest-small)

In [232]:
# Read the ratings CSV and keep every column except 'timestamp'.
# NOTE: the path is absolute and machine-specific; it has to exist locally.
df = (
    pd.read_csv("C:\\Users\\Rita\\Documents\\ml-latest-small\\ratings.csv")
    .drop('timestamp', axis=1)
)

In [175]:
# Inspect the ratings table. Only the last expression of a cell is rendered,
# so the df.head() result is computed but not displayed; the output is row 82388.
df.head()
df.iloc[82388]

userId       561.0
movieId    35836.0
rating         4.5
Name: 82388, dtype: float64

In [180]:
# Convert the ratings table into a dense user x movie rating matrix

def preprocessing(df):
    """Turn a long-format ratings table into a dense user x movie matrix.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'userId', 'movieId' and 'rating'. Ids are
        treated as 1-based: id k is stored at index k - 1.

    Returns
    -------
    numpy.ndarray
        Float array of shape (max userId, max movieId) where entry
        [u - 1, m - 1] holds the rating user u gave movie m and 0 marks
        "not rated". The two dimensions are printed. The array is dense, so
        its size grows with the largest ids rather than the number of ratings.
    """
    # Integer index arrays: iterating df.values would upcast the ids to float
    # (the rating column is float), and float indices are rejected by NumPy.
    user_idx = np.asarray(df['userId'], dtype=np.intp) - 1
    movie_idx = np.asarray(df['movieId'], dtype=np.intp) - 1
    ratings = np.asarray(df['rating'], dtype=float)

    # Size by the largest id, not the number of distinct users, so that
    # non-contiguous user ids still fit.
    sh0 = int(user_idx.max()) + 1
    print(sh0)
    sh1 = int(movie_idx.max()) + 1
    print(sh1)

    db = np.zeros([sh0, sh1])
    # One vectorised assignment instead of a Python loop; for a repeated
    # (user, movie) pair the last rating in the table wins, as before.
    db[user_idx, movie_idx] = ratings
    return db

In [184]:
# Build the user x movie rating matrix; preprocessing() prints its two dimensions.
db = preprocessing(df)

671
163949




In [226]:
# Recommendations for user 0 on the full matrix, requiring an estimated rating of at least 2
user_recommendations(person=0, db=db, thres=2)

[933]