# Import

In [88]:
import re 
import pandas as pd
import numpy as np
import sys
import numpy
import itertools
import more_itertools
from matplotlib.pyplot import figure
from matplotlib import pyplot as plt
from scipy.spatial.distance import cosine
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial import distance
from numpy.linalg import norm
from scipy.stats import pearsonr
from collections import Counter
from dateutil.parser import parse
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
pd.option_context('display.max_rows', None, 'display.max_columns', None)

numpy.set_printoptions(threshold=sys.maxsize)

In [2]:
links = pd.read_csv('links.csv')
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')
tags = pd.read_csv('tags.csv')
genres = ["Action",
"Adventure",
"Animation",
"Children",
"Comedy",
"Crime",
"Documentary",
"Drama",
"Fantasy",
"Film-Noir",
"Horror",
"Musical",
"Mystery",
"Romance",
"Sci-Fi",
"Thriller",
"War",
"Western"]

In [70]:
userIds=ratings['userId'].unique()[::4]

In [4]:
movieId = movies["movieId"].to_list()

In [10]:
itemvectors = np.zeros((9742, 18))
print(itemvectors.shape)

(9742, 18)


In [11]:
for index, row in movies.iterrows():
    for i in range(18):
        if genres[i] not in str(row[2]): 
            continue
        else:
            itemvectors[index][i]=1        

In [12]:
item_vectors_df = pd.DataFrame(data=itemvectors, index=movieId, columns=genres)

# Feature Mining

## Years

In [13]:
movies_years = movies["title"].tolist()

In [14]:
years=[]
movies_wo_years=[]
for title in movies_years:
    d = re.findall('\(\d{4}\)', title)
    movies_wo_years.append(re.sub('\(\d{4}\)', '', title))
    d = re.findall('(\d{4})', str(d))
    if d == []:
        d=['0']
    years.append(d)
years_list=list(itertools.chain.from_iterable(years))
years_list = [ int(x) for x in years_list ]
movies["year"]=years_list
movies['title']=movies_wo_years

In [16]:
Counter(sorted(years_list))

Counter({0: 13,
         1902: 1,
         1903: 1,
         1908: 1,
         1915: 1,
         1916: 4,
         1917: 1,
         1919: 1,
         1920: 2,
         1921: 1,
         1922: 1,
         1923: 4,
         1924: 5,
         1925: 4,
         1926: 5,
         1927: 7,
         1928: 4,
         1929: 4,
         1930: 5,
         1931: 14,
         1932: 9,
         1933: 12,
         1934: 11,
         1935: 13,
         1936: 18,
         1937: 16,
         1938: 15,
         1939: 23,
         1940: 25,
         1941: 18,
         1942: 23,
         1943: 10,
         1944: 16,
         1945: 17,
         1946: 23,
         1947: 20,
         1948: 20,
         1949: 25,
         1950: 21,
         1951: 22,
         1952: 16,
         1953: 30,
         1954: 23,
         1955: 36,
         1956: 30,
         1957: 33,
         1958: 31,
         1959: 37,
         1960: 37,
         1961: 34,
         1962: 40,
         1963: 39,
         1964: 43,
         1965: 

Разделим промежутки по годам:
1й. (0-1979]
2й. 1980-1989
3й. 1990-1999
4й. 2000-2010
5й. 2010-2018

In [17]:
yearvectors = np.zeros((9742, 5))
for index, row in movies.iterrows():
    if (row[3]>0) and (row[3]<1980): 
        yearvectors[index][0]=1
    elif(row[3]>1979) and (row[3]<1990):
        yearvectors[index][1]=1
    elif(row[3]>1989) and (row[3]<2000):
        yearvectors[index][2]=1
    elif(row[3]>1999) and (row[3]<2010):
        yearvectors[index][3]=1
    elif(row[3]>2009):
        yearvectors[index][4]=1

In [18]:
yearvectors_df = pd.DataFrame(data=yearvectors, index=movieId, columns=['<1979', '1980-1989', '1990-1999', '2000-2010', '2010-2018'])

In [19]:
item_vectors_df = item_vectors_df.merge(yearvectors_df, left_index=True, right_index=True)

## TF-IDF

In [97]:
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(movies['title'])

# Metrics and data split

Будем замерять Precision, Recall и F1, Map

In [23]:
def Precision(relevant, retrieved):
    precision = len(set(relevant) & set(retrieved))/len(retrieved)
    return precision

In [24]:
def Recall(relevant, retrieved):
    recall = len(set(relevant) & set(retrieved))/len(relevant)
    return recall

In [25]:
def F1(relevant, retrieved):
    f1=2*(Precision(relevant, retrieved)*Recall(relevant, retrieved))/(Precision(relevant, retrieved)+Recall(relevant, retrieved))
    return f1

In [73]:
def r_maker(approach, userId):
    if approach=='sim':
        dictt=cosine_similarity_test(userId)
    elif approach=='mink':
        dictt=minkowski_distance_test(userId)
    elif approach=='tfidf':
        dictt=TF_IDF_test(userId)
        
    tp=test_train_spliting(userId)[1]
    r=[]
    for i in dictt.keys():
        if i in tp:
            r.append(1)
        else:
            r.append(0)
    return r

In [27]:
def Precision_at_k(k, approach, userId):
    r = r_maker(approach, userId)  
    r = np.asarray(r)[:k]
    return np.mean(r)

In [28]:
def Avg_Precision(approach, userId):
    r = r_maker(approach, userId) 
    r = np.asarray(r)
    out = [Precision_at_k(k + 1, approach, userId) for k in range(r.size) if r[k]]
    return np.mean(out)

In [67]:
def MAP(approach):
    out=[]
    for i in userIds:
        r = r_maker(approach, i)
        r = np.asarray(r)
        Pr_at_k = [np.mean(r[:k+1]) for k in range(r.size)]
        Avg_Pr=np.mean(Pr_at_k)
        out.append(Avg_Pr)
    return np.mean(out)

In [30]:
def metrics(userId, approach, n):
    a=test_train_spliting(userId)[1]
    b=top_n(approach, n)
    return print('\n', 'Precision:',Precision(a,b), '\n', 'Recall:',Recall(a,b), '\n', 'F1:', F1(a,b))    

In [31]:
def test_train_spliting(userId):
    ratings_UserId=ratings.loc[(ratings['userId'] == userId)] 

    #выбрали 80й перцентиль лучших по рейтингу фильмов, чтобы они обязательно присутствовали в тестовой выборке
    top_ratings_UserId=ratings_UserId.loc[(ratings_UserId['rating'] >= np.percentile(ratings_UserId['rating'].to_numpy(), 80))]
    
    #выберем тестовые данные
    tp=top_ratings_UserId.iloc[::3]['movieId'].tolist()
    top_train=top_ratings_UserId.loc[~top_ratings_UserId['movieId'].isin(tp)]['movieId'].tolist()
    fn_fp=ratings_UserId.loc[~ratings_UserId['movieId'].isin(tp+top_train)].iloc[::3]['movieId'].tolist()
    test_ids=tp+fn_fp
    #test_item_vectors_df = item_vectors_df.loc[item_vectors_df.index.isin(test_ids)]

    #выберем на чём будем обучать
    train_ids=ratings_UserId.loc[~ratings_UserId['movieId'].isin(test_ids)]['movieId'].tolist()
    #train_item_vectors_df = item_vectors_df.loc[item_vectors_df.index.isin(train_ids)]
    return test_ids, tp, fn_fp, train_ids

In [32]:
def top_n(approach, n):
    approach=dict(sorted(approach.items(), key=lambda item: item[1], reverse=True))
    if n==0:
        n=len(test_train_spliting(userId)[1])
    top_n = more_itertools.take(n, approach.keys())
    return top_n

# Cosine similarity

Считаем вектор пользователя из трейн датасета 

In [34]:
def cosine_similarity_test(userId):
        test_ids, tp, fn_fp, train_ids=test_train_spliting(userId)
        #юзер-вектор
        train_user_vector=item_vectors_df.iloc[0]*0 #строка из нулей
        train_item_vectors_df=item_vectors_df.loc[train_ids]
        
        for index, row in ratings.loc[ratings['movieId'].isin(train_ids)].iterrows():
               train_user_vector=train_user_vector+train_item_vectors_df.loc[int(row[1])]*int(row[2])
        
        similarity={}
        test_item_vectors_df=item_vectors_df.loc[test_ids]
        for index,row in test_item_vectors_df.iterrows():
            similarity[index]=(cosine_similarity(train_user_vector.to_numpy().reshape(1,-1),row.to_numpy().reshape(1,-1))[0][0])
        return(similarity)

In [35]:
metrics(10, cosine_similarity_test(10), 10)


 Precision: 0.3 
 Recall: 0.17647058823529413 
 F1: 0.22222222222222224


In [71]:
MAP('sim')

0.7134710113187342

# Minkowski distance

In [74]:
def minkowski_distance_test(userId):
        test_ids, tp, fn_fp, train_ids=test_train_spliting(userId)
        #юзер-вектор
        train_user_vector=item_vectors_df.iloc[0]*0 #строка из нулей
        train_item_vectors_df=item_vectors_df.loc[train_ids]
        
        for index, row in ratings.loc[ratings['movieId'].isin(train_ids)].iterrows():
               train_user_vector=train_user_vector+train_item_vectors_df.loc[int(row[1])]*int(row[2])
        
        dist={}
        test_item_vectors_df=item_vectors_df.loc[test_ids]
        for index,row in test_item_vectors_df.iterrows():
            dist[index]=(distance.minkowski(train_user_vector.to_numpy(),row.to_numpy()))
        return(dist)

In [98]:
print(metrics(10, minkowski_distance_test(10), 10))


 Precision: 0.5 
 Recall: 0.29411764705882354 
 F1: 0.37037037037037035
None


Растояние Миньковского помимо самых весомых жанров захватывает ещё второстепенные, поэтому оно получше

In [76]:
MAP('mink')

0.7134710113187342

Но по-видимому в среднем это не играет роли

# TF-IDF

In [77]:
def TF_IDF_test(userId):
    test_ids, tp, fn_fp, train_ids=test_train_spliting(userId)

    train_tfidf_matrix=tfidf_matrix[movies.loc[movies["movieId"].isin(train_ids)].index]
    rating_vector=(ratings.loc[(ratings["movieId"].isin(train_ids))&(ratings['userId']==userId)].sort_values('movieId')['rating']).to_numpy()
    user_vector=0
    for i in range(train_tfidf_matrix.shape[0]):
        user_vector+=rating_vector[i]*train_tfidf_matrix[i]
    sim={}
    for i in movies.loc[movies["movieId"].isin(test_ids)].index:
        vector=tfidf_matrix[i].toarray()
        i=movies.iloc[i]['movieId']
        sim[i]=(cosine_similarity(user_vector,vector)[0][0])
    return sim

In [78]:
print(metrics(10, TF_IDF_test(10), 10))


 Precision: 0.5 
 Recall: 0.29411764705882354 
 F1: 0.37037037037037035
None


In [79]:
MAP('tfidf')

0.40030665570747465

# Results

In [89]:
def cosine_similarity_check(userId):
        train_ids = ratings.loc[ratings['userId']==userId]['movieId']
        test_ids = movies.loc[~movies['movieId'].isin(train_ids)]['movieId']
        #юзер-вектор
        train_user_vector=item_vectors_df.iloc[0]*0 #строка из нулей
        train_item_vectors_df=item_vectors_df.loc[train_ids]
        
        for index, row in ratings.loc[ratings['movieId'].isin(train_ids)].iterrows():
               train_user_vector=train_user_vector+train_item_vectors_df.loc[int(row[1])]*int(row[2])
        
        similarity={}
        test_item_vectors_df=item_vectors_df.loc[test_ids]
        for index,row in test_item_vectors_df.iterrows():
            similarity[index]=(cosine_similarity(train_user_vector.to_numpy().reshape(1,-1),row.to_numpy().reshape(1,-1))[0][0])
        return(similarity)

In [91]:
movies.loc[movies['movieId'].isin(top_n(cosine_similarity_check(10), 10))]

Unnamed: 0,movieId,title,genres,year
2605,3481,High Fidelity,Comedy|Drama|Romance,2000
2642,3536,Keeping the Faith,Comedy|Drama|Romance,2000
3460,4719,Osmosis Jones,Action|Animation|Comedy|Crime|Drama|Romance|Th...,2001
3526,4818,Extreme Days,Action|Adventure|Comedy|Drama,2001
6094,42015,Casanova,Action|Adventure|Comedy|Drama|Romance,2005
6219,45672,Click,Adventure|Comedy|Drama|Fantasy|Romance,2006
6570,55116,"Hunting Party, The",Action|Adventure|Comedy|Drama|Thriller,2007
6819,61071,"Sisterhood of the Traveling Pants 2, The",Adventure|Comedy|Drama|Romance,2008
7174,72142,Love Exposure (Ai No Mukidashi),Action|Comedy|Drama|Romance,2008
8597,117646,Dragonheart 2: A New Beginning,Action|Adventure|Comedy|Drama|Fantasy|Thriller,2000


In [92]:
def minkowski_distance_check(userId):
        train_ids = ratings.loc[ratings['userId']==userId]['movieId']
        test_ids = movies.loc[~movies['movieId'].isin(train_ids)]['movieId']
        #юзер-вектор
        train_user_vector=item_vectors_df.iloc[0]*0 #строка из нулей
        train_item_vectors_df=item_vectors_df.loc[train_ids]
        
        for index, row in ratings.loc[ratings['movieId'].isin(train_ids)].iterrows():
               train_user_vector=train_user_vector+train_item_vectors_df.loc[int(row[1])]*int(row[2])
        
        dist={}
        test_item_vectors_df=item_vectors_df.loc[test_ids]
        for index,row in test_item_vectors_df.iterrows():
            dist[index]=(distance.minkowski(train_user_vector.to_numpy(),row.to_numpy()))
        return(dist)

In [93]:
movies.loc[movies['movieId'].isin(top_n(minkowski_distance_check(10), 10))]

Unnamed: 0,movieId,title,genres,year
892,1189,"Thin Blue Line, The",Documentary,1988
9091,143410,Hyena Road,(no genres listed),0
9138,147250,The Adventures of Sherlock Holmes and Doctor W...,(no genres listed),0
9259,156605,Paterson,(no genres listed),0
9448,167570,The OA,(no genres listed),0
9514,171495,Cosmos,(no genres listed),0
9515,171631,Maria Bamford: Old Baby,(no genres listed),0
9518,171749,Death Note: Desu nôto (2006–2007),(no genres listed),0
9525,171891,Generation Iron 2,(no genres listed),0
9611,176601,Black Mirror,(no genres listed),0


In [95]:
def TF_IDF_check(userId):
    train_ids = ratings.loc[ratings['userId']==userId]['movieId']
    test_ids = movies.loc[~movies['movieId'].isin(train_ids)]['movieId']
    train_tfidf_matrix=tfidf_matrix[movies.loc[movies["movieId"].isin(train_ids)].index]
    rating_vector=(ratings.loc[(ratings["movieId"].isin(train_ids))&(ratings['userId']==userId)].sort_values('movieId')['rating']).to_numpy()
    user_vector=0
    for i in range(train_tfidf_matrix.shape[0]):
        user_vector+=rating_vector[i]*train_tfidf_matrix[i]
    sim={}
    for i in movies.loc[movies["movieId"].isin(test_ids)].index:
        vector=tfidf_matrix[i].toarray()
        i=movies.iloc[i]['movieId']
        sim[i]=(cosine_similarity(user_vector,vector)[0][0])
    return sim

In [96]:
movies.loc[movies['movieId'].isin(top_n(TF_IDF_check(10), 10))]

Unnamed: 0,movieId,title,genres,year
1130,1477,Love Jones,Romance,1997
1324,1791,Twilight,Crime|Drama|Thriller,1998
1577,2116,"Lord of the Rings, The",Adventure|Animation|Children|Fantasy,1978
3819,5349,Spider-Man,Action|Adventure|Sci-Fi|Thriller,2002
5514,26492,Twilight Zone: The Movie,Fantasy|Horror|Sci-Fi|Thriller,1983
6470,52722,Spider-Man 3,Action|Adventure|Sci-Fi|Thriller|IMAX,2007
6987,67087,"I Love You, Man",Comedy,2009
8032,98124,"Batman: The Dark Knight Returns, Part 1",Action|Animation|Sci-Fi,2012
8080,99813,"Batman: The Dark Knight Returns, Part 2",Action|Animation,2013
9535,172547,Despicable Me 3,Adventure|Animation|Children|Comedy,2017
