In [18]:
# coding: utf-8

# Recommendation systems
# A content-based recommendation algorithm.

from collections import Counter, defaultdict
import math
import numpy as np
import os
import pandas as pd
import re
from scipy.sparse import csr_matrix
import urllib.request
import zipfile

import nltk
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
from sklearn.metrics.pairwise import cosine_similarity

def tokenize_string(my_string):
    """
    Split a string into lowercase tokens.
    A token is a maximal run of word characters (letters, digits,
    underscore) and hyphens; every other character acts as a separator.
    Params:
      my_string...The text to tokenize.
    Returns:
      A list of lowercase token strings, in order of appearance
      (an empty list if the string contains no token characters).
    """
    # Raw string literal: '\w' and '\-' are regex escapes, not Python string
    # escapes, and a non-raw literal triggers an invalid-escape warning.
    return re.findall(r'[\w\-]+', my_string.lower())


def tokenize(movies):
    """
    Add a 'tokens' column to the movies DataFrame.
    Each entry is the list of tokens that tokenize_string extracts from
    that movie's 'netflix_genres' string.
    Params:
      movies...The movies DataFrame; must contain a 'netflix_genres' column of strings.
    Returns:
      The same movies DataFrame (modified in place), now including 'tokens'.
    """
    genre_strings = movies['netflix_genres']
    # Build a plain list so the assignment is positional, one token list per row.
    movies['tokens'] = list(map(tokenize_string, genre_strings))
    return movies


def featurize(movies):
    """
    Append a new column to the movies DataFrame with header 'features'.
    Each row will contain a csr_matrix of shape (1, num_features). Each
    entry in this matrix will contain the tf-idf value of the term.
    tfidf(i, d) := tf(i, d) / max_k tf(k, d) * log10(N/df(i))
    where:
    i is a term
    d is a document (movie)
    tf(i, d) is the frequency of term i in document d
    max_k tf(k, d) is the maximum frequency of any term in document d
    N is the number of documents (movies)
    df(i) is the number of unique documents containing term i
    A movie with an empty token list gets an all-zero row.
    Params:
      movies...The movies DataFrame; must contain a 'tokens' column of token lists.
    Returns:
      A tuple containing:
      - The movies DataFrame, which has been modified to include a column named 'features'.
      - The vocab, a dict from term to int (terms numbered in sorted order).
    """
    token_lists = list(movies['tokens'])
    num_movies = len(token_lists)

    # Document frequency: each term counts once per movie, however often
    # it is repeated inside that movie's token list.
    df = Counter()
    for tokens in token_lists:
        df.update(set(tokens))

    # Column index of every term, assigned in sorted term order.
    vocab = {term: idx for idx, term in enumerate(sorted(df))}
    num_features = len(vocab)

    all_csr = []
    for tokens in token_lists:
        tf = Counter(tokens)
        if not tf:
            # No tokens: there is no max term frequency to normalise by,
            # so the movie is represented by an empty (all-zero) row.
            all_csr.append(csr_matrix((1, num_features), dtype=np.float64))
            continue

        max_k = max(tf.values())
        columns = np.asarray([vocab[term] for term in tf])
        data = np.asarray([(freq / max_k) * math.log10(num_movies / df[term])
                           for term, freq in tf.items()], dtype=np.float64)
        # Every value lives in row 0 of a single-row matrix; integer indices.
        rows = np.zeros(len(data), dtype=np.int64)
        # Built once per movie, after all of its terms have been collected.
        all_csr.append(csr_matrix((data, (rows, columns)), shape=(1, num_features)))

    movies['features'] = all_csr

    return movies, vocab


def train_test_split(ratings):
    """
    Split the ratings DataFrame into a training and a testing set.
    The split is deterministic: every 1000th row by position
    (0, 1000, 2000, ...) goes to the test set, all other rows to training.
    Params:
      ratings...The ratings DataFrame.
    Returns:
      A tuple (train, test) of DataFrames, each in original row order.
    """
    num_rows = len(ratings)
    test_positions = list(range(0, num_rows, 1000))
    held_out = set(test_positions)
    train_positions = [pos for pos in range(num_rows) if pos not in held_out]
    return ratings.iloc[train_positions], ratings.iloc[test_positions]



def make_predictions(movies, ratings_train, ratings_test):
    """
    Using the ratings in ratings_train, predict the ratings for each
    row in ratings_test.
    To predict the rating of user u for movie i: Compute the weighted average
    rating for every other movie that u has rated.  Restrict this weighted
    average to movies that have a positive cosine similarity with movie
    i. The weight for movie m corresponds to the cosine similarity between m
    and i.
    If there are no other movies with positive cosine similarity to use in the
    prediction (including the case where movie i or the rated movies have no
    entry in `movies`), use the mean rating of the target user in
    ratings_train as the prediction. A user without any training rating gets
    NaN.
    Params:
      movies..........The movies DataFrame (needs 'movie_id' and 'features' columns).
      ratings_train...The subset of ratings used for making predictions. These are the "historical" data.
      ratings_test....The subset of ratings that need to predicted. These are the "future" data.
    Returns:
      A numpy array containing one predicted rating for each element of ratings_test.
    """
    # movie_id -> feature row, built once instead of filtering the whole
    # movies frame for every (test row, training rating) pair. The first
    # occurrence of an id wins.
    features_by_id = {}
    for movie_id, feature_row in zip(movies['movie_id'], movies['features']):
        features_by_id.setdefault(movie_id, feature_row)

    # Only the histories of users that occur in the test set are needed;
    # select and group them once rather than masking ratings_train per row.
    test_users = ratings_test['user_id'].unique()
    relevant_train = ratings_train.loc[ratings_train['user_id'].isin(test_users),
                                       ['user_id', 'movie_id', 'rating']]
    history_by_user = {user_id: history
                       for user_id, history in relevant_train.groupby('user_id')}

    predictions = []
    for test_row in ratings_test.itertuples():
        history = history_by_user.get(getattr(test_row, 'user_id'))
        if history is None:
            # No training ratings for this user: the mean of nothing is NaN.
            predictions.append(np.nan)
            continue

        # Default prediction: the user's mean training rating.
        prediction = history['rating'].mean()

        target_features = features_by_id.get(int(getattr(test_row, 'movie_id')))

        # Collect the user's rated movies for which features exist.
        rated_features, rated_values = [], []
        for movie_id, rating in zip(history['movie_id'], history['rating']):
            feature_row = features_by_id.get(int(movie_id))
            if feature_row is not None:
                rated_features.append(feature_row.toarray())
                rated_values.append(rating)

        if target_features is not None and rated_features:
            # One call yields the similarity of every rated movie to the target.
            similarities = cosine_similarity(np.vstack(rated_features),
                                             target_features.toarray()).ravel()
            positive = similarities > 0
            if positive.any():
                weights = similarities[positive]
                ratings = np.asarray(rated_values, dtype=float)[positive]
                prediction = np.dot(weights, ratings) / weights.sum()

        predictions.append(prediction)

    return np.asarray(predictions, dtype=float)



def mean_absolute_error(predictions, ratings_test):
    """
    Compute the mean absolute error (MAE) of the predictions.
    Params:
      predictions....Predicted ratings, one per row of ratings_test.
      ratings_test...DataFrame whose 'rating' column holds the true ratings.
    Returns:
      The mean of |prediction - true rating| over all rows.
    """
    actual = np.array(ratings_test.rating)
    absolute_errors = np.abs(predictions - actual)
    return absolute_errors.mean()
    

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# To use this notebook in Colab, use this code to mount Google Drive.
# It is necessary to complete the following steps in advance:
# 1. Create a folder named DMC in your Drive.
# 2. Go to the shared folder "02 Data", right-click -> "Add shortcut to Drive" (German UI: "Drive-Verknüpfung hinzufügen"), then navigate to the DMC folder you just created.
from google.colab import drive
drive.mount('/content/drive',force_remount=True)

Mounted at /content/drive


In [3]:
# Load the truncated train/test rating tables (';'-separated CSV files).
rawTrain = pd.read_csv("/content/drive/My Drive/Web Mining Project/NEW_DATA/Truncated/train_data_ratings.csv", delimiter=';')
rawTest = pd.read_csv("/content/drive/My Drive/Web Mining Project/NEW_DATA/Truncated/test_data_ratings.csv", delimiter=';')
# Drop the 'Unnamed: 0' column (presumably a saved row index). The column is
# named via `columns=` because passing the axis positionally was deprecated
# in pandas 1.3 and raises a TypeError from pandas 2.0 on.
rawTrain.drop(columns=['Unnamed: 0'], inplace=True)
rawTest.drop(columns=['Unnamed: 0'], inplace=True)

In [5]:
# Read the extended movie metadata (';'-separated CSV) and drop its 'Unnamed: 0' column.

movies = pd.read_csv("/content/drive/My Drive/Web Mining Project/NEW_DATA/final_movies_metadata_linked.csv", delimiter=';')
movies.drop(['Unnamed: 0'], axis=1, inplace=True)
# Show the first row as a quick look at the available columns.
movies.head(1)

Unnamed: 0,netflix_id,title,release_year,metadata_id,imdb_id,original_language,overview,tagline,metadata_genres,netflix_genres
0,402,Pan Tadeusz,1999,4966,tt0170351,pl,A grand and patriotic tale of Poland's struggl...,,"[{'id': 10752, 'name': 'War'}, {'id': 18, 'nam...",Drama|History|Romance|War


In [6]:
# Inspect the genre strings before any cleaning is applied.
movies['netflix_genres']

0             Drama|History|Romance|War
1                    Action|Crime|Drama
2                          Comedy|Crime
3                       Biography|Drama
4       Animation|Comedy|Family|Musical
                     ...               
8063               Comedy|Drama|Romance
8064      Drama|Fantasy|History|Mystery
8065               Comedy|Drama|Romance
8066        Comedy|Crime|Drama|Thriller
8067       Action|Comedy|Crime|Thriller
Name: netflix_genres, Length: 8068, dtype: object

In [7]:
# Shared WordNet lemmatizer instance used by lemmatize().
lemmatizer = WordNetLemmatizer()

def lemmatize(words):
    """
    Lemmatize every word of a space-separated string.
    The input is split on single spaces, each piece is passed through the
    module-level WordNet lemmatizer, and the results are re-joined with
    single spaces.
    """
    lemmas = [lemmatizer.lemmatize(word) for word in words.split(" ")]
    return " ".join(lemmas)

def make_lower_case(text):
    """Return `text` converted to lower case."""
    lowered = text.lower()
    return lowered

# Cache: tuple of language names -> frozenset of their combined stop words.
# Filled lazily so the corpus files are read once instead of on every call
# (the function is applied to every row of the movies frame).
_STOP_WORD_SETS = {}


def remove_stop_words(text, languages=("english", "german", "french")):
    """
    Remove stop words from a whitespace-separated string.
    Params:
      text........The string to filter; it is split on whitespace.
      languages...Names of the NLTK stop word lists to apply. Defaults to
                  English, German and French.
    Returns:
      The words that appear in none of the stop word lists, joined by
      single spaces (original order preserved).
    """
    key = tuple(languages)
    stop_set = _STOP_WORD_SETS.get(key)
    if stop_set is None:
        # Filtering by each list in turn is the same as filtering by their union.
        stop_set = frozenset(word
                             for language in key
                             for word in stopwords.words(language))
        _STOP_WORD_SETS[key] = stop_set

    return " ".join(word for word in text.split() if word not in stop_set)

def remove_punctuation(sentence):
    """
    Strip punctuation from a sentence.
    Only runs of word characters (regex `\\w+`) are kept; they are returned
    joined by single spaces.
    """
    word_tokenizer = nltk.RegexpTokenizer(r"\w+")
    return " ".join(word_tokenizer.tokenize(sentence))

In [8]:
# Cast every genre entry to str so the string-based cleaning helpers can be applied to all rows.
movies['netflix_genres'] = movies['netflix_genres'].astype(str)

In [20]:
# Clean the genre strings in one pass: turn the '|' separators into spaces,
# strip punctuation, lower-case, collapse repeated whitespace, lemmatize,
# and finally remove stop words.
movies['netflix_genres'] = (
    movies['netflix_genres']
    .map(lambda x: x.replace("|", " "))
    .apply(remove_punctuation)
    .apply(make_lower_case)
    .apply(lambda x: ' '.join(x.split()))
    .apply(lemmatize)
    .apply(remove_stop_words)
)

In [9]:
# Expose the Netflix id as 'movie_id', the column make_predictions uses to look up a movie's features.
movies['movie_id'] = movies['netflix_id']

In [10]:
# Tokenize the cleaned genre strings, then build one tf-idf feature row per movie.
movies = tokenize(movies)
movies, vocab = featurize(movies)
# Print the first ten (term, column index) pairs of the vocabulary.
print('vocab:')
print(sorted(vocab.items())[:10])

vocab:
[('action', 0), ('adventure', 1), ('animation', 2), ('biography', 3), ('comedy', 4), ('crime', 5), ('documentary', 6), ('drama', 7), ('family', 8), ('fantasy', 9)]


In [11]:

# Report how many ratings are in the training and testing tables.
print('%d training ratings; %d testing ratings' % (len(rawTrain), len(rawTest)))


3199119 training ratings; 1066374 testing ratings


In [30]:
# Order the test ratings by user_id; the prediction cell below takes the first 1000 rows of this ordering.
rawTest = rawTest.sort_values('user_id')

In [12]:
# Preview the first ten test ratings.
rawTest.head(10)

Unnamed: 0,user_id,movie_id,rating
0,2059086,758,3
1,697601,1975,2
2,396326,1406,3
3,2254919,571,4
4,554424,257,5
5,1093947,1144,1
6,649585,919,1
7,1906979,1518,2
8,353309,334,3
9,1954166,312,3


In [19]:
# Predict only the first 1000 test ratings, using the full training table as history.
predictions = make_predictions(movies, rawTrain, rawTest.head(1000))

In [35]:
# Display the array of predicted ratings.
predictions

array([3.3953991 , 3.55252652, 3.29428866, 3.40172428, 3.40172428,
       3.32541796, 3.46749413, 3.51621897, 3.56020053, 3.28497504])

In [None]:
# Predictions were made for the leading rows of rawTest only, so compare them
# against exactly those rows; passing the whole table makes the subtraction
# in mean_absolute_error fail on mismatched array lengths.
print('error=%f' % mean_absolute_error(predictions, rawTest.head(len(predictions))))
print(predictions[:10])