In [1]:
import pandas as pd
import numpy as np

In [2]:
!unzip ml-latest-small.zip

Archive:  ml-latest-small.zip
   creating: ml-latest-small/
  inflating: ml-latest-small/links.csv  
  inflating: ml-latest-small/tags.csv  
  inflating: ml-latest-small/ratings.csv  
  inflating: ml-latest-small/README.txt  
  inflating: ml-latest-small/movies.csv  


In [3]:
movies = pd.read_csv('ml-latest-small/movies.csv')
ratings = pd.read_csv('ml-latest-small/ratings.csv')
tags = pd.read_csv('ml-latest-small/tags.csv')

In [4]:
df = pd.merge(ratings, movies, on='movieId' , how='left')
df = df.drop('title', axis=1)
df.head(5)

Unnamed: 0,userId,movieId,rating,timestamp,genres
0,1,1,4.0,964982703,Adventure|Animation|Children|Comedy|Fantasy
1,1,3,4.0,964981247,Comedy|Romance
2,1,6,4.0,964982224,Action|Crime|Thriller
3,1,47,5.0,964983815,Mystery|Thriller
4,1,50,5.0,964982931,Crime|Mystery|Thriller


In [5]:
df['genres'] = df['genres'].str.split('|')

In [6]:
tags['tag'] = tags['tag'].str.split('|')
tags.drop('timestamp', axis=1, inplace=True)

In [7]:
tags = tags.groupby(['userId','movieId'])['tag'].apply(lambda x: ','.join(x.astype(str))).reset_index()
tags.head(5)

Unnamed: 0,userId,movieId,tag
0,2,60756,"['funny'],['Highly quotable'],['will ferrell']"
1,2,89774,"['Boxing story'],['MMA'],['Tom Hardy']"
2,2,106782,"['drugs'],['Leonardo DiCaprio'],['Martin Scors..."
3,7,48516,['way too long']
4,18,431,"['Al Pacino'],['gangster'],['mafia']"


In [8]:
df = pd.merge(df, tags, on=['userId','movieId'], how='left')

In [9]:
df['tag'] = df['tag'].apply(lambda d: d if isinstance(d, list) else [])
df['genres'] = df['genres'].apply(lambda d: d if isinstance(d, list) else [])

In [10]:
from sklearn.model_selection import train_test_split
train_data, test_data = train_test_split(df, test_size=0.2, stratify=df.userId)

In [11]:
train_data = train_data.sort_values(['userId', 'movieId'])
print(train_data)

        userId  movieId  rating   timestamp  \
0            1        1     4.0   964982703   
2            1        6     4.0   964982224   
3            1       47     5.0   964983815   
4            1       50     5.0   964982931   
5            1       70     3.0   964982400   
...        ...      ...     ...         ...   
100830     610   166528     4.0  1493879365   
100831     610   166534     4.0  1493848402   
100832     610   168248     5.0  1493850091   
100833     610   168250     5.0  1494273047   
100834     610   168252     5.0  1493846352   

                                                   genres tag  
0       [Adventure, Animation, Children, Comedy, Fantasy]  []  
2                               [Action, Crime, Thriller]  []  
3                                     [Mystery, Thriller]  []  
4                              [Crime, Mystery, Thriller]  []  
5                      [Action, Comedy, Horror, Thriller]  []  
...                                                

In [12]:
test_data = test_data.sort_values(['userId','movieId'])
train_data.to_csv('training_data.csv', index = False)
test_data.to_csv('testing_data.csv', index = False)

In [13]:
movies['genres'] = movies['genres'].str.split('|')
movies['genres'] = movies['genres'].apply(lambda d: d if isinstance(d, list) else [])
print(movies)

      movieId                                      title  \
0           1                           Toy Story (1995)   
1           2                             Jumanji (1995)   
2           3                    Grumpier Old Men (1995)   
3           4                   Waiting to Exhale (1995)   
4           5         Father of the Bride Part II (1995)   
...       ...                                        ...   
9737   193581  Black Butler: Book of the Atlantic (2017)   
9738   193583               No Game No Life: Zero (2017)   
9739   193585                               Flint (2017)   
9740   193587        Bungo Stray Dogs: Dead Apple (2018)   
9741   193609        Andrew Dice Clay: Dice Rules (1991)   

                                                 genres  
0     [Adventure, Animation, Children, Comedy, Fantasy]  
1                        [Adventure, Children, Fantasy]  
2                                     [Comedy, Romance]  
3                              [Comedy, Drama, 

In [14]:
import matplotlib.pyplot as plt
from ast import literal_eval

In [15]:
unique_genre = movies['genres'].explode().unique()
genre_distribution = train_data['genres'].explode().value_counts()
genre_dict = {k: v for v, k in enumerate(unique_genre)}
genre_dict

{'Adventure': 0,
 'Animation': 1,
 'Children': 2,
 'Comedy': 3,
 'Fantasy': 4,
 'Romance': 5,
 'Drama': 6,
 'Action': 7,
 'Crime': 8,
 'Thriller': 9,
 'Horror': 10,
 'Mystery': 11,
 'Sci-Fi': 12,
 'War': 13,
 'Musical': 14,
 'Documentary': 15,
 'IMAX': 16,
 'Western': 17,
 'Film-Noir': 18,
 '(no genres listed)': 19}

In [16]:
movies['movie_vector'] = ""
for ind, row in movies.iterrows():
    genres = row.genres
    movie_vector = np.zeros(len(genre_dict))

    for g in genres:
        movie_vector[genre_dict[g]] = 1
    movies.at[ind, 'movie_vector'] = movie_vector

# Save the final dataframe
movies.to_csv("movie_vector.csv")

In [17]:
user_ids = train_data['userId'].unique()
user_df = pd.DataFrame(columns=['userId', 'user_vector', 'avg_rating', 'num_movies_rated'])
                       
for user_id in user_ids:
#     print(user_id)
    user_rating_df = train_data[(train_data['userId'] == user_id)]
#     print(user_rating_df)

    user_vector = np.zeros(len(genre_dict))
    count_vector = np.zeros(len(genre_dict))
    
    user_avg_rating = 0
    movies_rated_count = 0
    
    for _, row in user_rating_df.iterrows():
        user_avg_rating += row.rating 
        movies_rated_count += 1
        genres = row.genres

        user_movie_vector = np.zeros(len(genre_dict))
        
        for g in genres:
            user_movie_vector[genre_dict[g]] = 1
            count_vector[genre_dict[g]] += 1
            
        user_vector += user_movie_vector*row.rating

    count_vector = np.where(count_vector==0, 1, count_vector)
    user_vector = np.divide(user_vector, count_vector)
    user_avg_rating /= movies_rated_count
    row_df = pd.DataFrame([[user_id, user_vector, user_avg_rating, movies_rated_count]], 
                          columns=['userId', 'user_vector', 'avg_rating', 'num_movies_rated'])
#     print(row_df)
    user_df = pd.concat([user_df, row_df], ignore_index=True)

In [18]:
user_df.to_csv("user_info.csv")

In [19]:
ratings_test = pd.read_csv("testing_data.csv", converters={"genres": literal_eval, "tag": literal_eval}) 
ratings_test.head()

Unnamed: 0,userId,movieId,rating,timestamp,genres,tag
0,1,3,4.0,964981247,"[Comedy, Romance]",[]
1,1,151,5.0,964984041,"[Action, Drama, Romance, War]",[]
2,1,223,3.0,964980985,[Comedy],[]
3,1,260,5.0,964981680,"[Action, Adventure, Sci-Fi]",[]
4,1,349,4.0,964982563,"[Action, Crime, Drama, Thriller]",[]


In [20]:
genres = user_rating_df[user_rating_df['movieId']==6].genres.values[0]
vector = np.zeros(len(genre_dict))
for g in genres:
    vector[genre_dict[g]] = 1
print(vector)

[0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]


In [21]:
user_df[user_df['userId']==1].user_vector[0]

array([4.41176471, 4.57142857, 4.43333333, 4.31746032, 4.30555556,
       4.44444444, 4.50909091, 4.34666667, 4.31578947, 4.2       ,
       3.42857143, 4.1875    , 4.17647059, 4.47368421, 4.5625    ,
       0.        , 0.        , 4.4       , 5.        , 0.        ])

In [22]:
x = vector*user_df[user_df['userId']==1].user_vector[0]
print(x)
np.nanmean(np.where(x!=0,x,np.nan)) 

[0.         0.         0.         0.         0.         0.
 0.         4.34666667 4.31578947 4.2        0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.        ]


4.2874853801169595

In [23]:
algo_predictions = pd.DataFrame(columns=['userId', 'movieId', 'user_vector', 'movie_vector', 'og_rating', 'pred_rating'])
for ind, row in ratings_test.iterrows():
    userId = row['userId']
    movieId = row['movieId']
    og_rating = row['rating']
#     print(userId)
#     print(movieId)
    
    try:
        user_vector = user_df[user_df['userId'] == int(userId)].user_vector.values[0]
        movie_vector = movies[movies['movieId'] == int(movieId)].movie_vector.values[0]
    #     print(user_vector)
    #     print(movie_vector)

        predicted_rating = user_vector*movie_vector
    #     print(predicted_rating)
        
        if predicted_rating.any():
            predicted_rating = np.nanmean(np.where(predicted_rating!=0, predicted_rating, np.nan)) 
    #     print(predicted_rating)
        else:
            predicted_rating = 0

        row_df = pd.DataFrame([[userId, movieId, user_vector, movie_vector, og_rating, predicted_rating]], 
                    columns=['userId', 'movieId', 'user_vector', 'movie_vector', 'og_rating', 'pred_rating'])
        algo_predictions = pd.concat([algo_predictions, row_df], ignore_index=True)
    except:
        print("User not found: ", userId)

In [24]:
print(algo_predictions)

      userId movieId                                        user_vector  \
0          1       3  [4.411764705882353, 4.571428571428571, 4.43333...   
1          1     151  [4.411764705882353, 4.571428571428571, 4.43333...   
2          1     223  [4.411764705882353, 4.571428571428571, 4.43333...   
3          1     260  [4.411764705882353, 4.571428571428571, 4.43333...   
4          1     349  [4.411764705882353, 4.571428571428571, 4.43333...   
...      ...     ...                                                ...   
20163    610  160341  [3.727906976744186, 3.962962962962963, 3.64444...   
20164    610  160836  [3.727906976744186, 3.962962962962963, 3.64444...   
20165    610  162350  [3.727906976744186, 3.962962962962963, 3.64444...   
20166    610  163981  [3.727906976744186, 3.962962962962963, 3.64444...   
20167    610  170875  [3.727906976744186, 3.962962962962963, 3.64444...   

                                            movie_vector  og_rating  \
0      [0.0, 0.0, 0.0, 1.0, 

In [25]:
rmse = ((algo_predictions.og_rating - algo_predictions.pred_rating) ** 2).mean() ** .5
rmse

0.9206736741830838

In [26]:
mae = (((algo_predictions.og_rating - algo_predictions.pred_rating) ** 2) ** .5).mean()
mae

0.7117957531962288