In [7]:
import pandas as pd
import numpy as np
import plotly.express as px
from wordcloud import WordCloud
import re
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import datetime

import os 

# Colour palette for the charts, as RGB strings.
k_colors = [
    'rgb(0,127,206)',   # blue
    'rgb(62,196,4)',    # green
    'rgb(255,125,16)',  # orange
    'rgb(253,99,90)',   # red
]

# Plotly template names to choose from; index 1 selects "plotly_white"
# ("plotly_dark" at index 2 is the dark alternative).
plotly_themes = [
    "plotly",
    "plotly_white",
    "plotly_dark",
    "ggplot2",
    "seaborn",
    "simple_white",
    "none",
]
theme = plotly_themes[1]


In [1]:
# Locate the project's data/ and code/ folders from the notebook's working directory.
notebook_path = os.getcwd()
# NOTE(review): the slice drops the last 25 characters of the working directory.
# Presumably that strips the notebook's own sub-folder and leaves the project
# root with a trailing separator -- confirm, because the hard-coded length
# silently points elsewhere if the notebook is moved or its folder is renamed.
project_root_prefix = notebook_path[0:len(notebook_path)-25]
project_path = project_root_prefix + "data/"
project_code_path = project_root_prefix + "code/"

# Put the project's code folder on the module search path.
import sys
sys.path.insert(1, project_code_path)

from movies import Movies
from genres import Genres
from tags import Tags
from users import Users

## Reading the data

In [4]:
# Read the movies table from the project's data folder and preview its first row.
movies = pd.read_csv(os.path.join(project_path, 'movie.csv'))
movies.head(1)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy


In [5]:
# Read the filtered tags table from the project's data folder and preview its first row.
tag = pd.read_csv(os.path.join(project_path, 'tag_filtered.csv'))
tag.head(1)

Unnamed: 0,userId,movieId,tag,timestamp,year_month,processed_tag
0,65,208,dark hero,2013-05-10 01:41:18,201305,dark hero


In [6]:
# Read the filtered ratings and add an integer period key: year * 100 + month
# (e.g. 201502 for February 2015).
ratings = pd.read_csv(project_path+'ratings_filtered.csv')
# Parse the timestamp column once and reuse it for both the year and the month
# instead of building the DatetimeIndex twice.
rating_times = pd.DatetimeIndex(ratings['timestamp'])
ratings["year_month"] = rating_times.year*100 + rating_times.month
ratings.head(1)

Unnamed: 0,userId,movieId,rating,timestamp,year_month
0,31,1,3.0,2015-02-23 23:18:07,201502


## Generating the movies general vars and history variables

In [8]:
%time Movies_obj = Movies(movies,ratings)
movies_vars = Movies_obj.df_movies.iloc[:, ~Movies_obj.df_movies.columns.isin(["genres", "year", "genre_list","title"])]
movies_history_vars = Movies_obj.df_movies_history_ratings.iloc[:, ~Movies_obj.df_movies_history_ratings.columns.isin(["count", "mean_rating", "median_rating"])]

CPU times: user 31.2 s, sys: 928 ms, total: 32.1 s
Wall time: 33.4 s


In [9]:
# Preview the first row of movies_vars to check which columns were kept.
movies_vars.head(1)

Unnamed: 0,movieId,Sci-Fi,Horror,Action,Musical,Documentary,Romance,Film-Noir,Crime,Animation,Drama,Adventure,Mystery,Fantasy,War,IMAX,Comedy,Children,Thriller,Western
0,1,0,0,0,0,0,0,0,0,1,0,1,0,1,0,0,1,1,0,0


In [10]:
# Preview the first row of movies_history_vars to check which columns were kept.
movies_history_vars.head(1)

Unnamed: 0,movieId,year_month,last_1_months_movie_rating,last_1_months_movie_rating_counts,mean_3_months_movie_rating,mean_6_months_movie_rating,last_1_month_per_3_months_movie_rating,last_1_month_per_6_months_movie_rating
0,1,201301,,,,,,


## Generating the genre history variables

In [12]:
%time Genre_obj = Genres(Movies_obj.df_movies,Movies_obj.df_movies_history_ratings)
genre_vars = Genre_obj.genres_df.iloc[:, ~Genre_obj.genres_df.columns.isin(["count", "mean_rating", "median_rating"])]

CPU times: user 985 ms, sys: 589 ms, total: 1.57 s
Wall time: 2.05 s


In [13]:
# Preview the first row of genre_vars to check which columns were kept.
genre_vars.head(1)

Unnamed: 0,movieId,year_month,last_1_months_genre_rating,last_1_months_genre_rating_counts,mean_3_months_genre_rating,mean_6_months_genre_rating,last_1_month_per_3_months_genre_rating,last_1_month_per_6_months_genre_rating
0,1,201301,,,,,,


## Generating the users history variables

In [14]:
%time Users_obj = Users(ratings)
users_vars = Users_obj.users_df.iloc[:, ~Users_obj.users_df.columns.isin(["count", "mean_rating", "median_rating"])]

CPU times: user 2.51 s, sys: 172 ms, total: 2.68 s
Wall time: 2.86 s


In [15]:
# Preview the first row of users_vars to check which columns were kept.
users_vars.head(1)

Unnamed: 0,userId,year_month,last_1_months_user_rating,last_1_months_user_rating_counts,mean_3_months_user_rating,mean_6_months_user_rating,last_1_month_per_3_months_user_rating,last_1_month_per_6_months_user_rating
0,31,201502,,,,,,


## Generating the tags weighted rating variable

In [16]:
# Build the tag feature table from the tags and ratings; %time reports the cost
# (about 2 min 43 s wall time in the recorded run, the slowest step shown).
%time Tags_obj = Tags(tag,ratings)
# Plain attribute access: no copy is made on this line.
# NOTE(review): if tags_df is a regular attribute, tags_vars and
# Tags_obj.tags_df are the same object, so edits to one show in the other.
tags_vars = Tags_obj.tags_df

CPU times: user 2min 35s, sys: 2.97 s, total: 2min 38s
Wall time: 2min 43s


In [19]:
# Cast year_month to an integer column, assigning the result back into tags_vars.
# NOTE(review): presumably needed so this key has the same type as the integer
# year_month in ratings when the tables are merged -- confirm what dtype Tags produces.
tags_vars["year_month"] = tags_vars["year_month"].astype(int)
tags_vars.head(1)

Unnamed: 0,movieId,year_month,last_6_months_weighted_rating
0,1,201405,3.827056


## Merging all variables

In [22]:
# Base table: ratings with year_month after 201410 (i.e. from November 2014 on),
# labelled with the binary target and without the raw timestamp.
# Built as one chain that returns a new frame, so no column is ever assigned on
# a filtered slice of `ratings` (which triggered SettingWithCopyWarning).
ratings_to_join = (
    ratings.loc[ratings["year_month"] > 201410]
    # target_var is 1 when the rating is 4.0 or higher, otherwise 0
    .assign(target_var=lambda frame: (frame["rating"] >= 4.0).astype("int64"))
    .drop(columns=["timestamp"], errors="ignore")
)

# Left-join every feature table so each rating row is kept even when a feature
# is missing. The join keys are spelled out instead of relying on whichever
# column names happen to coincide; they are the shared columns visible in the
# previews of each table.
ratings_to_train = (
    ratings_to_join
    .merge(movies_vars, how="left", on="movieId")
    .merge(movies_history_vars, how="left", on=["movieId", "year_month"])
    .merge(tags_vars, how="left", on=["movieId", "year_month"])
    .merge(genre_vars, how="left", on=["movieId", "year_month"])
    .merge(users_vars, how="left", on=["userId", "year_month"])
)

# example of a user whose variables are filled in (presumably): users_vars[users_vars.userId==96]



For any variable that is missing, let's substitute it with that variable's mean within the same month (`year_month`).

In [23]:
# Fill each missing value with the mean of its column within the same year_month.
# transform("mean") returns a frame aligned row-for-row with the input, so the
# grouping column is never passed through a user function (groupby.apply acting
# on grouping columns is deprecated in recent pandas) and the rows keep their
# input order regardless of pandas version.
monthly_means = ratings_to_train.groupby("year_month", sort=False).transform("mean")
# Only NaN cells are replaced; a column that is entirely missing in a month stays NaN.
ratings_to_train = ratings_to_train.fillna(monthly_means).reset_index(drop=True)

In [25]:
# Show the dtype of every column in the training table after imputation.
ratings_to_train.dtypes

userId                                      int64
movieId                                     int64
rating                                    float64
year_month                                  int64
target_var                                  int64
Sci-Fi                                      int64
Horror                                      int64
Action                                      int64
Musical                                     int64
Documentary                                 int64
Romance                                     int64
Film-Noir                                   int64
Crime                                       int64
Animation                                   int64
Drama                                       int64
Adventure                                   int64
Mystery                                     int64
Fantasy                                     int64
War                                         int64
IMAX                                        int64


Let's save the dataset so we can start modeling.

In [26]:
# Save the training table as a pickle for the modelling step.
# NOTE(review): the file name spells "cleaned" as "clened"; it is left unchanged
# because anything that loads this pickle would need the same name.
# NOTE(review): this path is relative to the working directory, while the input
# files are read through project_path -- confirm both resolve to the same data folder.
training_set_path = "../../data/clened_df_toTrain.pkl"
ratings_to_train.to_pickle(training_set_path)