In [149]:
from csv import reader
from math import sqrt
import pandas as pd
import pickle

# Reading the CSV files


In [151]:
# Columns each later step needs from the three CSV files.
movie_columns = ['movieId', 'title', 'genres']
tag_columns = ['movieId', 'userId', 'tag']
ratings_columns = ['movieId', 'userId', 'rating', 'timestamp']

# `usecols` lets the parser skip columns that are not requested instead of
# loading the whole file and discarding them afterwards.  The parser returns
# the selected columns in file order, so the trailing `[...]` selection keeps
# the column order defined by the lists above.
movies_df = pd.read_csv("../data/movies.csv", usecols=movie_columns)[movie_columns]
tags_df = pd.read_csv("../data/tags.csv", usecols=tag_columns)[tag_columns]
ratings_df = pd.read_csv("../data/ratings.csv", usecols=ratings_columns)[ratings_columns]

# Encoding string columns as integer category codes

In [152]:
# Decoding tables: for every string column that gets integer-encoded, map
# integer position in the column's category index -> original string value.
# NOTE(review): missing values are not part of `cat.categories`; confirm how
# rows with a missing tag should decode.
category = {
    column_name: dict(enumerate(values.astype('category').cat.categories))
    for column_name, values in (
        ("title", movies_df['title']),
        ("genres", movies_df['genres']),
        ("tag", tags_df['tag']),
    )
}

In [153]:
# Overwrite each string column with its integer category code.  The codes are
# derived from the same column values as the `category` lookup tables, which is
# what the decode check later in the notebook relies on.
movies_df['title'] = pd.Categorical(movies_df['title']).codes
movies_df['genres'] = pd.Categorical(movies_df['genres']).codes
tags_df['tag'] = pd.Categorical(tags_df['tag']).codes

# Joining ratings, tags and movies

In [154]:
# Collect all tag values a user gave a movie into one list per
# (movieId, userId) pair, so the join below matches at most one tag row per rating.
tags_grouped_df = tags_df.groupby(['movieId', 'userId'], as_index=False)['tag'].agg(list)

# Left join: every rating is kept; ratings without tags get a missing `tag`.
# validate='many_to_one' makes pandas raise MergeError if the right-hand keys
# are not unique, instead of silently duplicating rating rows.
df_tag_rating = ratings_df.merge(
    tags_grouped_df, how='left', on=['movieId', 'userId'], validate='many_to_one'
)

# Inner join: attach the title/genres columns of movies_df and drop ratings
# whose movieId does not appear in movies_df.
df_movies_tag_rating = df_tag_rating.merge(
    movies_df, how='inner', on=['movieId'], validate='many_to_one'
)

In [155]:
# Rename the ratings timestamp so its origin stays clear in the joined table.
df_movies_tag_rating = df_movies_tag_rating.rename(columns={'timestamp': 'timestamp_rating'})

# Copy of the joined table with an explicit column order.
df_movies = df_movies_tag_rating.loc[
    :, ['movieId', 'userId', 'rating', 'timestamp_rating', 'tag', 'title', 'genres']
]

# Validating the results against the raw CSV files

In [159]:
# Re-read the untouched CSV under its own name.  Rebinding `movies_df` here
# would replace the integer-encoded frame with raw strings, so re-running the
# join cell afterwards would merge un-encoded titles and genres.
movies_raw_df = pd.read_csv("../data/movies.csv")[movie_columns]
# Raw row for movieId 1, to compare with the decoded title/genres further down.
movies_raw_df.loc[movies_raw_df['movieId'] == 1]

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy


In [165]:
# Re-read the untouched CSV under its own name so the integer-encoded
# `tags_df` used by the join is not overwritten with raw strings.
tags_raw_df = pd.read_csv("../data/tags.csv")[tag_columns]
# Filter on BOTH keys of the joined row being validated (movieId 1, userId 791);
# filtering on userId alone would return whichever tag that user happens to
# have first in the file, which need not belong to movie 1.
tags_raw_df.loc[(tags_raw_df['movieId'] == 1) & (tags_raw_df['userId'] == 791)].iloc[0]['tag']

'Owned'

In [157]:
# Joined row for movieId 1 rated by userId 791; its integer codes are decoded
# through `category` to check them against the raw CSV values.
df_movies_tag_rating.loc[
    df_movies_tag_rating['movieId'].eq(1) & df_movies_tag_rating['userId'].eq(791)
]

Unnamed: 0,movieId,userId,rating,timestamp_rating,tag,title,genres
601448,1,791,4.5,1515175489,[19014],57003,594


In [158]:
# Decode the integer codes shown in the joined row (tag 19014, title 57003,
# genres 594) back to their original strings via the lookup tables.
print(
    f"TAG: {category['tag'][19014]}",
    f"TITLE: {category['title'][57003]}",
    f"GENRE: {category['genres'][594]}",
    sep="\n",
)

TAG: Owned
TITLE: Toy Story (1995)
GENRE: Adventure|Animation|Children|Comedy|Fantasy


# Save

In [166]:
# Write the joined ratings/tags/movies table (with integer-encoded tag, title
# and genres columns) to a Parquet file.
df_movies_tag_rating.to_parquet('../results/entity.parquet')

In [167]:
# Persist the code -> original-string lookup tables; they are needed to decode
# the integer columns stored in the Parquet file.
with open('../results/category.pickle', mode='wb') as handle:
    pickle.dump(category, handle, pickle.HIGHEST_PROTOCOL)

In [148]:
# Read the saved table back and display it to confirm the file is readable.
pd.read_parquet(path='../results/entity.parquet')

Unnamed: 0,movieId,userId,rating,timestamp_rating,tag,title,genres
0,296,1,5.0,1147880044,,39812,1160
1,296,3,5.0,1439474476,,39812,1160
2,296,4,4.0,1573938898,,39812,1160
3,296,5,4.0,830786155,,39812,1160
4,296,7,4.0,835444730,,39812,1160
...,...,...,...,...,...,...,...
25000090,200192,162358,2.0,1553453039,,13650,0
25000091,200194,162358,2.0,1553453843,,56951,216
25000092,139970,162386,3.5,1549215965,,23941,1136
25000093,200726,162386,4.0,1554651417,,51178,1087


In [168]:
# Load the lookup tables back from disk to confirm the pickle round-trips.
# pickle.load can execute arbitrary code, so only use it on files produced by
# this notebook (or another trusted source).
with open('../results/category.pickle', mode='rb') as handle:
    b = pickle.load(handle)

In [169]:
# Display the reloaded lookup tables: one dict per encoded column, mapping
# integer code -> original string.
b

{'title': {0: '"BLOW THE NIGHT!" Let\'s Spend the Night Together (1983)',
  1: '"Great Performances" Cats (1998)',
  2: '#1 Cheerleader Camp (2010)',
  3: '#Captured (2017)',
  4: '#Female Pleasure (2018)',
  5: '#FollowMe (2019)',
  6: '#Horror (2015)',
  7: '#Lucky Number (2015)',
  8: '#SCREAMERS (2016)',
  9: '#SquadGoals (2018)',
  10: '#Stuck (2014)',
  11: '#realityhigh (2017)',
  12: '$ (Dollars) (1971)',
  13: '$1,000 on the Black (1966)',
  14: '$100,000 for Ringo (1965)',
  15: '$5 a Day (2008)',
  16: '$50K and a Call Girl: A Love Story (2014)',
  17: '$9.99 (2008)',
  18: '$ellebrity (Sellebrity) (2012)',
  19: "$uperthief: Inside America's Biggest Bank Score (2012)",
  20: "'49-'17 (1917)",
  21: "'63 Boycott (2016)",
  22: "'68 (1988)",
  23: "'71 (2014)",
  24: "'Gator Bait (1974)",
  25: "'Hellboy': The Seeds of Creation (2004)",
  26: "'Human' Factor, The (Human Factor, The) (1975)",
  27: "'Master Harold' ... And the Boys (2010)",
  28: "'Master Harold'... and the Bo