# Movies

In [1]:
import pandas as pd
import missingno as msno
import numpy as np

In [2]:
# Load the movie table (movieid, title, genres) from a path relative to the notebook.
df = pd.read_csv(filepath_or_buffer=r"../data/process/movie.csv")


In [4]:
# Normalise the column labels in one pass: lowercase them, then trim
# leading/trailing whitespace.
df.columns = df.columns.str.lower().str.strip()
df.head()

Unnamed: 0,movieid,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [5]:
# Total number of records (rows) in the movie table
df.shape[0]

27278

In [6]:
# Count missing values per column to confirm there are none
df.isnull().sum()

movieid    0
title      0
genres     0
dtype: int64

In [7]:
# Work on a copy so `df` itself is left untouched by this step.
movie = df.copy()
# One-hot encode the genres: every '|'-separated label in `genres` becomes its
# own 0/1 column.
# NOTE(review): in the visible part of this notebook `genres_dummies` is only
# previewed; it is not joined into the table that gets saved.
genres_dummies = movie["genres"].str.get_dummies(sep="|")

In [8]:
# Preview the first five rows of the one-hot encoded genre columns
genres_dummies.head(n=5)

Unnamed: 0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,0,0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
3,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0
4,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [6]:
# Load the rating table (movieid, rating_promedio, rating_conteo).
rating = pd.read_csv(filepath_or_buffer=r"../data/partial_rating.csv")

In [7]:
# Preview the first five rows of the rating table
rating.head(n=5)

Unnamed: 0,movieid,rating_promedio,rating_conteo
0,1,3.92124,49695
1,2,3.211977,22243
2,3,3.15104,12735
3,4,2.861393,2756
4,5,3.064592,12161


In [8]:
# Attach the rating columns to every movie. The left join keeps every row of
# `df`; movies with no match in `rating` get NaN in the rating columns.
# validate="many_to_one" makes pandas raise MergeError when `rating` contains
# more than one row for a movieid, instead of silently duplicating movie rows.
# NOTE: this cell reassigns `df`, so running it twice merges the rating columns
# again and produces suffixed duplicates (_x / _y).
df = pd.merge(df, rating, on="movieid", how="left", validate="many_to_one")

In [9]:
# Preview the movie table with the rating columns attached
df.head(n=5)

Unnamed: 0,movieid,title,genres,rating_promedio,rating_conteo
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3.92124,49695.0
1,2,Jumanji (1995),Adventure|Children|Fantasy,3.211977,22243.0
2,3,Grumpier Old Men (1995),Comedy|Romance,3.15104,12735.0
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,2.861393,2756.0
4,5,Father of the Bride Part II (1995),Comedy,3.064592,12161.0


In [10]:
# Load the tag table; it is joined to the movies on movieid in the next step.
tags = pd.read_csv(filepath_or_buffer=r"../data/partial_tags.csv")

In [11]:
# Attach the `tag` column to every movie. The left join keeps every row of
# `df`; movies with no match in `tags` get NaN in `tag`.
# validate="many_to_one" makes pandas raise MergeError when `tags` contains
# more than one row for a movieid, instead of silently duplicating movie rows.
# NOTE: this cell reassigns `df`, so running it twice merges `tag` again and
# produces suffixed duplicates (tag_x / tag_y).
df = pd.merge(df, tags, on="movieid", how="left", validate="many_to_one")

In [12]:
# Preview the movie table with ratings and tags attached
df.head(n=5)

Unnamed: 0,movieid,title,genres,rating_promedio,rating_conteo,tag
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3.92124,49695.0,"Watched, computeranimation, Disneyanimatedfeat..."
1,2,Jumanji (1995),Adventure|Children|Fantasy,3.211977,22243.0,"timetravel, adaptedfrombook, boardgame, childh..."
2,3,Grumpier Old Men (1995),Comedy|Romance,3.15104,12735.0,"oldpeoplethatisactuallyfunny, sequelfever, gru..."
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,2.861393,2756.0,"chickflick, revenge, characters, CLV"
4,5,Father of the Bride Part II (1995),Comedy,3.064592,12161.0,"DianeKeaton, family, sequel, SteveMartin, wedd..."


In [13]:
# Promote movieid from a regular column to the row index.
# NOTE: not re-runnable as-is; once movieid is the index, a second run raises
# KeyError because the column no longer exists.
df = df.set_index(keys="movieid")


In [14]:
# Preview the table now indexed by movieid
df.head(n=5)

Unnamed: 0_level_0,title,genres,rating_promedio,rating_conteo,tag
movieid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3.92124,49695.0,"Watched, computeranimation, Disneyanimatedfeat..."
2,Jumanji (1995),Adventure|Children|Fantasy,3.211977,22243.0,"timetravel, adaptedfrombook, boardgame, childh..."
3,Grumpier Old Men (1995),Comedy|Romance,3.15104,12735.0,"oldpeoplethatisactuallyfunny, sequelfever, gru..."
4,Waiting to Exhale (1995),Comedy|Drama|Romance,2.861393,2756.0,"chickflick, revenge, characters, CLV"
5,Father of the Bride Part II (1995),Comedy,3.064592,12161.0,"DianeKeaton, family, sequel, SteveMartin, wedd..."


In [15]:
# Save the fact table; the movieid index is written as the first CSV column.
df.to_csv(path_or_buf=r"../data/h_movie.csv")