# Rating

In [107]:
import pandas as pd
import missingno as msno
import numpy as np

In [108]:
# Load the ratings CSV into a DataFrame (the path is relative, one directory up under `data/`).
rating = pd.read_csv(r"../data/rating.csv")

In [109]:
# Normalize the column labels in one pass: lower-case them, then trim
# any surrounding whitespace.
rating.columns = rating.columns.str.lower().str.strip()
rating.head()

Unnamed: 0,userid,movieid,rating,timestamp
0,1,2,3.5,2005-04-02 23:53:47
1,1,29,3.5,2005-04-02 23:31:16
2,1,32,3.5,2005-04-02 23:33:39
3,1,47,3.5,2005-04-02 23:32:07
4,1,50,3.5,2005-04-02 23:29:40


In [110]:
# Total number of records
len(rating)

20000263

In [111]:
# Verify that there are no null values (count of nulls per column)
rating.isna().sum()

userid       0
movieid      0
rating       0
timestamp    0
dtype: int64

In [112]:
# Convert `timestamp` from `object` (string) dtype to datetime.
# The explicit format documents the expected layout (e.g. "2005-04-02 23:53:47")
# and removes the dependence on format inference: a value that does not match
# raises instead of being parsed ambiguously.
rating["timestamp"] = pd.to_datetime(rating["timestamp"], format="%Y-%m-%d %H:%M:%S")

In [113]:
# Preview the first rows after the timestamp conversion.
rating.head()

Unnamed: 0,userid,movieid,rating,timestamp
0,1,2,3.5,2005-04-02 23:53:47
1,1,29,3.5,2005-04-02 23:31:16
2,1,32,3.5,2005-04-02 23:33:39
3,1,47,3.5,2005-04-02 23:32:07
4,1,50,3.5,2005-04-02 23:29:40


In [114]:
# Inspect the column dtypes and the memory footprint of the frame.
rating.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000263 entries, 0 to 20000262
Data columns (total 4 columns):
 #   Column     Dtype         
---  ------     -----         
 0   userid     int64         
 1   movieid    int64         
 2   rating     float64       
 3   timestamp  datetime64[ns]
dtypes: datetime64[ns](1), float64(1), int64(2)
memory usage: 610.4 MB


In [115]:
# Derive the calendar `year` and `month` columns from `timestamp`.
timestamp_parts = rating["timestamp"].dt
rating["year"] = timestamp_parts.year
rating["month"] = timestamp_parts.month
rating.head()

Unnamed: 0,userid,movieid,rating,timestamp,year,month
0,1,2,3.5,2005-04-02 23:53:47,2005,4
1,1,29,3.5,2005-04-02 23:31:16,2005,4
2,1,32,3.5,2005-04-02 23:33:39,2005,4
3,1,47,3.5,2005-04-02 23:32:07,2005,4
4,1,50,3.5,2005-04-02 23:29:40,2005,4


In [119]:
# Overall mean rating and number of ratings per movie.
# The grouping is built once and reused for both aggregations, so the full
# ratings table is not grouped twice.
rating_por_pelicula = rating.groupby(["movieid"])["rating"]
rating_movies_promedio = rating_por_pelicula.mean().reset_index()
rating_movies_conteo = rating_por_pelicula.count().reset_index()


In [117]:
# Mean rating per movie, computed globally across all years
rating_movies_promedio.head()

Unnamed: 0,movieid,rating
0,1,3.92124
1,2,3.211977
2,3,3.15104
3,4,2.861393
4,5,3.064592


In [120]:
# Number of ratings per movie, computed globally across all years
rating_movies_conteo.head()

Unnamed: 0,movieid,rating
0,1,49695
1,2,22243
2,3,12735
3,4,2756
4,5,12161


In [121]:
# Join the per-movie mean and count frames on `movieid`.
# Both sides carry a `rating` column, so the result exposes them as
# `rating_x` (mean) and `rating_y` (count).
rating_promedio_conteo = rating_movies_promedio.merge(
    rating_movies_conteo,
    how="left",
    on="movieid",
)

In [122]:
# Preview the merged frame (mean and count per movie).
rating_promedio_conteo.head()

Unnamed: 0,movieid,rating_x,rating_y
0,1,3.92124,49695
1,2,3.211977,22243
2,3,3.15104,12735
3,4,2.861393,2756
4,5,3.064592,12161


In [123]:
# Replace the merge suffixes with descriptive column names.
movie_column_names = {"rating_x": "rating_promedio", "rating_y": "rating_conteo"}
rating_promedio_conteo = rating_promedio_conteo.rename(columns=movie_column_names)

In [124]:
# Preview the frame with the renamed columns.
rating_promedio_conteo.head()

Unnamed: 0,movieid,rating_promedio,rating_conteo
0,1,3.92124,49695
1,2,3.211977,22243
2,3,3.15104,12735
3,4,2.861393,2756
4,5,3.064592,12161


In [125]:
# Sanity check: the per-movie counts should add up to the total number of records
rating_promedio_conteo["rating_conteo"].sum()
# This is OK

np.int64(20000263)

### Esto es para una tabla de dimensión `d_rating`
Contendrá los ratings de cada película agrupados por usuario, año y mes.

In [136]:
# Mean rating and number of ratings for every (movie, user, year, month) combination.
group_keys = ["movieid", "userid", "year", "month"]
grupo = rating.groupby(group_keys)["rating"]
rating_movies_year_month_promedio = grupo.agg("mean").reset_index()
rating_movies_year_month_conteo = grupo.agg("count").reset_index()

In [137]:
# This should be a dimension table
rating_movies_year_month_promedio.head()

Unnamed: 0,movieid,userid,year,month,rating
0,1,3,1999,12,4.0
1,1,6,1997,3,5.0
2,1,8,1996,6,4.0
3,1,10,1999,11,4.0
4,1,11,2009,1,4.5


In [138]:
# Preview the rating counts per (movie, user, year, month) group.
rating_movies_year_month_conteo.head()

Unnamed: 0,movieid,userid,year,month,rating
0,1,3,1999,12,1
1,1,6,1997,3,1
2,1,8,1996,6,1
3,1,10,1999,11,1
4,1,11,2009,1,1


In [139]:
# Join the grouped mean and count frames on the full grouping key.
# The two `rating` columns come out as `rating_x` (mean) and `rating_y` (count).
merge_keys = ["movieid", "userid", "year", "month"]
dim_rating = rating_movies_year_month_promedio.merge(
    rating_movies_year_month_conteo,
    how="left",
    on=merge_keys,
)

In [140]:
# Preview the merged frame before renaming its columns.
dim_rating.head()

Unnamed: 0,movieid,userid,year,month,rating_x,rating_y
0,1,3,1999,12,4.0,1
1,1,6,1997,3,5.0,1
2,1,8,1996,6,4.0,1
3,1,10,1999,11,4.0,1
4,1,11,2009,1,4.5,1


In [141]:
# Replace the merge suffixes with descriptive column names.
dim_column_names = {"rating_x": "promedio", "rating_y": "conteo"}
dim_rating = dim_rating.rename(columns=dim_column_names)

In [142]:
# Show the first 30 rows of the renamed frame.
dim_rating.head(30)

Unnamed: 0,movieid,userid,year,month,promedio,conteo
0,1,3,1999,12,4.0,1
1,1,6,1997,3,5.0,1
2,1,8,1996,6,4.0,1
3,1,10,1999,11,4.0,1
4,1,11,2009,1,4.5,1
5,1,12,1997,3,4.0,1
6,1,13,1996,11,4.0,1
7,1,14,2008,10,4.5,1
8,1,16,2001,5,3.0,1
9,1,19,1997,2,5.0,1
