# Tags

In [55]:
import pandas as pd
import missingno as msno
import numpy as np

In [56]:
# Load the raw tag records from the CSV file into a DataFrame.
tag_csv_path = r"../data/tag.csv"
df = pd.read_csv(tag_csv_path)

In [57]:
# Normalise the column labels: lower-case them, then trim surrounding whitespace.
df.columns = df.columns.str.lower().str.strip()
df.head()

Unnamed: 0,userid,movieid,tag,timestamp
0,18,4141,Mark Waters,2009-04-24 18:19:40
1,65,208,dark hero,2013-05-10 01:41:18
2,65,353,dark hero,2013-05-10 01:41:19
3,65,521,noir thriller,2013-05-10 01:39:43
4,65,592,dark hero,2013-05-10 01:41:18


In [58]:
# Total number of records
df.shape[0]

465564

In [59]:
# Count the missing values in each column to check for nulls
df.isnull().sum()

userid        0
movieid       0
tag          16
timestamp     0
dtype: int64

In [60]:
# Convert the `timestamp` column from `object` dtype to datetime
parsed_timestamps = pd.to_datetime(df["timestamp"])
df["timestamp"] = parsed_timestamps

In [61]:
# Print the frame summary: column dtypes, non-null counts and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 465564 entries, 0 to 465563
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype         
---  ------     --------------   -----         
 0   userid     465564 non-null  int64         
 1   movieid    465564 non-null  int64         
 2   tag        465548 non-null  object        
 3   timestamp  465564 non-null  datetime64[ns]
dtypes: datetime64[ns](1), int64(2), object(1)
memory usage: 14.2+ MB


In [62]:
# Show the rows whose `tag` value is missing
missing_tag = df["tag"].isna()
df[missing_tag]

Unnamed: 0,userid,movieid,tag,timestamp
373276,116460,123,,2008-01-04 12:47:47
373277,116460,346,,2008-01-04 13:05:46
373281,116460,1184,,2008-01-04 13:11:01
373288,116460,1785,,2008-01-04 13:06:46
373289,116460,2194,,2008-01-04 12:44:37
373291,116460,2691,,2008-01-04 12:50:02
373299,116460,4103,,2008-01-04 13:05:20
373301,116460,4473,,2008-01-04 12:50:40
373303,116460,4616,,2008-01-04 13:14:01
373319,116460,7624,,2008-01-04 13:11:06


In [63]:
# Drop the rows whose `tag` is null
df = df.dropna(axis="index", subset=["tag"])

In [64]:
# Count the missing values per column to confirm that none remain
df.isnull().sum()

userid       0
movieid      0
tag          0
timestamp    0
dtype: int64

In [65]:
# Number of rows currently in the frame
len(df.index)

465548

In [66]:
# Difference between the two hard-coded row counts (values copied from the
# `len(df)` outputs shown before and after the null drop).
rows_before = 465564
rows_after = 465548
rows_before - rows_after

16

In [67]:
# Discard every row whose tag parses as a number. Tags that cannot be parsed
# are coerced to NaN by `errors='coerce'` and are therefore kept.
tag_as_number = pd.to_numeric(df['tag'], errors='coerce')
es_numero = tag_as_number.notna()
df = df[~es_numero]
df

Unnamed: 0,userid,movieid,tag,timestamp
0,18,4141,Mark Waters,2009-04-24 18:19:40
1,65,208,dark hero,2013-05-10 01:41:18
2,65,353,dark hero,2013-05-10 01:41:19
3,65,521,noir thriller,2013-05-10 01:39:43
4,65,592,dark hero,2013-05-10 01:41:18
...,...,...,...,...
465559,138446,55999,dragged,2013-01-23 23:29:32
465560,138446,55999,Jason Bateman,2013-01-23 23:29:38
465561,138446,55999,quirky,2013-01-23 23:29:38
465562,138446,55999,sad,2013-01-23 23:29:32


In [68]:
# Keep only the rows whose tag is a Python `str` instance
tag_is_text = df['tag'].apply(lambda value: isinstance(value, str))
df_strings = df[tag_is_text]

In [69]:
# Strip every character that is not an ASCII letter or digit from each tag.
# This also removes spaces, so multi-word tags are concatenated.
# `assign` builds a new frame instead of writing a column into `df_strings`
# in place: `df_strings` comes from boolean-indexing `df`, and item assignment
# on such a slice can trigger pandas' SettingWithCopyWarning.
df_strings = df_strings.assign(
    tag=df_strings['tag'].str.replace(r'[^a-zA-Z0-9]', '', regex=True)
)

In [70]:
# Display `df_strings` to inspect the tag values.
df_strings

Unnamed: 0,userid,movieid,tag,timestamp
0,18,4141,MarkWaters,2009-04-24 18:19:40
1,65,208,darkhero,2013-05-10 01:41:18
2,65,353,darkhero,2013-05-10 01:41:19
3,65,521,noirthriller,2013-05-10 01:39:43
4,65,592,darkhero,2013-05-10 01:41:18
...,...,...,...,...
465559,138446,55999,dragged,2013-01-23 23:29:32
465560,138446,55999,JasonBateman,2013-01-23 23:29:38
465561,138446,55999,quirky,2013-01-23 23:29:38
465562,138446,55999,sad,2013-01-23 23:29:32


In [71]:
def join_unique_tags(tags):
    """Return the distinct values of ``tags`` joined into one ``', '``-separated string.

    Duplicates are removed with ``pd.unique``, which returns the values in
    order of first appearance. Every value must be a string, because the
    result is built with ``str.join``.
    """
    distinct_tags = pd.unique(tags)
    return ', '.join(distinct_tags)

In [72]:
# Collapse the tags to one entry per movie: each `movieid` maps to its
# distinct tags joined into a single string.
tags_by_movie = df_strings.groupby('movieid')['tag']
df = tags_by_movie.apply(join_unique_tags)

In [75]:
# Save the dimension table
partial_tags_path = r"../data/partial_tags.csv"
df.to_csv(partial_tags_path)