In [43]:
#__author__: Robert Moreno Carrillo
#__Updated at__: 20/07/2021

# Taller TAWS
# Ejercicio de pandas y manipulación de fechas

In [87]:
import warnings
warnings.filterwarnings('ignore')

In [51]:
import numpy as np
import pandas as pd

In [52]:
# Lectura dedataframe
df = pd.read_csv("tweets_sample.csv")

In [53]:
df.head(2)

Unnamed: 0,created_at,user,followers_count,location,retweet_count,retweeted,userRT,full_text
0,2021-06-28 01:34:38,Retweet21454943,115.0,,14.0,0.0,clementevazce,RT El mejor jugador de Ecuador en los últimos ...
1,2021-06-28 00:55:51,Jeffsh_ec,28.0,,1.0,0.0,,"- Hernan Galindez: ""Perdón por este mensaje pe..."


In [56]:
df.shape

(1000, 8)

In [57]:
df.columns

Index(['created_at', 'user', 'followers_count', 'location', 'retweet_count',
       'retweeted', 'userRT', 'full_text'],
      dtype='object')

In [58]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   created_at       974 non-null    object 
 1   user             974 non-null    object 
 2   followers_count  974 non-null    float64
 3   location         744 non-null    object 
 4   retweet_count    974 non-null    float64
 5   retweeted        974 non-null    float64
 6   userRT           649 non-null    object 
 7   full_text        974 non-null    object 
dtypes: float64(3), object(5)
memory usage: 62.6+ KB


In [67]:
# Borrar valores nulos...
## df.dropna()
df = df.dropna(subset = ['full_text'])

## Aplicando lo aprendido

In [88]:
# Remover hashtags del campo full_text
def remove_hashtags(t):
    words = t.split(" ")
    clean_tweet = []
    for w in words:
        if len(w)>0 and w[0]!= "#":
            clean_tweet.append(w)
    
    return " ".join(clean_tweet)

df["full_text"] = df["full_text"].apply(remove_hashtags)
df.head()

Unnamed: 0,created_at,user,followers_count,location,retweet_count,retweeted,userRT,full_text
0,2021-06-28 01:34:38,Retweet21454943,115.0,,14.0,0.0,clementevazce,RT El mejor jugador de Ecuador en los últimos ...
1,2021-06-28 00:55:51,Jeffsh_ec,28.0,,1.0,0.0,,"- Hernan Galindez: ""Perdón por este mensaje pe..."
2,2021-06-23 22:54:50,caminomundialok,2502.0,,0.0,0.0,,🇪🇨🆚🇵🇪🔁 [90'+1'] CAMBIO EN ECUADORFidel Martíne...
3,2021-06-28 02:11:36,MONTALVOGuille,646.0,Quito-Ecuador,5.0,0.0,diegolituma,RT Ecuador nos hace nuevamente sonreír gran em...
4,2021-06-28 00:39:06,joavbu19,207.0,guayaquil,174.0,0.0,DiegoArcos14,RT Recotizacion suprema en selección de Ecuado...


## Trabajando con fechas

In [82]:
# La columna created_at es de tipo datetime?


In [89]:
# Convertir la columna created_at en datetime
df["created_at"] = pd.to_datetime(df["created_at"])

In [94]:
# Obtener la cantidad de tweets por día en el dataset usando la columnas created_at
df[["created_at", "full_text"]].groupby("created_at").count()

Unnamed: 0_level_0,full_text
created_at,Unnamed: 1_level_1
2021-06-20 22:48:47,1
2021-06-20 22:49:22,1
2021-06-20 22:49:28,1
2021-06-20 22:49:40,1
2021-06-20 22:53:34,1
...,...
2021-06-29 22:50:20,1
2021-06-29 23:25:01,1
2021-06-29 23:33:52,1
2021-06-30 00:57:27,1


In [None]:
# descomentar la siguiente celda
dir(df["created_at"].dt)

In [102]:
df["date"] = df["created_at"].dt.date

In [105]:
# Obtener la cantidad de tweets por día en el dataset usando la columna date
df[["date", "full_text"]].groupby("date").count()

Unnamed: 0_level_0,full_text
date,Unnamed: 1_level_1
2021-06-20,27
2021-06-21,33
2021-06-22,17
2021-06-23,153
2021-06-24,114
2021-06-25,47
2021-06-26,24
2021-06-27,240
2021-06-28,290
2021-06-29,27


In [112]:
# Imprimir la hora
df["created_at"].dt.hour

0       1
1       0
2      22
3       2
4       0
       ..
995    23
996    22
997    20
998    14
999     2
Name: created_at, Length: 974, dtype: int64

In [115]:
df["time_range"] = pd.cut(df["created_at"].dt.hour,
       bins= [0, 6, 12, 18, 24], 
       labels=["madrugada", "mañana", "tarde", "noche"], 
       right=False)

In [116]:
# Guardar el nuevo dataset en un csv, y un excel
#df.to_csv(...)

Unnamed: 0,created_at,user,followers_count,location,retweet_count,retweeted,userRT,full_text,date,time_range
0,2021-06-28 01:34:38,Retweet21454943,115.0,,14.0,0.0,clementevazce,RT El mejor jugador de Ecuador en los últimos ...,2021-06-28,madrugada
1,2021-06-28 00:55:51,Jeffsh_ec,28.0,,1.0,0.0,,"- Hernan Galindez: ""Perdón por este mensaje pe...",2021-06-28,madrugada
2,2021-06-23 22:54:50,caminomundialok,2502.0,,0.0,0.0,,🇪🇨🆚🇵🇪🔁 [90'+1'] CAMBIO EN ECUADORFidel Martíne...,2021-06-23,noche
3,2021-06-28 02:11:36,MONTALVOGuille,646.0,Quito-Ecuador,5.0,0.0,diegolituma,RT Ecuador nos hace nuevamente sonreír gran em...,2021-06-28,madrugada
4,2021-06-28 00:39:06,joavbu19,207.0,guayaquil,174.0,0.0,DiegoArcos14,RT Recotizacion suprema en selección de Ecuado...,2021-06-28,madrugada
...,...,...,...,...,...,...,...,...,...,...
995,2021-06-25 23:20:42,alejandropilay,219.0,Ecuador,26.0,0.0,eluniversocom,RT Jacinto Espinoza cuestiona participación de...,2021-06-25,noche
996,2021-06-27 22:51:45,christopherfua,650.0,,172.0,0.0,pilumeneses,RT Lo veo a Hernán Galindez cantando el himno ...,2021-06-27,noche
997,2021-06-27 20:49:29,enelcamarin,57665.0,Chile,0.0,0.0,,Suplentes Ecuador:9. Leonardo Campana19. Gonza...,2021-06-27,noche
998,2021-06-26 14:00:10,ecubot593,173.0,,3.0,0.0,ECUAGOL,"RT 💣🚨 REVELA SU VERDAD 💣🚨Juan Cazares: ""No tuv...",2021-06-26,tarde
