In [1]:
import pandas as pd
import tweepy
from tweepy import OAuthHandler
import pandas as pd
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import re

### Connecting with Twitter API

In [2]:
%run ./keys.ipynb
auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
auth.set_access_token(ACCESS_TOKEN, ACCESS_TOKEN_SECRET)
api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)

### Political Sentiment:
- Dataframes of tweets of each party. 
- Dataframes of mentions of users for each party. 

- Machine learning for evalutating the sentiment of tweets and mentions.

## General Functions

In [3]:
def extracting_tweets(account):
    """
    This function extract a data frame with the tweets of an account
    and filter the data frame with the columns selected.
    """
    # extracting the tweets
    tweets = [tweet for tweet in tweepy.Cursor(api.user_timeline,
                                           screen_name=account,
                                           tweet_mode='extended',
                                           exclude_replies=True,
                                              include_rts=False).items(2000)]
    # creating the data frame
    tweets_json = [tweet._json for tweet in tweets]
    df_tweets = pd.json_normalize(tweets_json)
    
    # selecting useful columns
    columns_selected = ['user.name', 'created_at', 'id', 'full_text', 'display_text_range', 
                    'source', 'retweet_count', 'favorite_count', 'user.followers_count', 
                    'user.friends_count', 'user.statuses_count']
    
    df_tweets_filtered = df_tweets[columns_selected]
    
    # cleaning date time
    df_tweets_filtered['created_at'] = pd.to_datetime(df_tweets_filtered['created_at'])
    
    # cleaning source of the tweet
    list_sources = list(df_tweets_filtered['source'])
    df_tweets_filtered['source'] = [re.findall(r'\>(.*?)\<', s) for s in list_sources]
    
    return df_tweets_filtered

In [4]:
# cleaning source column 

def cleaning_source(column):
    
    list_sources = list(column)
    
    sources_clean = [re.findall(r'\>(.*?)\<', s) for s in list_sources]
        
    return sources_clean

In [5]:
# date format. 

def transforming_format_dates(column):
    return pd.to_datetime(column)

In [6]:
def adding_party_column(df, string):
    df['partido'] = string
    return df

In [7]:
def adding_type_post_column(df, string):
    """
    string = publicación o mención
    """
    df['tipo de post'] = string
    return df

In [8]:
def extract_hashtags(column):
    """
    Input: column with all tweets. ['full_text'] in this case.
    Output: a list with used hashtags.
    """
    list_hashtags = list(column)
    hashtags = [re.findall(r"#(\w+)", tweet) for tweet in list_hashtags]
    
    return hashtags

## 1. PARTIDO POPULAR

In [9]:
df_pp = extracting_tweets('populares')

adding_party_column(df_pp, 'Partido Popular')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_tweets_filtered['created_at'] = pd.to_datetime(df_tweets_filtered['created_at'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_tweets_filtered['source'] = [re.findall(r'\>(.*?)\<', s) for s in list_sources]


Unnamed: 0,user.name,created_at,id,full_text,display_text_range,source,retweet_count,favorite_count,user.followers_count,user.friends_count,user.statuses_count,partido
0,Partido Popular,2021-04-24 09:46:56+00:00,1385893007694700553,➡️ A partir de las 12:00h. @pablocasado_ e @Id...,"[0, 227]",[Twitter Web App],18,38,821483,5291,102573,Partido Popular
1,Partido Popular,2021-04-24 09:16:38+00:00,1385885381996986370,De los más de 50M de € que Sánchez otorgó a un...,"[0, 268]",[Twitter Web App],57,102,821483,5291,102573,Partido Popular
2,Partido Popular,2021-04-24 07:56:46+00:00,1385865284309987329,El BOE no es un instrumento que Sánchez pueda ...,"[0, 279]",[Twitter for iPhone],101,202,821483,5291,102573,Partido Popular
3,Partido Popular,2021-04-24 06:01:06+00:00,1385836175345262592,"☀ Buenos días y #FelizSábado a todos, aquí os...","[0, 119]",[Twitter for iPhone],11,28,821483,5291,102573,Partido Popular
4,Partido Popular,2021-04-23 20:10:16+00:00,1385687486865760256,“Somos libres porque vivimos en Madrid”.\n\n👉 ...,"[0, 55]",[Twitter for iPhone],228,737,821483,5291,102573,Partido Popular
...,...,...,...,...,...,...,...,...,...,...,...,...
1870,Partido Popular,2021-01-12 11:55:31+00:00,1348961806270783488,"""Ante las catástrofes tenemos que estar todos ...","[0, 263]",[Twitter Media Studio],101,186,821482,5291,102573,Partido Popular
1871,Partido Popular,2021-01-12 11:42:10+00:00,1348958446964629510,"""Todos aquellos municipios que decidan declara...","[0, 242]",[Twitter Media Studio - LiveCut],43,102,821482,5291,102573,Partido Popular
1872,Partido Popular,2021-01-12 11:37:29+00:00,1348957267962507264,"""Pedimos a los ciudadanos su colaboración, que...","[0, 232]",[Twitter Media Studio - LiveCut],66,144,821482,5291,102573,Partido Popular
1873,Partido Popular,2021-01-12 10:55:48+00:00,1348946776447979520,💡El precio de la luz sigue batiendo récords.\n...,"[0, 102]",[Twitter Web App],964,1274,821482,5291,102573,Partido Popular


In [10]:
adding_type_post_column(df_pp, 'publicación')

Unnamed: 0,user.name,created_at,id,full_text,display_text_range,source,retweet_count,favorite_count,user.followers_count,user.friends_count,user.statuses_count,partido,tipo de post
0,Partido Popular,2021-04-24 09:46:56+00:00,1385893007694700553,➡️ A partir de las 12:00h. @pablocasado_ e @Id...,"[0, 227]",[Twitter Web App],18,38,821483,5291,102573,Partido Popular,publicación
1,Partido Popular,2021-04-24 09:16:38+00:00,1385885381996986370,De los más de 50M de € que Sánchez otorgó a un...,"[0, 268]",[Twitter Web App],57,102,821483,5291,102573,Partido Popular,publicación
2,Partido Popular,2021-04-24 07:56:46+00:00,1385865284309987329,El BOE no es un instrumento que Sánchez pueda ...,"[0, 279]",[Twitter for iPhone],101,202,821483,5291,102573,Partido Popular,publicación
3,Partido Popular,2021-04-24 06:01:06+00:00,1385836175345262592,"☀ Buenos días y #FelizSábado a todos, aquí os...","[0, 119]",[Twitter for iPhone],11,28,821483,5291,102573,Partido Popular,publicación
4,Partido Popular,2021-04-23 20:10:16+00:00,1385687486865760256,“Somos libres porque vivimos en Madrid”.\n\n👉 ...,"[0, 55]",[Twitter for iPhone],228,737,821483,5291,102573,Partido Popular,publicación
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1870,Partido Popular,2021-01-12 11:55:31+00:00,1348961806270783488,"""Ante las catástrofes tenemos que estar todos ...","[0, 263]",[Twitter Media Studio],101,186,821482,5291,102573,Partido Popular,publicación
1871,Partido Popular,2021-01-12 11:42:10+00:00,1348958446964629510,"""Todos aquellos municipios que decidan declara...","[0, 242]",[Twitter Media Studio - LiveCut],43,102,821482,5291,102573,Partido Popular,publicación
1872,Partido Popular,2021-01-12 11:37:29+00:00,1348957267962507264,"""Pedimos a los ciudadanos su colaboración, que...","[0, 232]",[Twitter Media Studio - LiveCut],66,144,821482,5291,102573,Partido Popular,publicación
1873,Partido Popular,2021-01-12 10:55:48+00:00,1348946776447979520,💡El precio de la luz sigue batiendo récords.\n...,"[0, 102]",[Twitter Web App],964,1274,821482,5291,102573,Partido Popular,publicación


In [11]:
df_pp['hashtags'] = extract_hashtags(df_pp['full_text'])

## 2. PSOE

In [12]:
df_psoe = extracting_tweets('psoe')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_tweets_filtered['created_at'] = pd.to_datetime(df_tweets_filtered['created_at'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_tweets_filtered['source'] = [re.findall(r'\>(.*?)\<', s) for s in list_sources]


In [13]:
adding_party_column(df_psoe, 'PSOE')
adding_type_post_column(df_psoe, 'publicación')
df_psoe['hashtags'] = extract_hashtags(df_psoe['full_text'])

## 3. PODEMOS

In [14]:
df_podemos = extracting_tweets('PODEMOS')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_tweets_filtered['created_at'] = pd.to_datetime(df_tweets_filtered['created_at'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_tweets_filtered['source'] = [re.findall(r'\>(.*?)\<', s) for s in list_sources]


In [15]:
adding_party_column(df_podemos, 'PODEMOS')
adding_type_post_column(df_podemos, 'publicación')
df_podemos['hashtags'] = extract_hashtags(df_podemos['full_text'])

## 4. CUIDADANOS

In [16]:
df_ciudadanos = extracting_tweets('ciudadanoscs')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_tweets_filtered['created_at'] = pd.to_datetime(df_tweets_filtered['created_at'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_tweets_filtered['source'] = [re.findall(r'\>(.*?)\<', s) for s in list_sources]


In [17]:
adding_party_column(df_ciudadanos, 'CIUDADANOS')
adding_type_post_column(df_ciudadanos, 'publicación')
df_ciudadanos['hashtags'] = extract_hashtags(df_ciudadanos['full_text'])

## 5. VOX

In [18]:
df_vox = extracting_tweets('vox_es')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_tweets_filtered['created_at'] = pd.to_datetime(df_tweets_filtered['created_at'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_tweets_filtered['source'] = [re.findall(r'\>(.*?)\<', s) for s in list_sources]


In [19]:
adding_party_column(df_vox, 'PSOE')
adding_type_post_column(df_vox, 'publicación')
df_vox['hashtags'] = extract_hashtags(df_vox['full_text'])

## All together

In [20]:
frames = [df_pp, df_psoe, df_ciudadanos, df_podemos, df_vox ]
df_final_tweets = pd.concat(frames)

In [21]:
df_final_tweets.shape

(6177, 14)

## Exporting the final dataframe

In [22]:
df_final_tweets.to_csv('../data/processed/df_final_2000.csv')