In [None]:
import json
import logging
import os
import re
import time
from collections import Counter
from collections.abc import MutableMapping

import advertools as adv
import community
import matplotlib.pyplot as plt
import networkx as nx
import nltk
#nltk.download()
#nltk.download('stopwords')
import numpy as np
import pandas as pd
import seaborn as sns
import tweepy
from fastprogress import master_bar, progress_bar
from nltk.corpus import stopwords
from pymongo import MongoClient

#Logger
# File logging: basicConfig attaches a file handler to the root logger. It does
# nothing when the root logger already has handlers, so re-running this cell
# does not add a second file handler.
logging.basicConfig(
    filename='Text-TweetsAnalysis.log',
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    datefmt='%m/%d/%Y %I:%M:%S %p',
    level=logging.INFO,
)

# Console logging: mirror INFO+ records to the notebook output.
ch = logging.StreamHandler()
ch.setLevel(logging.INFO)
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
ch.setFormatter(formatter)

# Attach the console handler only once. Without this guard every re-run of the
# cell stacks another StreamHandler and each record is printed several times.
# The exact-type check is deliberate: FileHandler subclasses StreamHandler, so
# isinstance() would mistake the file handler for an existing console handler.
_root_logger = logging.getLogger()
if not any(type(handler) is logging.StreamHandler for handler in _root_logger.handlers):
    _root_logger.addHandler(ch)

In [None]:
##############
# PARAMETERS #
##############

# Credentials come from the environment so they never have to be written into
# the notebook. The original placeholder values remain as fallbacks, so the
# behaviour is unchanged when the variables are not set.
client = MongoClient(
    username=os.environ.get('MONGO_USERNAME', 'XXX'),
    password=os.environ.get('MONGO_PASSWORD', 'XXX'),
)

# Database and collection that hold the tweets analysed below.
DatabaseName = "Streaming"
CollectionName = "Campanya-Interactions"

db = client[DatabaseName]

In [None]:
###############
#  STOPWORDS  #
###############

# Per-language stopword collections: Spanish and English come from NLTK,
# Catalan from advertools.
stopwords_spanish = nltk.corpus.stopwords.words('spanish')
stopwords_english = nltk.corpus.stopwords.words('english')
stopwords_catalan = adv.stopwords['catalan']

# Extra tokens to discard on top of the language lists.
custom_stopwords = ['none', '', 'q', 'l', '\n', 'rt']

# Single combined list used by the token filters further down.
# NOTE: this rebinds the name `stopwords` that the import cell took from
# nltk.corpus; the lookups above go through `nltk.corpus.stopwords`, so they
# keep working when the cell is re-run.
stopwords = [
    *stopwords_spanish,
    *stopwords_english,
    *stopwords_catalan,
    *custom_stopwords,
]

In [None]:
#########################
# TWEETS TEXT ANALYSIS  #
#########################

# Emoji matcher, extended so that newer emoji are also covered.
# Only emoji-bearing Unicode ranges are listed. A single catch-all range from
# U+24C2 to U+1F251 would also swallow CJK ideographs, Hangul, Arabic
# presentation forms and mathematical alphanumerics, i.e. real text.
emoji_pattern = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs (incl. skin-tone modifiers)
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F900-\U0001F9FF"  # supplemental symbols & pictographs
    "\U0001FA70-\U0001FAFF"  # symbols & pictographs extended-A
    "\U0001F000-\U0001F251"  # mahjong/dominoes/cards, enclosed alphanumerics (flags), enclosed ideographs
    "\U00002600-\U000027BF"  # miscellaneous symbols and dingbats
    "\U000025A0-\U000025FF"  # geometric shapes
    "\U00002B00-\U00002BFF"  # miscellaneous symbols and arrows
    "\U00002934\U00002935"  # curved arrows
    "\U000024C2"  # circled M
    "\U00003030\U0000303D\U00003297\U00003299"  # emoji that live in CJK symbol blocks
    "\u200d"  # zero-width joiner (glues emoji sequences together)
    "\ufe0f"  # variation selector-16 (emoji presentation)
    "]+",
    flags=re.UNICODE,
)

def clean_tweet(twit_text):
    """Strip hashtag/mention markers, URLs and emoji from a tweet's text.

    Keyword arguments:
    twit_text -- text of a single tweet (str)

    Returns the cleaned text. The '#' and '@' characters are dropped but the
    word they prefixed is kept. A URL is removed together with the whitespace
    character in front of it.
    """
    without_markers = re.sub(r'[#@]', '', twit_text)
    # `(?:^|\s)` also catches a URL that opens the tweet; requiring a preceding
    # whitespace character alone would leave such URLs in the text.
    without_urls = re.sub(r'(?:^|\s)http\S+', '', without_markers)
    return emoji_pattern.sub(r'', without_urls)


def load_tweets(collection):
    """Fetch `_id` and `text` of every document that has a `text` field.

    Keyword arguments:
    collection -- object exposing `aggregate(pipeline, allowDiskUse=...)`
                  (presumably the MongoDB tweets collection; verify against caller)

    Returns a list with all documents produced by the aggregation. Progress
    messages are printed to stdout along the way.
    """
    only_with_text = {'$match': {'text': {'$exists': True}}}
    keep_id_and_text = {'$project': {'_id': True, 'text': True}}

    print("Query", end=" ")
    cursor = collection.aggregate([only_with_text, keep_id_and_text], allowDiskUse=True)
    print("OK; List", end=" ")
    # Materialise the result so it can be counted and iterated more than once.
    documents = list(cursor)
    print("OK; Total tweets:", len(documents))
    return documents


tweets_list = load_tweets(db[CollectionName])

# Build the corpus with a single join instead of repeated `+=`, which may copy
# the growing string on every iteration. Each cleaned tweet keeps its leading
# space separator, so the resulting text is the same as before.
text_total = "".join(
    " " + clean_tweet(str(tweet['text'])) for tweet in progress_bar(tweets_list)
)

text_length = len(text_total)
print(text_length)

print("Divide string")
# Only the first quarter of the corpus is tokenised below.
sub_string1 = text_total[0:text_length // 4]
print(len(sub_string1))

print("SPLIT")
# Split on spaces/apostrophes, then trim surrounding punctuation from each token.
tokens = [w.strip('“”.,;:-():!?-‘’|/•&+* ') for w in re.split(r"[ ']+", sub_string1.lower())]

# Create counter
counts = Counter(tokens)
print(len(counts))
# Printed explicitly: a bare expression in the middle of a cell is evaluated
# and thrown away, only the cell's last expression is displayed.
print(counts.most_common(150))

# TEMP
# A set gives constant-time membership tests; scanning the stopword list once
# per token is needlessly slow on a large corpus.
stopword_lookup = set(stopwords)
important_tokens = [token for token in tokens if token not in stopword_lookup]
print(len(important_tokens))

# Create counter
counts = Counter(important_tokens)
print(len(counts))
counts.most_common(150)

In [None]:
#####################
# REMOVE STOPWORDS  #
#####################

# Rebuild the combined stopword list so this cell can run on its own.
stopwords_spanish = nltk.corpus.stopwords.words('spanish')
stopwords_english = nltk.corpus.stopwords.words('english')
stopwords_catalan = adv.stopwords['catalan']

custom_stopwords = ['none', '', 'q', 'l', '\n', 'rt']

stopwords = stopwords_spanish + stopwords_english + list(stopwords_catalan)
stopwords.extend(custom_stopwords)

# A set gives constant-time membership tests; scanning the stopword list once
# per token is needlessly slow on a large corpus. `stopwords` itself stays a list.
stopword_lookup = set(stopwords)
important_tokens = [token for token in tokens if token not in stopword_lookup]
print(len(important_tokens))

# Create counter
counts = Counter(important_tokens)
print(len(counts))
counts.most_common(150)

In [None]:
###########################################################
# TWEET TEXT ANALYSIS BY COMMUNITY IMPROVED (apostrophe)  #
###########################################################

# MADRID: community labels matched against the `ACTIVE_community` field.
COMMUNITIES_LIST = [
    "VOX",
    "PODEMOS",
    "PSOE",
    "PP",
    "MAS_MAD",
    "CS",
]

# ANDALUSIA: alternative label set; uncomment to replace the list above.
# COMMUNITIES_LIST = ["VOX", "POR_AND", "PSOE", "PP", "ADELANTE_AND", "CS"]

def load_tweets_by_community(collection, community):
    """Fetch `_id` and `text` of the documents assigned to one community.

    Keyword arguments:
    collection -- object exposing `aggregate(pipeline, allowDiskUse=...)`
                  (presumably the MongoDB tweets collection; verify against caller)
    community -- value the `ACTIVE_community` field must equal

    Returns a list with all documents produced by the aggregation. Progress
    messages are printed to stdout along the way.

    NOTE(review): unlike load_tweets, no `$exists` filter is applied to `text`,
    so returned documents may lack that key. Confirm against the data.
    """
    in_community = {'$match': {'ACTIVE_community': community}}
    keep_id_and_text = {'$project': {'_id': True, 'text': True}}

    print("Query", end=" ")
    cursor = collection.aggregate([in_community, keep_id_and_text], allowDiskUse=True)
    print("OK; List", end=" ")
    # Materialise the result so it can be counted and iterated more than once.
    documents = list(cursor)
    print("OK; Total tweets:", len(documents))
    return documents

# A set gives constant-time membership tests; built once, outside the loop.
stopword_lookup = set(stopwords)

# The loop variable is deliberately not called `community`: that name is the
# module imported at the top of the notebook, and rebinding it here would leave
# a plain string in its place for every later cell.
for community_name in COMMUNITIES_LIST:
    print("******************************************************")
    print("                      " + community_name)
    print("******************************************************")
    tweets_by_community_list = load_tweets_by_community(db[CollectionName], community_name)

    # One join instead of repeated `+=`; each cleaned tweet keeps its leading
    # space separator, so the text is the same as before.
    text_total = "".join(
        " " + clean_tweet(str(tweet['text'])) for tweet in tweets_by_community_list
    )

    # Split on spaces/apostrophes, then trim surrounding punctuation from each token.
    tokens = [w.strip('“”.,;:-():!?-‘’|/•&+* ') for w in re.split(r"[ ']+", text_total.lower())]
    important_tokens = [token for token in tokens if token not in stopword_lookup]

    counts = Counter(important_tokens)
    print("Total Tokens: " + str(len(counts)))
    print(counts.most_common(50))