In [None]:
import json
import logging
import os
import re
import time
from collections import Counter
from collections.abc import MutableMapping

import community
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import seaborn as sns
import tweepy
from fastprogress import master_bar, progress_bar
from pymongo import MongoClient

import nltk
#nltk.download()
#nltk.download('stopwords')
from nltk.corpus import stopwords
import advertools as adv

# Logger: records of level INFO and above go to the log file and are echoed
# on a stream handler (stderr by default).
logging.basicConfig(filename='Text-SentimentAnalysis.log', format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p', level=logging.INFO)
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')

root_logger = logging.getLogger()
# Re-running this cell must not stack a new stream handler on every run,
# otherwise each record is echoed once per run. The exact type check keeps the
# FileHandler installed by basicConfig (a StreamHandler subclass) out of the
# search.
ch = next((h for h in root_logger.handlers if type(h) is logging.StreamHandler), None)
if ch is None:
    ch = logging.StreamHandler()
    root_logger.addHandler(ch)
ch.setLevel(logging.INFO)
ch.setFormatter(formatter)

In [None]:
##############
# PARAMETERS #
##############

# Credentials are read from the environment so they are never stored in the
# notebook. When a variable is unset, None is passed for that option.
client = MongoClient(
    username=os.environ.get('MONGO_USERNAME'),
    password=os.environ.get('MONGO_PASSWORD'),
)

DatabaseName = "Streaming"
TweetCollectionName = "Campanya-Sentiment"

# Handles used by every cell below.
db = client[DatabaseName]
tweetCollection = db[TweetCollectionName]

In [None]:
######################
# ANALYZE TWEET LANG #
######################

def load_tweets(collection):
    """Count the tweets of a collection per ``lang`` value.

    Keyword arguments:
    collection -- MongoDB Tweets' Collection

    Returns the aggregation result as a list of documents shaped like
    ``{'_id': {'lang': ...}, 'count': ...}``.
    """
    # Keep only the grouped field, then count the documents per value.
    project_stage = {'$project': {'lang': True}}
    group_stage = {
        '$group': {
            '_id': {'lang': '$lang'},
            'count': {'$sum': 1},
        }
    }

    print("Query", end=" ")
    cursor = collection.aggregate([project_stage, group_stage], allowDiskUse=True)
    print("OK; List", end=" ")
    # Materialise the cursor so its size can be reported.
    grouped = list(cursor)
    print("OK; Total combinations:", len(grouped))
    return grouped

# Run the aggregation and flatten the compound ``_id`` into regular columns.
tweets = load_tweets(tweetCollection)
tweets_df = pd.DataFrame(tweets)
tweets_df2 = pd.concat([pd.json_normalize(tweets_df['_id']), tweets_df['count']], axis=1)

totalTweets = tweets_df2['count'].sum()
print("TOTAL TWEETS: " + str(totalTweets))

print(tweets_df2.sort_values('count', ascending=False).to_string(index=False))

tweets_df3 = tweets_df2.groupby(['lang']).agg({'count': 'sum'})
# Share of every language over all grouped tweets. Grouping again on the only
# index level would isolate each language in its own group and report 100%
# for all of them.
tweets_df4 = 100 * tweets_df3 / float(tweets_df3['count'].sum())

print(tweets_df4.sort_values('lang'))

In [None]:
#########################################
# ANALYZE TWEET LANG + ACTIVE COMMUNITY #
#########################################

def load_tweets(collection):
    """Count the tweets per (``ACTIVE_community``, ``lang``) combination.

    Keyword arguments:
    collection -- MongoDB Tweets' Collection

    Returns the aggregation result as a list of documents whose ``_id`` holds
    the two grouped values and whose ``count`` holds the number of tweets.
    """
    grouped_fields = {'ACTIVE_community': '$ACTIVE_community', 'lang': '$lang'}
    stages = [
        # Keep only the fields that take part in the grouping.
        {'$project': {'ACTIVE_community': True, 'lang': True}},
        {'$group': {'_id': grouped_fields, 'count': {'$sum': 1}}},
    ]

    print("Query", end=" ")
    cursor = collection.aggregate(stages, allowDiskUse=True)
    print("OK; List", end=" ")
    combinations = list(cursor)
    print("OK; Total combinations:", len(combinations))
    return combinations

# Run the aggregation and flatten the compound ``_id`` into regular columns.
tweets = load_tweets(tweetCollection)
tweets_df = pd.DataFrame(tweets)
tweets_df2 = pd.concat([pd.json_normalize(tweets_df['_id']), tweets_df['count']], axis=1)

totalTweets = tweets_df2['count'].sum()
print("TOTAL TWEETS: " + str(totalTweets))

print(tweets_df2.sort_values('count', ascending=False).to_string(index=False))

tweets_df3 = tweets_df2.groupby(['ACTIVE_community', 'lang']).agg({'count': 'sum'})
# Percentage of each language inside its community. transform('sum') returns
# the per-community total aligned on the original index, so the result keeps
# exactly the (ACTIVE_community, lang) index and needs no float(Series) cast.
tweets_df4 = 100 * tweets_df3 / tweets_df3.groupby(level=0).transform('sum')

print(tweets_df4.sort_values('ACTIVE_community').to_string())

In [None]:
###########################
# ADD LANG_POLYGLOT FIELD #
###########################

from polyglot.text import Text, Word

def load_tweets(tweet_collection):
    """Fetch ``_id`` and ``text`` of the tweets that have no ``LANG_POLYGLOT`` field.

    Keyword arguments:
    tweet_collection -- MongoDB Tweets' Collection

    Returns the matching documents as a list.
    """
    # Only documents that were not tagged yet.
    match_stage = {'$match': {'LANG_POLYGLOT': {'$exists': False}}}
    project_stage = {'$project': {'_id': True, 'text': True}}

    print("Query", end=" ")
    cursor = tweet_collection.aggregate([match_stage, project_stage], allowDiskUse=True)
    print("OK; List", end=" ")
    pending = list(cursor)
    print("OK; Total users:", len(pending))
    return pending

# Tag every tweet that has no LANG_POLYGLOT yet with the language code
# detected by polyglot.
tweets = load_tweets(tweetCollection)

LANG_POLYGLOT = ""

for result in progress_bar(tweets):

    # Detection runs inside its own try block: a single tweet that cannot be
    # processed (for instance a document without 'text') must not abort the
    # whole run. The tweet is left untagged, so a later run picks it up again.
    try:
        text = Text(result['text'])
        LANG_POLYGLOT = text.language.code
    except Exception as e:
        logging.error(e)
        logging.error("Language detection failed for tweet %s; LANG_POLYGLOT not set", result.get('_id'))
        continue

    try:
        tweetCollection.update_one(
                                    {'_id': result['_id']},
                                    {'$set':
                                        {
                                            'LANG_POLYGLOT': LANG_POLYGLOT
                                        }
                                    },
                                    upsert=False,
                                  )
    except Exception as e:
        logging.error(e)
        logging.error("Fatal exception inserting users in MongoDB")

In [None]:
################################
# TWEETS COMPARING LANG VALUES #
################################

def load_tweets(collection):
    """Count the tweets per (``lang``, ``LANG_POLYGLOT``) combination.

    Keyword arguments:
    collection -- MongoDB Tweets' Collection

    Returns the aggregation result as a list of documents whose ``_id`` holds
    both language values and whose ``count`` holds the number of tweets.
    """
    group_key = {'lang': '$lang', 'LANG_POLYGLOT': '$LANG_POLYGLOT'}
    stages = [
        {'$project': {'lang': True, 'LANG_POLYGLOT': True}},
        {'$group': {'_id': group_key, 'count': {'$sum': 1}}},
    ]

    print("Query", end=" ")
    cursor = collection.aggregate(stages, allowDiskUse=True)
    print("OK; List", end=" ")
    combinations = list(cursor)
    print("OK; Total combinations:", len(combinations))
    return combinations

# Aggregate, then spread the compound ``_id`` over one column per grouped field.
tweets = load_tweets(tweetCollection)
tweets_df = pd.DataFrame(tweets)
id_columns = pd.json_normalize(tweets_df['_id'])
tweets_df2 = pd.concat([id_columns, tweets_df['count']], axis=1)

# Grand total over every combination.
totalTweets = tweets_df2['count'].sum()
print("TOTAL TWEETS: " + str(totalTweets))

# Most frequent combinations first.
ranked = tweets_df2.sort_values(by='count', ascending=False)
print(ranked.to_string(index=False))

In [None]:
#########################################################
# DELETE TWEETS NOT IN ESP AND WITH DIFF LANG DETECTION #
#########################################################

def load_tweets(collection):
    """Fetch ``_id``, ``lang`` and ``LANG_POLYGLOT`` of every tweet.

    Keyword arguments:
    collection -- MongoDB Tweets' Collection

    Returns the projected documents as a list.
    """
    wanted_fields = {'_id': True, 'lang': True, 'LANG_POLYGLOT': True}

    print("Query", end=" ")
    cursor = collection.aggregate([{'$project': wanted_fields}], allowDiskUse=True)
    print("OK; List", end=" ")
    documents = list(cursor)
    print("OK; Total combinations:", len(documents))
    return documents

# Keep only the tweets that both Twitter ('lang') and polyglot
# ('LANG_POLYGLOT') classify as Spanish; every other tweet is deleted.
tweets = load_tweets(tweetCollection)

for result in progress_bar(tweets):
    # A missing 'lang' is treated like any non-Spanish value.
    if result.get('lang') == 'es':
        if 'LANG_POLYGLOT' not in result:
            # Deleting is irreversible: a tweet that was never run through the
            # language detection is kept and reported instead of raising a
            # KeyError half-way through the loop.
            logging.warning("Tweet %s has no LANG_POLYGLOT field; skipped", result['_id'])
            continue
        if result['LANG_POLYGLOT'] == 'es':
            # Both detectors agree on Spanish: keep the tweet.
            continue

    # Reached when either language value is not 'es'. Once both are 'es' they
    # cannot differ, so no separate "values differ" check is needed.
    try:
        tweetCollection.delete_one({'_id': result['_id']})
    except Exception as e:
        logging.error(e)
        logging.error("Fatal exception deleting document in MongoDB")

In [None]:
######################
# ADD POLARITY FIELD #
######################

from polyglot.text import cached_property, Text
class TextOverride(Text):
    """``Text`` whose polarity is averaged over the non-neutral words only."""

    @cached_property
    def polarity(self):
        """Return the polarity score as a float within the range [-1.0, 1.0]

        Words whose polarity is 0 are left out of the average; 0.0 is
        returned when no word remains.
        """
        polar_scores = [word.polarity for word in self.words if word.polarity != 0]
        if not polar_scores:
            return 0.0
        return sum(polar_scores) / float(len(polar_scores))
    
def load_tweets(tweet_collection):
    """Fetch ``_id`` and ``text`` of the tweets that have no ``POLARITY`` field.

    Keyword arguments:
    tweet_collection -- MongoDB Tweets' Collection

    Returns the matching documents as a list.
    """
    # Only documents that were not scored yet.
    match_stage = {'$match': {'POLARITY': {'$exists': False}}}
    project_stage = {'$project': {'_id': True, 'text': True}}

    print("Query", end=" ")
    cursor = tweet_collection.aggregate([match_stage, project_stage], allowDiskUse=True)
    print("OK; List", end=" ")
    pending = list(cursor)
    print("OK; Total tweets:", len(pending))
    return pending

# Score every tweet that has no POLARITY yet and store the score together
# with its NEUTRAL / POSITIVE / NEGATIVE tag.
tweets = load_tweets(tweetCollection)

POLARITY = 0
POLARITY_TAG = ""

for result in progress_bar(tweets):

    text = Text(result['text'])

    # Start every tweet from a neutral score. Without this reset, a tweet with
    # no polar word would keep the score (and therefore the tag) computed for
    # the previous tweet.
    POLARITY = 0.0
    # Average over the non-neutral words only.
    scores = [w.polarity for w in text.words if w.polarity != 0]
    if len(scores) != 0:
        POLARITY = sum(scores) / float(len(scores))

    if POLARITY == 0:
        POLARITY_TAG = "NEUTRAL"
    elif POLARITY > 0:
        POLARITY_TAG = "POSITIVE"
    else:
        POLARITY_TAG = "NEGATIVE"

    try:
        tweetCollection.update_one(
                                    {'_id': result['_id']},
                                    {'$set':
                                        {
                                            'POLARITY': POLARITY,
                                            'POLARITY_TAG': POLARITY_TAG
                                        }
                                    },
                                    upsert=False,
                                  )
    except Exception as e:
        logging.error(e)
        logging.error("Fatal exception inserting users in MongoDB")

In [None]:
########################
# ANALYZE POLARITY_TAG #
########################

def load_tweets(collection):
    """Count the tweets of a collection per ``POLARITY_TAG`` value.

    Keyword arguments:
    collection -- MongoDB Tweets' Collection

    Returns the aggregation result as a list of documents shaped like
    ``{'_id': {'POLARITY_TAG': ...}, 'count': ...}``.
    """
    project_stage = {'$project': {'POLARITY_TAG': True}}
    group_stage = {
        '$group': {
            '_id': {'POLARITY_TAG': '$POLARITY_TAG'},
            'count': {'$sum': 1},
        }
    }

    print("Query", end=" ")
    cursor = collection.aggregate([project_stage, group_stage], allowDiskUse=True)
    print("OK; List", end=" ")
    grouped = list(cursor)
    print("OK; Total combinations:", len(grouped))
    return grouped

# Run the aggregation and flatten the compound ``_id`` into regular columns.
tweets = load_tweets(tweetCollection)
tweets_df = pd.DataFrame(tweets)
tweets_df2 = pd.concat([pd.json_normalize(tweets_df['_id']), tweets_df['count']], axis=1)

totalTweets = tweets_df2['count'].sum()
print("TOTAL TWEETS: " + str(totalTweets))

print(tweets_df2.sort_values('count', ascending=False).to_string(index=False))

tweets_df3 = tweets_df2.groupby(['POLARITY_TAG']).agg({'count': 'sum'})
# Share of every tag over all grouped tweets. Grouping again on the only index
# level would isolate each tag in its own group and report 100% for all of
# them.
tweets_df4 = 100 * tweets_df3 / float(tweets_df3['count'].sum())

print(tweets_df4.sort_values('POLARITY_TAG'))

In [None]:
###################################
# ANALYZE POLARITY_TAG + POLARITY #
###################################

def load_tweets(collection):
    """Count the tweets per (``POLARITY_TAG``, ``POLARITY``) combination.

    Keyword arguments:
    collection -- MongoDB Tweets' Collection

    Returns the aggregation result as a list of documents whose ``_id`` holds
    the tag and the score and whose ``count`` holds the number of tweets.
    """
    group_key = {'POLARITY_TAG': '$POLARITY_TAG', 'POLARITY': '$POLARITY'}
    stages = [
        {'$project': {'POLARITY_TAG': True, 'POLARITY': True}},
        {'$group': {'_id': group_key, 'count': {'$sum': 1}}},
    ]

    print("Query", end=" ")
    cursor = collection.aggregate(stages, allowDiskUse=True)
    print("OK; List", end=" ")
    combinations = list(cursor)
    print("OK; Total combinations:", len(combinations))
    return combinations

# Run the aggregation and flatten the compound ``_id`` into regular columns.
tweets = load_tweets(tweetCollection)
tweets_df = pd.DataFrame(tweets)
tweets_df2 = pd.concat([pd.json_normalize(tweets_df['_id']), tweets_df['count']], axis=1)

totalTweets = tweets_df2['count'].sum()
print("TOTAL TWEETS: " + str(totalTweets))

print(tweets_df2.sort_values('count', ascending=False).to_string(index=False))

tweets_df3 = tweets_df2.groupby(['POLARITY_TAG', 'POLARITY']).agg({'count': 'sum'})
# Percentage of each score inside its tag. transform('sum') returns the
# per-tag total aligned on the original index, so the result keeps exactly the
# (POLARITY_TAG, POLARITY) index and needs no float(Series) cast.
tweets_df4 = 100 * tweets_df3 / tweets_df3.groupby(level=0).transform('sum')

print(tweets_df4.sort_values('POLARITY_TAG'))

In [None]:
#####################################################
# ANALYZE ACTIVE + PASSIVE COMMUNITY + POLARITY_TAG #
#####################################################

def load_tweets(collection):
    """Count the tweets per (``ACTIVE_community``, ``PASSIVE_community``, ``POLARITY_TAG``).

    Keyword arguments:
    collection -- MongoDB Tweets' Collection

    Returns the aggregation result as a list of documents whose ``_id`` holds
    the three grouped values and whose ``count`` holds the number of tweets.
    """
    group_key = {
        'ACTIVE_community': '$ACTIVE_community',
        'PASSIVE_community': '$PASSIVE_community',
        'POLARITY_TAG': '$POLARITY_TAG',
    }
    stages = [
        # Keep only the fields that take part in the grouping.
        {'$project': {'ACTIVE_community': True, 'PASSIVE_community': True, 'POLARITY_TAG': True}},
        {'$group': {'_id': group_key, 'count': {'$sum': 1}}},
    ]

    print("Query", end=" ")
    cursor = collection.aggregate(stages, allowDiskUse=True)
    print("OK; List", end=" ")
    combinations = list(cursor)
    print("OK; Total combinations:", len(combinations))
    return combinations

# Aggregate, then spread the compound ``_id`` over one column per grouped field.
tweets = load_tweets(tweetCollection)
tweets_df = pd.DataFrame(tweets)
id_columns = pd.json_normalize(tweets_df['_id'])
tweets_df2 = pd.concat([id_columns, tweets_df['count']], axis=1)

# Grand total over every combination.
totalTweets = tweets_df2['count'].sum()
print("TOTAL TWEETS: " + str(totalTweets))

# Most frequent combinations first.
ranked = tweets_df2.sort_values(by='count', ascending=False)
print(ranked.to_string(index=False))

In [None]:
###########################################
# ANALYZE ACTIVE COMMUNITY + POLARITY_TAG #
###########################################

def load_tweets(collection):
    """Count the tweets per (``ACTIVE_community``, ``POLARITY_TAG``) combination.

    Keyword arguments:
    collection -- MongoDB Tweets' Collection

    Returns the aggregation result as a list of documents whose ``_id`` holds
    the two grouped values and whose ``count`` holds the number of tweets.
    """
    group_key = {'ACTIVE_community': '$ACTIVE_community', 'POLARITY_TAG': '$POLARITY_TAG'}
    stages = [
        {'$project': {'ACTIVE_community': True, 'POLARITY_TAG': True}},
        {'$group': {'_id': group_key, 'count': {'$sum': 1}}},
    ]

    print("Query", end=" ")
    cursor = collection.aggregate(stages, allowDiskUse=True)
    print("OK; List", end=" ")
    combinations = list(cursor)
    print("OK; Total combinations:", len(combinations))
    return combinations

# Run the aggregation and flatten the compound ``_id`` into regular columns.
tweets = load_tweets(tweetCollection)
tweets_df = pd.DataFrame(tweets)
tweets_df2 = pd.concat([pd.json_normalize(tweets_df['_id']), tweets_df['count']], axis=1)

totalTweets = tweets_df2['count'].sum()
print("TOTAL TWEETS: " + str(totalTweets))

print(tweets_df2.sort_values('count', ascending=False).to_string(index=False))

tweets_df3 = tweets_df2.groupby(['ACTIVE_community', 'POLARITY_TAG']).agg({'count': 'sum'})
# Percentage of each tag inside its community. transform('sum') returns the
# per-community total aligned on the original index, so the result keeps
# exactly the (ACTIVE_community, POLARITY_TAG) index and needs no
# float(Series) cast.
tweets_df4 = 100 * tweets_df3 / tweets_df3.groupby(level=0).transform('sum')

print(tweets_df4.sort_values('ACTIVE_community'))