In [None]:
import json
import logging
import os
import re
import time
from collections import Counter
from collections.abc import MutableMapping

import community
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import seaborn as sns
import tweepy
from fastprogress import master_bar, progress_bar
from pymongo import MongoClient


#Logger
# Messages go both to a log file (configured through basicConfig) and to the
# console (the extra StreamHandler below).
LOG_FORMAT = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
CONSOLE_HANDLER_NAME = 'GetTweetInteractions-console'

# basicConfig does nothing when the root logger already has handlers, so the
# file handler is only installed the first time this cell runs.
logging.basicConfig(filename='Anàlisi-GetTweetInteractions.log', format=LOG_FORMAT, datefmt='%m/%d/%Y %I:%M:%S %p', level=logging.INFO)

# Reuse the console handler from a previous run of this cell instead of adding
# a new one each time; otherwise every message is echoed once per re-run.
root_logger = logging.getLogger()
ch = next((h for h in root_logger.handlers if h.get_name() == CONSOLE_HANDLER_NAME), None)
if ch is None:
    ch = logging.StreamHandler()
    ch.set_name(CONSOLE_HANDLER_NAME)
    root_logger.addHandler(ch)
ch.setLevel(logging.INFO)
formatter = logging.Formatter(LOG_FORMAT)
ch.setFormatter(formatter)

In [None]:
##############
# PARAMETERS #
##############

# Credentials are read from the environment so they do not have to be written
# into the notebook. When MONGO_USERNAME / MONGO_PASSWORD are not set, the
# original placeholder values are used.
client = MongoClient(
    username=os.environ.get('MONGO_USERNAME', 'XXX'),
    password=os.environ.get('MONGO_PASSWORD', 'XXX'),
)

# Database and collection names used by the cells below.
DatabaseName = "Streaming"
TweetCollectionName = "Campanya-Interactions"
UserCollectionName = "Users"

db = client[DatabaseName]
tweetCollection = db[TweetCollectionName]

In [None]:
################################################
# GET ACTIVE/PASSIVE USER + DROP SIMPLE TWEETS #
################################################

def load_users(user_collection):
    """Fetch every user's ``_id`` and ``community`` from the collection.

    Keyword arguments:
    user_collection -- object exposing ``aggregate(pipeline, allowDiskUse=...)``

    Returns the aggregation result materialised as a list. Progress is
    printed on a single line while the query runs.
    """
    # Only the two fields needed downstream are projected.
    wanted_fields = {'_id': True, 'community': True}
    stages = [{'$project': wanted_fields}]

    print("Query", end=" ")
    cursor = user_collection.aggregate(stages, allowDiskUse=True)
    print("OK; List", end=" ")
    loaded = list(cursor)
    print("OK; Total users:", len(loaded))
    return loaded

tweetCollection = db[TweetCollectionName]

# Build the user -> community lookup table before opening the tweet cursor, so
# the cursor is not sitting open while the users are being loaded.
users = load_users(db[UserCollectionName])
df_users = pd.DataFrame(users)
df_users = df_users.set_index('_id')

print(df_users)

active_id = 0
passive_id = 0

# The cursor is opened with no_cursor_timeout=True, so it has to be closed
# explicitly; try/finally guarantees that even if the loop raises.
tweets = tweetCollection.find(no_cursor_timeout=True, batch_size=1000000)
try:
    for result in tweets:
        simple_tweet = False #Initially we consider it false
        active_id = result['user']['id']
        # Reset for every tweet: otherwise a quote tweet that carries no
        # 'quoted_status' would silently reuse the passive user of the
        # previously processed tweet.
        passive_id = None

        # Interaction type decides who the passive (targeted) user is:
        # retweet -> quote -> reply; anything else is a "simple" tweet.
        if 'retweeted_status' in result:
            passive_id = result['retweeted_status']['user']['id']
        elif result['is_quote_status']:
            if 'quoted_status' in result:
                passive_id = result['quoted_status']['user']['id']
        elif result['in_reply_to_status_id'] is not None:
            passive_id = result['in_reply_to_user_id']
        else:
            simple_tweet = True

        if simple_tweet:
            ### DELETE THE TWEET IF IT IS SIMPLE
            try:
                tweetCollection.delete_one({'_id': result['_id']})
            except Exception as e:
                logging.error(e)
                logging.error("Fatal exception deleting document in MongoDB")
        else:
            ### Get Active/Passive community + update the record
            # Not guarded (as in the original): a missing active user raises
            # and stops the run.
            active_community = df_users.loc[active_id, 'community']

            # A passive user that is unknown (or could not be determined at
            # all) is recorded with the "ERROR" marker instead of aborting.
            passive_community = "ERROR"
            if passive_id is None:
                logging.error("Passive Reply user not extracted. ERROR!!!")
            else:
                try:
                    passive_community = df_users.loc[passive_id, 'community']
                except Exception:
                    logging.error("Passive Reply user not extracted. ERROR!!!")

            try:
                tweetCollection.update_one(
                    {'_id': result['_id']},
                    {'$set':
                        {
                            'ACTIVE_community': active_community,
                            'PASSIVE_community': passive_community
                        }
                    },
                    upsert=False,
                )
            except Exception as e:
                logging.error(e)
                logging.error("Fatal exception inserting users in MongoDB")
finally:
    tweets.close()

In [None]:
##########################################
# EXTRACT DATA INTERACTIONS BY COMMUNITY #
##########################################

def load_tweets(collection):
    """Count tweets for each (active community, passive community) pair.

    Keyword arguments:
    collection -- MongoDB Tweets' Collection

    Returns the aggregation result as a list; each entry is one group
    produced by the ``$group`` stage. Progress is printed on a single line.
    """
    # Stage 1: keep only the two community fields.
    keep_communities = {
        '$project': {
            'ACTIVE_community': True,
            'PASSIVE_community': True,
        }
    }
    # Stage 2: one group per community pair, counting its documents.
    count_per_pair = {
        '$group': {
            '_id': {
                'ACTIVE_community': '$ACTIVE_community',
                'PASSIVE_community': '$PASSIVE_community',
            },
            'count': {'$sum': 1},
        }
    }
    stages = [keep_communities, count_per_pair]

    print("Query", end=" ")
    cursor = collection.aggregate(stages, allowDiskUse=True)
    print("OK; List", end=" ")
    combinations = list(cursor)
    print("OK; Total combinations:", len(combinations))
    return combinations

# Load the per-pair counts and flatten the nested '_id' document into the
# 'ACTIVE_community' / 'PASSIVE_community' columns next to 'count'.
tweets = load_tweets(tweetCollection)
tweets_df = pd.DataFrame(tweets)
tweets_df2 = pd.concat([pd.json_normalize(tweets_df['_id']), tweets_df['count']], axis=1)

totalTweets = tweets_df2['count'].sum()
print("TOTAL TWEETS: " + str(totalTweets))

# The 30 most frequent community pairs.
print(tweets_df2.sort_values('count', ascending=False).head(30).to_string(index=False))

tweets_df3 = tweets_df2.groupby(['ACTIVE_community', 'PASSIVE_community']).agg({'count': 'sum'})
# Share (in %) of each passive community within every active community.
# transform('sum') broadcasts the per-active total back onto the rows, so the
# index stays (ACTIVE_community, PASSIVE_community). This avoids calling
# float() on a Series and does not depend on how groupby.apply shapes its
# result index in a given pandas version.
tweets_df4 = 100 * tweets_df3 / tweets_df3.groupby(level=0).transform('sum')

print(tweets_df4.sort_values('ACTIVE_community').to_string())

In [None]:
########################
# PREPARE HEATMAP DATA #
########################

# Community labels kept for the heatmap. Exactly one of the lists below should
# be active; swap the comments to switch data set.

#MADRID
COMMUNITIES_LIST = [
    "VOX",
    "PODEMOS",
    "PSOE",
    "PP",
    "MAS_MAD",
    "CS",
]

#ANDALUSIA
# COMMUNITIES_LIST = ["VOX", "POR_AND", "PSOE", "PP", "ADELANTE_AND", "CS"]

def load_tweets(collection):
    """Aggregate the tweets into counts per active/passive community pair.

    Keyword arguments:
    collection -- MongoDB Tweets' Collection

    Returns a list with one entry per group emitted by the ``$group`` stage.
    """
    # Grouping key: the pair of community fields stored on each tweet.
    pair_key = {
        'ACTIVE_community': '$ACTIVE_community',
        'PASSIVE_community': '$PASSIVE_community',
    }
    pipeline = [
        {'$project': {'ACTIVE_community': True, 'PASSIVE_community': True}},
        {'$group': {'_id': pair_key, 'count': {'$sum': 1}}},
    ]

    print("Query", end=" ")
    grouped = collection.aggregate(pipeline, allowDiskUse=True)
    print("OK; List", end=" ")
    grouped = list(grouped)
    print("OK; Total combinations:", len(grouped))
    return grouped

# Load the per-pair counts and flatten the nested '_id' document into the
# 'ACTIVE_community' / 'PASSIVE_community' columns next to 'count'.
tweets = load_tweets(tweetCollection)
tweets_df = pd.DataFrame(tweets)
tweets_df2 = pd.concat([pd.json_normalize(tweets_df['_id']), tweets_df['count']], axis=1)

totalTweets = tweets_df2['count'].sum()
print("TOTAL TWEETS: " + str(totalTweets))

# Keep only pairs where both sides belong to the selected communities.
tweets_df3 = tweets_df2[tweets_df2['ACTIVE_community'].isin(COMMUNITIES_LIST)]
tweets_df4 = tweets_df3[tweets_df3['PASSIVE_community'].isin(COMMUNITIES_LIST)]

tweets_df5 = tweets_df4.groupby(['ACTIVE_community', 'PASSIVE_community']).agg({'count': 'sum'})
# Share (in %) of each passive community within every active community.
# transform('sum') keeps the (ACTIVE_community, PASSIVE_community) index as is,
# so the reset_index() below always yields exactly those two columns plus
# 'count'. groupby.apply could add an extra group-key level depending on the
# pandas version, and float() on a Series is avoided.
tweets_df_test = 100 * tweets_df5 / tweets_df5.groupby(level=0).transform('sum')

print(tweets_df_test.sort_values('ACTIVE_community').to_string())

tweets_df_test = tweets_df_test.reset_index()
print(tweets_df_test)

In [None]:
##########################
# PREPARE HEATMAP DATA 2 #
##########################

# Axis labels of the heatmap: the selected communities, de-duplicated, without
# NaN entries, in alphabetical order.
index_active = COMMUNITIES_LIST
index_passive = COMMUNITIES_LIST

total_index = list(set(index_active+index_passive))

total_index_clean = [x for x in total_index if str(x) != 'nan']

final_index = sorted(total_index_clean)

print(final_index)

# matrix[i][j] = percentage for active community final_index[i] and passive
# community final_index[j]. Pivoting and re-indexing on final_index leaves a
# 0 for every pair that has no row in tweets_df_test; assigning the (empty)
# selection to a single cell, as a per-cell loop would, raises instead.
matrix = (
    tweets_df_test
    .pivot(index="ACTIVE_community", columns="PASSIVE_community", values="count")
    .reindex(index=final_index, columns=final_index)
    .fillna(0)
    .to_numpy(dtype=float)
)

print(matrix)

In [None]:
################
# PLOT HEATMAP #
################

def plot_heatmap(m, x_values, y_values, title, xlabel, ylabel, save_fig=False, label_rotation=None):
    """
        Creates a heatmap image from a numpy matrix.

    The matrix is transposed before drawing, so ``m[i, j]`` is shown at x
    position ``i`` and y position ``j``: the first axis of ``m`` corresponds
    to ``x_values`` and the second one to ``y_values``. Each cell is annotated
    with its value formatted as a percentage.

    Side effect: sets the global matplotlib ``font.size`` to 24.

    :param m: 2-dimensional numpy matrix with values to plot,
              shape ``(len(x_values), len(y_values))``
    :param x_values: list of strings for xticks
    :param y_values: list of strings for yticks
    :param title: string, title of the plot
    :param xlabel: string, label of the x axis
    :param ylabel: string, label of the y axis
    :param save_fig: False / "show" / figname , do not show image / show it inline / write it to pdf (figure name)
    :param label_rotation: None / int, whether to rotate x ticks (degrees)
    :return: None
    """
    # Set the font size before any text is created, so the first call already
    # uses it (rcParams are only read when an artist is created).
    plt.rcParams.update({'font.size': 24})

    fig = plt.figure(figsize=(20, 20))
    ax = fig.add_subplot(1, 1, 1)

    # Rows of the drawn array go on the y axis and columns on the x axis.
    mt = m.transpose()
    im = ax.imshow(mt, origin='lower', cmap='Reds', alpha=0.7, aspect='auto')

    # Colorbar
    ax.figure.colorbar(im, ax=ax)

    # Loop over data dimensions and create text annotations.
    # mt has one row per y value and one column per x value, so the row index
    # runs over y_values and the column index over x_values.
    for row in range(len(y_values)):
        for col in range(len(x_values)):
            ax.text(col, row, "{:.2f}%".format(mt[row, col]), ha="center", va="center", color="black", alpha=1, fontsize=18)

    # Named ticks
    ax.set_xticks(np.arange(len(x_values)))
    ax.set_yticks(np.arange(len(y_values)))
    ax.set_xticklabels(x_values)
    ax.set_yticklabels(y_values)

    # Axis labels
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)

    # Rotate x labels
    if label_rotation:
        plt.setp(ax.get_xticklabels(), rotation=label_rotation)

    # Title goes in before tight_layout so the layout accounts for it.
    ax.set_title(title)
    fig.tight_layout()

    # Output result
    if not save_fig:
        pass
    elif save_fig == "show":
        plt.show()
    else:
        fig.savefig(save_fig + '.pdf', format='pdf', dpi=600)
        plt.close(fig)

        
######INPUT DATA#####

# `matrix` is indexed [active, passive], and plot_heatmap draws the first axis
# of the matrix it receives along x. Pass the transpose so the passive
# community runs along x and the active one along y, which is what the axis
# labels below say.
m = matrix.transpose()
x_values = final_index
y_values = final_index
title = "Community interactions"
xlabel = "Passive"
ylabel = "Active"
save_fig = "show"

plot_heatmap(m, x_values, y_values, title, xlabel, ylabel, save_fig, 60)