In [None]:
import json
import logging
import os
from collections import Counter

import community
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import seaborn as sns
import tweepy
from fastprogress import master_bar, progress_bar
from pymongo import MongoClient

#Logger
# Root logger writes to DataMetrics.log (via basicConfig) and echoes to the console.
logging.basicConfig(filename='DataMetrics.log', format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p', level=logging.INFO)

# Re-running this cell must not stack a second console handler on the root
# logger (every message would then be printed once per run), so the handler is
# tagged with a name and reused when it is already attached.
_CONSOLE_HANDLER_NAME = 'DataMetrics-console'
_root_logger = logging.getLogger()
ch = next((h for h in _root_logger.handlers if h.get_name() == _CONSOLE_HANDLER_NAME), None)
if ch is None:
    ch = logging.StreamHandler()
    ch.set_name(_CONSOLE_HANDLER_NAME)
    _root_logger.addHandler(ch)
ch.setLevel(logging.INFO)
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
ch.setFormatter(formatter)

In [None]:
##############
# PARAMETERS #
##############

# MongoDB credentials are read from the environment so they never have to be
# stored in the notebook. The 'XXX' placeholders remain as the fallback when
# MONGO_USERNAME / MONGO_PASSWORD are not set.
client = MongoClient(
    username=os.environ.get('MONGO_USERNAME', 'XXX'),
    password=os.environ.get('MONGO_PASSWORD', 'XXX'),
)

# Database and collection that every cell below reads from.
DatabaseName = "Streaming"
CollectionName = "Campanya"

In [None]:
######################
# COLLECTION METRICS #
######################

# Counts how many stored tweets are retweets, quotes, replies or plain tweets.
# Each tweet falls in exactly one bucket, checked in that order of precedence.

db = client[DatabaseName]
collection = db[CollectionName]
# Cursor.count() was removed in PyMongo 4; count_documents() is the supported call.
print("Twits Total: {}".format(collection.count_documents({})))

tweets = collection.find()

rts = 0
quotes = 0
replies = 0
simples = 0

for tweet in tweets:
    if "retweeted_status" in tweet:
        rts += 1
    # .get() keeps documents that lack these optional fields from raising KeyError;
    # such documents are counted as simple tweets.
    elif tweet.get("is_quote_status"):
        quotes += 1
    elif tweet.get("in_reply_to_status_id") is not None:
        replies += 1
    else:
        simples += 1

print("RTs: " + str(rts))
print("Quote: " + str(quotes))
print("Reply: " + str(replies))
print("Simples: " + str(simples))

In [None]:
#####################
# CONVERT TO PANDAS #
#####################

# Loads the tweet id, creation timestamp and author id of every stored tweet
# into the DataFrame `tweets_DF` (Mongo's `_id` column is included as well).

db = client[DatabaseName]
collection = db[CollectionName]
# Cursor.count() was removed in PyMongo 4; count_documents() is the supported call.
print("Twits Total: {}".format(collection.count_documents({})))

tweets = collection.find({}, {'id':1, 'created_at':1, 'user.id':1}, no_cursor_timeout=True, batch_size=1000000)
try:
    tweets_DF = pd.DataFrame(list(tweets))
finally:
    # A no_cursor_timeout cursor is not timed out by the server, so it has to
    # be closed explicitly even when building the DataFrame fails.
    tweets.close()

# 'user' holds the projected sub-document; pull the author id out of it once.
user_ids = tweets_DF['user'].apply(lambda user: user.get('id'))

print("Number of different tweets:", tweets_DF['id'].nunique())
print("Number of different users:", user_ids.nunique())

In [None]:
###############################
# PLOT TIME ANALYSIS  (NICE!) #
###############################

# Bar chart of tweets per day (Europe/Madrid local dates), with each day
# coloured by electoral phase: campaign, reflection day or election day onwards.
# Requires `tweets_DF` from the "CONVERT TO PANDAS" cell.

import matplotlib.patches as mpatches
import datetime as dt

# First day of each phase; every day before REFLECTION_DAY counts as campaign.
# For the Andalusia 2022 data use dt.date(2022, 6, 18) and dt.date(2022, 6, 19).
REFLECTION_DAY = dt.date(2021, 5, 3)
ELECTION_DAY = dt.date(2021, 5, 4)

# Phase colours.
# Other palette colours used in this project: "#FFC400" yellow, "#DD8452" orange,
# "#8172B3" purple, "#64B5CD" cyan, "#937860" brown, "#8C8C8C" gray.
CAMPAIGN_COLOR = "#55A868"    # green
REFLECTION_COLOR = "#4C72B0"  # blue
ELECTION_COLOR = "#C44E52"    # red


def _phase_color(day):
    """Return the bar colour for the electoral phase that `day` (a date) falls in."""
    if day < REFLECTION_DAY:
        return CAMPAIGN_COLOR
    if day < ELECTION_DAY:
        return REFLECTION_COLOR
    return ELECTION_COLOR


tweets_DF['created_at'] = pd.to_datetime(tweets_DF['created_at']).dt.tz_convert('Europe/Madrid')
# 'created_at' is already a datetime column here, so the former second
# pd.to_datetime() call (which passed a format string to the boolean
# `infer_datetime_format` flag) was a no-op; the column is kept as an alias.
tweets_DF['created_at2'] = tweets_DF['created_at']
tweets_DF['date'] = tweets_DF['created_at2'].dt.date   # local calendar date of each tweet

start = min(tweets_DF['date'])     # first day
end = max(tweets_DF['date'])       # last day

# Number of tweets per day; groupby returns the days in ascending order.
df_dates = tweets_DF.groupby('date').agg({'_id':'count'}).reset_index()

# Same rows in date order; the extra 'index' column is each day's rank by volume.
df_dates2 = df_dates.sort_values(by='_id',axis='index',ascending=False).reset_index().sort_values(by='date',axis='index')

# One colour per bar, in the same (date) order as the bars.
colors = [_phase_color(day) for day in df_dates2.date]

fig, ax = plt.subplots(figsize=(15,5))
sns.barplot(x=df_dates.date, y=df_dates._id, palette=colors, ax=ax)
ax.xaxis.set_tick_params(rotation=75)
ax.set_xlabel('Date')
ax.set_ylabel('Number of tweets')
ax.set_title("Tweet distribution grouped by day")

# Create legend, listed in chronological order of the phases.
green_patch = mpatches.Patch(color=CAMPAIGN_COLOR, label='Campaign')
blue_patch = mpatches.Patch(color=REFLECTION_COLOR, label='Reflection Journey')
red_patch = mpatches.Patch(color=ELECTION_COLOR, label='Election Day')
ax.legend(handles=[green_patch, blue_patch, red_patch], loc="upper left")

# Lay out after labels, title and legend exist so none of them get clipped.
fig.tight_layout()

#plt.savefig(GRAPHICS_DIR + "total-traffic-timeline" + FIG_EXTENSION,bbox_inches = "tight")
plt.show()

In [None]:
#####################
# RT-QUOTE COVERAGE #
#####################

# Estimates how much of the retweet / quote activity was captured.
# For every original tweet, the number of retweets (or quotes) of it found in
# the collection is compared with the largest retweet_count (or quote_count)
# reported for it in any captured copy.

db = client[DatabaseName]
collection = db[CollectionName]
# Cursor.count() was removed in PyMongo 4; count_documents() is the supported call.
print("Twits Total: {}".format(collection.count_documents({})))


def _record_capture(coverage, status_id, reported_count):
    """Register one captured interaction with the original tweet `status_id`.

    `coverage` maps original tweet id -> [captured so far, highest reported count].
    """
    entry = coverage.get(status_id)
    if entry is None:
        coverage[status_id] = [1, reported_count]
    else:
        entry[0] += 1
        entry[1] = max(entry[1], reported_count)


def _coverage_percentage(captured, reported):
    """Return captured/reported as a percentage, or NaN when nothing was reported."""
    if not reported:
        return float('nan')
    return captured / reported * 100


dictRTCoverage = {}
dictQuoteCoverage = {}

tweets = collection.find(no_cursor_timeout=True, batch_size=1000000)
try:
    for tweet in tweets:
        if "retweeted_status" in tweet:
            original_twit = tweet['retweeted_status']
            _record_capture(dictRTCoverage, original_twit["id"], original_twit["retweet_count"])
        # Retweets take precedence: a quote is only counted when the tweet is not a retweet.
        elif tweet.get('is_quote_status') and 'quoted_status' in tweet:
            original_twit = tweet['quoted_status']
            # Quoted tweets without a quote_count are skipped.
            if 'quote_count' in original_twit:
                _record_capture(dictQuoteCoverage, original_twit["id"], original_twit["quote_count"])
finally:
    # A no_cursor_timeout cursor is not timed out by the server, so close it
    # even when the scan is interrupted.
    tweets.close()

totalRTCaptured = sum(captured for captured, _ in dictRTCoverage.values())
maxRTCaptured = sum(reported for _, reported in dictRTCoverage.values())

print("RTs Captured: {}".format(totalRTCaptured))
print("Max RTs Captured: {}".format(maxRTCaptured))
print("RT Coverage: {:0.2f}%".format(_coverage_percentage(totalRTCaptured, maxRTCaptured)))

totalQuoteCaptured = sum(captured for captured, _ in dictQuoteCoverage.values())
maxQuoteCaptured = sum(reported for _, reported in dictQuoteCoverage.values())

print("Quotes Captured: {}".format(totalQuoteCaptured))
print("Max Quotes Captured: {}".format(maxQuoteCaptured))
print("Quote Coverage: {:0.2f}%".format(_coverage_percentage(totalQuoteCaptured, maxQuoteCaptured)))

In [None]:
####################
# ACCOUNT CREATION #
####################

# Bar chart of the distinct tweeting accounts grouped by the year in which
# each account was created.
# NOTE: this cell rebinds `tweets_DF` to a one-row-per-user frame, replacing
# the frame built in "CONVERT TO PANDAS".

db = client[DatabaseName]
collection = db[CollectionName]
# Cursor.count() was removed in PyMongo 4; count_documents() is the supported call.
print("Twits Total: {}".format(collection.count_documents({})))

tweets = collection.find({}, {'id':1, 'user.id':1, 'user.created_at':1}, no_cursor_timeout=True, batch_size=1000000)
try:
    tweets_DF_withDuplicates = pd.DataFrame(list(tweets))
finally:
    # A no_cursor_timeout cursor is not timed out by the server, so it has to
    # be closed explicitly even when building the DataFrame fails.
    tweets.close()

tweets_DF_withDuplicates['userId'] = tweets_DF_withDuplicates['user'].apply(lambda user: user.get('id'))
print("Number of different users:", tweets_DF_withDuplicates['userId'].nunique())

# Keep one row per account. The explicit copy makes the column assignments
# below write to an independent frame (no SettingWithCopyWarning).
tweets_DF = tweets_DF_withDuplicates.drop_duplicates(['userId']).copy()
print("Number of different users:", tweets_DF['userId'].nunique())
print("Number of rows in DF without duplicates:", tweets_DF.shape[0])

# `infer_datetime_format` is a boolean flag (deprecated in pandas 2), not a
# format string, so it is no longer passed; only the year is needed below.
tweets_DF['user_created_at'] = pd.to_datetime(tweets_DF['user'].apply(lambda user: user.get('created_at')), utc=False)
tweets_DF['creation_date'] = tweets_DF['user_created_at'].dt.year   # account creation year

start = min(tweets_DF['creation_date'])     # first year
end = max(tweets_DF['creation_date'])       # last year

# Number of accounts per creation year.
df_dates = tweets_DF.groupby('creation_date').agg({'_id':'count'}).reset_index()

# One shade of blue per year, chosen by the year's rank in number of accounts.
# After the first sort + reset_index the index is that rank (0 = most accounts);
# the second sort puts the rows back in year order to match the bars.
pal = sns.color_palette("Blues_d", len(df_dates))
df_dates2 = df_dates.sort_values(by='_id',axis='index',ascending=False).reset_index().sort_values(by='creation_date',axis='index')
pal2 = [pal[rank] for rank in df_dates2.index]

fig, ax = plt.subplots(figsize=(15,5))
sns.barplot(x=df_dates.creation_date, y=df_dates._id, palette=pal2, ax=ax)
ax.xaxis.set_tick_params(rotation=75)
ax.set_xlabel('Year')
ax.set_ylabel('Number of accounts')
ax.set_title("Number of accounts by creation year")
# Lay out after labels and title exist so none of them get clipped.
fig.tight_layout()
#plt.savefig(GRAPHICS_DIR + "total-traffic-timeline" + FIG_EXTENSION,bbox_inches = "tight")
plt.show()

In [None]:
#####################
# CUENTAS MATRÍCULA #
#####################

def is_Matricula (userName):
    """Tell whether a screen name looks like a "license plate" handle.

    A name qualifies when it is longer than 8 characters and its last 8
    characters are all digits (e.g. ``"juan12345678"``).

    Args:
        userName: the screen name to inspect (a string).

    Returns:
        True if the name matches the pattern, False otherwise.
    """
    # Names of 8 characters or fewer never qualify, even if made only of digits.
    if len(userName) <= 8:
        return False
    return all(ch.isdigit() for ch in userName[-8:])

# Bar chart of the "license plate" accounts (screen name ending in 8 digits,
# see is_Matricula) grouped by the year in which each account was created.
# NOTE: this cell rebinds `tweets_DF` to a one-row-per-user frame.

db = client[DatabaseName]
collection = db[CollectionName]
# Cursor.count() was removed in PyMongo 4; count_documents() is the supported call.
print("Twits Total: {}".format(collection.count_documents({})))

tweets = collection.find({}, {'id':1, 'user.id':1, 'user.screen_name':1, 'user.created_at':1})
tweets_DF_withDuplicates = pd.DataFrame(list(tweets))

tweets_DF_withDuplicates['userId'] = tweets_DF_withDuplicates['user'].apply(lambda user: user.get('id'))
print("Number of different users:", tweets_DF_withDuplicates['userId'].nunique())

# Keep one row per account. The explicit copies make the column assignments
# below write to independent frames (no SettingWithCopyWarning).
tweets_DF = tweets_DF_withDuplicates.drop_duplicates(['userId']).copy()
tweets_DF['userName'] = tweets_DF['user'].apply(lambda user: user.get('screen_name'))
tweets_DF['isMatricula'] = tweets_DF['userName'].apply(is_Matricula)
tweets_DF_Matricula = tweets_DF.loc[tweets_DF['isMatricula']].copy()

# `infer_datetime_format` is a boolean flag (deprecated in pandas 2), not a
# format string, so it is no longer passed; only the year is needed below.
tweets_DF_Matricula['user_created_at'] = pd.to_datetime(tweets_DF_Matricula['user'].apply(lambda user: user.get('created_at')), utc=False)
tweets_DF_Matricula['creation_date'] = tweets_DF_Matricula['user_created_at'].dt.year   # account creation year

start = min(tweets_DF_Matricula['creation_date'])     # first year
end = max(tweets_DF_Matricula['creation_date'])       # last year

# Number of license-plate accounts per creation year.
df_dates = tweets_DF_Matricula.groupby('creation_date').agg({'_id':'count'}).reset_index()

# One shade of blue per year, chosen by the year's rank in number of accounts.
# After the first sort + reset_index the index is that rank (0 = most accounts);
# the second sort puts the rows back in year order to match the bars.
pal = sns.color_palette("Blues_d", len(df_dates))
df_dates2 = df_dates.sort_values(by='_id',axis='index',ascending=False).reset_index().sort_values(by='creation_date',axis='index')
pal2 = [pal[rank] for rank in df_dates2.index]

fig, ax = plt.subplots(figsize=(15,5))
sns.barplot(x=df_dates.creation_date, y=df_dates._id, palette=pal2, ax=ax)
ax.xaxis.set_tick_params(rotation=75)
ax.set_xlabel('Year')
ax.set_ylabel('Number of accounts')
ax.set_title("Cuentas matricula by creation year")
# Lay out after labels and title exist so none of them get clipped.
fig.tight_layout()
#plt.savefig(GRAPHICS_DIR + "total-traffic-timeline" + FIG_EXTENSION,bbox_inches = "tight")
plt.show()

In [None]:
##################
# BUILDING GRAPH #
##################

# Builds the directed interaction graph G: one node per screen name and an
# edge author -> target for every retweet, quote or reply, with the number of
# interactions accumulated in the edge attribute 'weight'. The graph is then
# written to FILE_NAME in GraphML format.

FILE_NAME = "4M-TOTAL.graphml"


def _add_interaction(graph, source, target):
    """Count one interaction from `source` to `target` on the edge's 'weight'."""
    if target is None:
        # No known target user: adding the edge would create a bogus None node.
        return
    if graph.has_edge(source, target):
        graph[source][target]['weight'] += 1.0
    else:
        graph.add_edge(source, target, weight=1.0)


G = nx.DiGraph()

db = client[DatabaseName]

tweets = db[CollectionName].find(no_cursor_timeout=True, batch_size=1000000)
try:
    for result in tweets:
        uid = result['user']['screen_name']
        # Every author becomes a node, even without any outgoing interaction.
        G.add_node(uid)

        # A tweet contributes at most one edge: retweet, else quote, else reply.
        if 'retweeted_status' in result:
            _add_interaction(G, uid, result['retweeted_status']['user']['screen_name'])
        elif result.get('is_quote_status'):
            # Quote tweets stored without the quoted tweet add no edge.
            if 'quoted_status' in result:
                _add_interaction(G, uid, result['quoted_status']['user']['screen_name'])
        elif result.get('in_reply_to_status_id') is not None:
            _add_interaction(G, uid, result.get('in_reply_to_screen_name'))
finally:
    # A no_cursor_timeout cursor is not timed out by the server, so close it
    # even when the scan is interrupted.
    tweets.close()

print("Nombre de nodes: {}".format(G.number_of_nodes()))
print("Nombre d'arestes: {}".format(G.number_of_edges()))

nx.write_graphml(G, FILE_NAME)

In [None]:
#################
# GRAPH METRICS #
#################

# A few generic statistics of the graph G built in the previous cell; the
# main analysis is carried out in Gephi.

# Community detection runs on the undirected version of the graph. It is built
# once and reused, because to_undirected() copies the whole graph.
# NOTE(review): for reciprocal edges (u->v and v->u) the undirected edge keeps
# the attributes of only one direction, so weights are not summed - confirm
# this is acceptable.
G_undirected = G.to_undirected()
# NOTE(review): best_partition appears to be randomised, so the communities
# may differ between runs unless a seed is fixed - confirm against the
# installed python-louvain version.
part = community.best_partition(G_undirected)
mod = community.modularity(part, G_undirected)

print("S'han detectat {} comunitats: ".format(len(set(part.values()))))

# (community id, number of nodes) for the five largest communities.
comunitats_mes_grans = Counter(part.values()).most_common(5)
print("Les comunitats més grans són: {}".format(comunitats_mes_grans))

# Share of all nodes that each of those communities holds, as a percentage string.
n = G.number_of_nodes()
a = {k: str(round(float(v/n*100),2))+"%" for k, v in comunitats_mes_grans}
print("Les comunitats més grans tenen les següents proporcions: {}".format(a))

print("La modularitat és: " + str(mod))

# Unweighted total degree (in + out) of every node.
degrees = [d for _, d in G.degree()]

# Degree statistics.
print('El grau màxim és: {}'.format(max(degrees)))
print('El grau mínim és: {}'.format(min(degrees)))
print('La mitjana dels graus del graf és: {}'.format(np.mean(degrees)))
print('La mediana dels graus del graf és: {}'.format(np.median(degrees)))

# Centrality figures to identify the most central nodes. The users with the
# highest out-degree matter most: they are analysed later to detect possible
# bots or unusual behaviour.
centralitat_grau = nx.degree_centrality(G)
sorted_g = sorted(centralitat_grau.items(), key=lambda i: i[1], reverse=True)[:20]
print("Usuaris amb més Centralitat de grau:")
print(sorted_g)

# Weighted in-degree: interactions received (top 20).
indeg = G.in_degree(weight='weight')
sorted_indeg = sorted(indeg, key=lambda i: i[1], reverse=True)[:20]
print("Usuaris amb més grau d'ENTRADA:")
print(sorted_indeg)

# Weighted out-degree: interactions made (top 50).
outdeg = G.out_degree(weight='weight')
sorted_outdeg = sorted(outdeg, key=lambda i: i[1], reverse=True)[:50]
print("Usuaris amb més grau de SORTIDA:")
print(sorted_outdeg)