In [None]:
import json
import logging
import os
from collections import Counter

import community
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import seaborn as sns
import tweepy
from fastprogress import master_bar, progress_bar
from pymongo import MongoClient

#Logger
# Root logger writes INFO+ records to DataMetrics.log and echoes them through a
# stream handler (stderr by default) so they also show up in the notebook output.
logging.basicConfig(filename='DataMetrics.log', format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p', level=logging.INFO)
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')

root_logger = logging.getLogger()
# Re-running this cell must not stack a second stream handler on the root
# logger, otherwise every record is printed once per execution of the cell.
# The exact type comparison is deliberate: FileHandler subclasses StreamHandler,
# and the file handler installed by basicConfig must not be mistaken for ours.
ch = next((h for h in root_logger.handlers if type(h) is logging.StreamHandler), None)
if ch is None:
    ch = logging.StreamHandler()
    root_logger.addHandler(ch)
ch.setLevel(logging.INFO)
ch.setFormatter(formatter)

In [None]:
##############
# PARAMETERS #
##############

# Credentials are read from the environment so that real secrets never have to
# be typed into (and saved with) the notebook. Set MONGO_USERNAME and
# MONGO_PASSWORD before starting the kernel; the 'XXX' placeholders are kept as
# the fallback so the cell behaves as before when the variables are unset.
client = MongoClient(
    username=os.environ.get('MONGO_USERNAME', 'XXX'),
    password=os.environ.get('MONGO_PASSWORD', 'XXX'),
)

# Database and collection that the graph-building cells read tweets from.
DatabaseName = "Hashtags"
CollectionName = "Campanya"

In [None]:
#######################
# BUILDING GRAPH (RT) #
#######################

# Builds a directed, weighted retweet graph: an edge A -> B with weight w means
# user A retweeted user B in w of the stored tweets. The graph is written to
# FILE_NAME in GraphML format.
FILE_NAME = "4M-Campanya-RT.graphml"

G = nx.DiGraph()

db = client[DatabaseName]

# With no_cursor_timeout=True the server does not expire the cursor on its own,
# so it has to be closed explicitly -- including when the loop below raises.
tweets = db[CollectionName].find(no_cursor_timeout=True, batch_size=1000000)

try:
    for result in tweets:
        uid = result['user']['screen_name']
        # Every tweet author becomes a node, even if they never retweet anyone.
        G.add_node(uid)

        if 'retweeted_status' in result:
            retweeted_user = result['retweeted_status']['user']['screen_name']
            if G.has_edge(uid, retweeted_user):
                G[uid][retweeted_user]['weight'] += 1.0
            else:
                G.add_edge(uid, retweeted_user, weight=1.0)
finally:
    tweets.close()

print("Nombre de nodes: {}".format(G.number_of_nodes()))
print("Nombre d'arestes: {}".format(G.number_of_edges()))

nx.write_graphml(G, FILE_NAME)

In [None]:
##########################
# BUILDING GRAPH (Quote) #
##########################

# Builds a directed, weighted quote graph: an edge A -> B with weight w means
# user A quoted user B in w of the stored tweets. The graph is written to
# FILE_NAME in GraphML format.
FILE_NAME = "4M-Campanya-Quote.graphml"

G = nx.DiGraph()

db = client[DatabaseName]

# With no_cursor_timeout=True the server does not expire the cursor on its own,
# so it has to be closed explicitly -- including when the loop below raises.
tweets = db[CollectionName].find(no_cursor_timeout=True, batch_size=1000000)

try:
    for result in tweets:
        uid = result['user']['screen_name']
        # Every tweet author becomes a node, even if they never quote anyone.
        G.add_node(uid)

        # A tweet can be flagged as a quote without embedding the quoted status;
        # only tweets that carry the embedded status contribute an edge.
        if result['is_quote_status'] and 'quoted_status' in result:
            quoted_user = result['quoted_status']['user']['screen_name']
            if G.has_edge(uid, quoted_user):
                G[uid][quoted_user]['weight'] += 1.0
            else:
                G.add_edge(uid, quoted_user, weight=1.0)
finally:
    tweets.close()

print("Nombre de nodes: {}".format(G.number_of_nodes()))
print("Nombre d'arestes: {}".format(G.number_of_edges()))

nx.write_graphml(G, FILE_NAME)

In [None]:
##########################
# BUILDING GRAPH (Reply) #
##########################

# Builds a directed, weighted reply graph: an edge A -> B with weight w means
# user A replied to a status of user B in w of the stored tweets. The graph is
# written to FILE_NAME in GraphML format.
FILE_NAME = "4M-Campanya-Reply.graphml"

G = nx.DiGraph()

db = client[DatabaseName]

# With no_cursor_timeout=True the server does not expire the cursor on its own,
# so it has to be closed explicitly -- including when the loop below raises.
tweets = db[CollectionName].find(no_cursor_timeout=True, batch_size=1000000)

try:
    for result in tweets:
        uid = result['user']['screen_name']
        # Every tweet author becomes a node, even if they never reply to anyone.
        G.add_node(uid)

        if result['in_reply_to_status_id'] is not None:
            # NOTE(review): assumes in_reply_to_screen_name is populated whenever
            # in_reply_to_status_id is -- confirm against the stored data.
            replied_user = result['in_reply_to_screen_name']
            if G.has_edge(uid, replied_user):
                G[uid][replied_user]['weight'] += 1.0
            else:
                G.add_edge(uid, replied_user, weight=1.0)
finally:
    tweets.close()

print("Nombre de nodes: {}".format(G.number_of_nodes()))
print("Nombre d'arestes: {}".format(G.number_of_edges()))

nx.write_graphml(G, FILE_NAME)

In [None]:
#################
# GRAPH METRICS #
#################

# Prints a few generic figures about the graph; the in-depth analysis is done
# mainly in Gephi.
# NOTE(review): G is whichever graph the most recently executed build cell left
# in the kernel (the Reply graph on a top-to-bottom run) -- confirm that this is
# the graph you intend to measure.

# Community detection and modularity both work on the undirected version of the
# graph. to_undirected() builds a new graph on every call, so convert once and
# reuse the result.
G_undirected = G.to_undirected()
part = community.best_partition(G_undirected)
mod = community.modularity(part, G_undirected)

print("S'han detectat {} comunitats: ".format(len(set(part.values()))))

# (community id, member count) pairs for the five largest communities.
comunitats_mes_grans = Counter(part.values()).most_common(5)
print("Les comunitats més grans són: {}".format(comunitats_mes_grans))

# Same five communities, expressed as a percentage of all nodes in G.
n = G.number_of_nodes()
a = {k: str(round(float(v/n*100),2))+"%" for k, v in comunitats_mes_grans}
print("Les comunitats més grans tenen les següents proporcions: {}".format(a))

print("La modularitat és: " + str(mod))

# Unweighted total degree (in + out) of every node.
degrees = [d for _, d in G.degree()]

# Show degree statistics.
print('El grau màxim és: {}'.format(max(degrees)))
#print('El grau mínim és: {}'.format(min(degrees)))
print('La mitjana dels graus del graf és: {}'.format(np.mean(degrees)))
#print('La mediana dels graus del graf és: {}'.format(np.median(degrees)))

# Also look at the values that indicate which nodes are the most central.
# Users with the highest out-degree matter most: they are analysed later to
# detect possible bots or otherwise unusual behaviour.
centralitat_grau = nx.degree_centrality(G)
sorted_g = sorted(centralitat_grau.items(), key=lambda i: i[1], reverse=True)[:20]
print("Usuaris amb més Centralitat de grau:")
print(sorted_g)

# Weighted in-degree: sum of the 'weight' attribute over incoming edges.
indeg = G.in_degree(weight='weight')
sorted_indeg = sorted(indeg, key=lambda i: i[1], reverse=True)[:20]
print("Usuaris amb més grau d'ENTRADA:")
print(sorted_indeg)

# Weighted out-degree: sum of the 'weight' attribute over outgoing edges.
outdeg = G.out_degree(weight='weight')
sorted_outdeg = sorted(outdeg, key=lambda i: i[1], reverse=True)[:50]
print("Usuaris amb més grau de SORTIDA:")
print(sorted_outdeg)