In [None]:
import tweepy
import json
from pymongo import MongoClient
from collections import Counter
from collections.abc import MutableMapping
import logging
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import networkx as nx
import community
import seaborn as sns
from fastprogress import master_bar, progress_bar
import time
import re


# Logger
# Records are written to a log file (via basicConfig) and echoed to the
# console through a StreamHandler attached to the root logger.
logging.basicConfig(filename='Analisi-UserStatsByCommunity.log', format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p', level=logging.INFO)
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')

# Re-running this cell must not stack one more console handler per run
# (each record would then be printed once per attached handler), so the
# handler is tagged with a name and reused when it is already attached.
ch = next((h for h in logging.getLogger().handlers if h.get_name() == 'Analisi-UserStatsByCommunity-console'), None)
if ch is None:
    ch = logging.StreamHandler()
    ch.set_name('Analisi-UserStatsByCommunity-console')
    logging.getLogger().addHandler(ch)
ch.setLevel(logging.INFO)
ch.setFormatter(formatter)

In [None]:
##############
# PARAMETERS #
##############

# Names of the database and of the collection every cell below works on.
DatabaseName = "Streaming"
CollectionName = "Users"

# NOTE(review): the credentials are inline placeholders; avoid committing real values.
client = MongoClient(username='XXX', password='XXX')

# Handle to the working database; collections are looked up as db[CollectionName].
db = client[DatabaseName]

In [None]:
##############################
# CREATE CREATION_YEAR LABEL #
##############################

def load_users(user_collection):
    """Fetch `_id` and `created_at` for every user document that has a `created_at` field.

    Keyword arguments:
    user_collection -- MongoDB Users' Collection (only its `aggregate` method is used)

    Returns the aggregation result materialised as a list of documents.
    Progress messages are printed to stdout.
    """
    has_created_at = {'$match': {'created_at': {'$exists': True}}}
    keep_id_and_created_at = {'$project': {'_id': True, 'created_at': True}}

    print("Query", end=" ")
    cursor = user_collection.aggregate(
        [has_created_at, keep_id_and_created_at],
        allowDiskUse=True,
    )
    print("OK; List", end=" ")
    documents = list(cursor)
    print("OK; Total users:", len(documents))
    return documents

# Tag every user document with the calendar year of its `created_at` value.
users = load_users(db[CollectionName])

creation_date = ""
YEAR = ""

for user in progress_bar(users):
    # `infer_datetime_format` is a boolean flag (deprecated since pandas 2.0),
    # not a format string, so it is no longer passed. `errors='coerce'` turns
    # an unparseable value into NaT instead of aborting the whole pass.
    creation_date = pd.to_datetime(user['created_at'], utc=False, errors='coerce')
    if pd.isna(creation_date):
        logging.warning("Skipping user %s: cannot parse created_at %r", user['_id'], user['created_at'])
        continue

    YEAR = int(creation_date.year)   # only the year of the timestamp is stored

    try:
        db[CollectionName].update_one(
                                    {'_id': user['_id']},
                                    {'$set': {'CREATION_YEAR': YEAR}},
                                    upsert=False,
                                  )
    except Exception as e:
        # Best effort: a failed update is logged and the loop moves on.
        logging.error(e)
        logging.error("Fatal exception inserting users in MongoDB")

In [None]:
####################################
# GET YEAR CREATION % BY COMMUNITY #
####################################

def load_communities(collection):
    """Count documents per (community, CREATION_YEAR) pair.

    Only documents that have a `community` field are considered.

    Keyword arguments:
    collection -- MongoDB collection (only its `aggregate` method is used)

    Returns a list of documents shaped like
    {'_id': {'community': ..., 'CREATION_YEAR': ...}, 'count': <int>}.
    Progress messages are printed to stdout.
    """
    match_stage = {'$match': {'community': {'$exists': True}}}
    project_stage = {'$project': {'community': True, 'CREATION_YEAR': True}}
    group_stage = {
        '$group': {
            '_id': {'community': '$community', 'CREATION_YEAR': '$CREATION_YEAR'},
            'count': {'$sum': 1},
        }
    }

    print("Query", end=" ")
    cursor = collection.aggregate(
        [match_stage, project_stage, group_stage],
        allowDiskUse=True,
    )
    print("OK; List", end=" ")
    combinations = list(cursor)
    print("OK; Total combinations:", len(combinations))
    return combinations

communities = load_communities(db[CollectionName])
communities_df = pd.DataFrame(communities)

# Flatten the compound `_id` group key into regular columns next to the counts.
communities_df2 = pd.concat([pd.json_normalize(communities_df['_id']), communities_df['count']], axis=1)

totalUsers = communities_df2['count'].sum()

print("TOTAL USERS: " + str(totalUsers))

communities_df3 = communities_df2.groupby(['community', 'CREATION_YEAR']).agg({'count': 'sum'})
# Share (%) of each creation year inside its community. `transform('sum')`
# broadcasts the per-community total back onto every row, so the index stays
# (community, CREATION_YEAR). The previous `apply(lambda x: ... float(x.sum()))`
# called float() on a one-element Series (deprecated) and, on pandas >= 2.0,
# prepended a second `community` index level.
communities_df4 = 100 * communities_df3 / communities_df3.groupby(level=0).transform('sum')

print(communities_df4.sort_values(['count'], ascending=[False]).to_string())

# NOTE(review): this cell filters on 2021/2022 while the bot cell filters on 2020/2021 -- confirm the intended years.
print(communities_df2.loc[communities_df2['CREATION_YEAR'].isin([2021,2022])].sort_values(['CREATION_YEAR', 'count'],  ascending=[False, False]))

In [None]:
########################################
# GET BOT YEAR CREATION % BY COMMUNITY #
########################################

def load_bots(collection):
    """Count bot documents per (community, CREATION_YEAR) pair.

    Only documents whose `IS_BOT` field equals 'BOT' are considered.

    Keyword arguments:
    collection -- MongoDB collection (only its `aggregate` method is used)

    Returns a list of documents shaped like
    {'_id': {'community': ..., 'CREATION_YEAR': ...}, 'count': <int>}.
    Progress messages are printed to stdout.
    """
    match_stage = {'$match': {'IS_BOT': 'BOT'}}
    project_stage = {'$project': {'community': True, 'CREATION_YEAR': True}}
    group_stage = {
        '$group': {
            '_id': {'community': '$community', 'CREATION_YEAR': '$CREATION_YEAR'},
            'count': {'$sum': 1},
        }
    }

    print("Query", end=" ")
    cursor = collection.aggregate(
        [match_stage, project_stage, group_stage],
        allowDiskUse=True,
    )
    print("OK; List", end=" ")
    combinations = list(cursor)
    print("OK; Total combinations:", len(combinations))
    return combinations

bots = load_bots(db[CollectionName])
bots_df = pd.DataFrame(bots)

# Flatten the compound `_id` group key into regular columns next to the counts.
bots_df2 = pd.concat([pd.json_normalize(bots_df['_id']), bots_df['count']], axis=1)

totalBots = bots_df2['count'].sum()

# The total covers bot accounts only (load_bots matches IS_BOT == 'BOT').
print("TOTAL USERS: " + str(totalBots))

bots_df3 = bots_df2.groupby(['community', 'CREATION_YEAR']).agg({'count': 'sum'})
# Share (%) of each creation year among the bots of a community.
# `transform('sum')` broadcasts the per-community total back onto every row and
# keeps the (community, CREATION_YEAR) index. The previous
# `apply(lambda x: ... float(x.sum()))` called float() on a one-element Series
# (deprecated) and, on pandas >= 2.0, prepended a second `community` index level.
bots_df4 = 100 * bots_df3 / bots_df3.groupby(level=0).transform('sum')

print(bots_df4.sort_values(['count'], ascending=[False]).to_string())

# NOTE(review): this cell filters on 2020/2021 while the all-users cell filters on 2021/2022 -- confirm the intended years.
print(bots_df2.loc[bots_df2['CREATION_YEAR'].isin([2020,2021])].sort_values(['CREATION_YEAR', 'count'],  ascending=[False, False]))

In [None]:
##################################
# CREATE CUENTAS MATRÍCULA LABEL #
##################################

def is_Matricula(userName):
    """Tell whether a screen name has the "matrícula" shape: some prefix plus 8 trailing digits.

    Keyword arguments:
    userName -- screen name to check

    Returns True when `userName` is a string longer than 8 characters whose
    last 8 characters are all digits, False otherwise. Values that are not
    strings (for example None coming from a null field) yield False instead of
    raising.
    """
    if not isinstance(userName, str) or len(userName) <= 8:
        return False
    # The slice is exactly 8 characters here, so str.isdigit() is equivalent to
    # checking every character individually.
    return userName[-8:].isdigit()

def load_users(user_collection):
    """Fetch `_id` and `screen_name` for every user document that has a `screen_name` field.

    Keyword arguments:
    user_collection -- MongoDB Users' Collection (only its `aggregate` method is used)

    Returns the aggregation result materialised as a list of documents.
    Progress messages are printed to stdout.
    """
    has_screen_name = {'$match': {'screen_name': {'$exists': True}}}
    keep_id_and_screen_name = {'$project': {'_id': True, 'screen_name': True}}

    print("Query", end=" ")
    cursor = user_collection.aggregate(
        [has_screen_name, keep_id_and_screen_name],
        allowDiskUse=True,
    )
    print("OK; List", end=" ")
    documents = list(cursor)
    print("OK; Total users:", len(documents))
    return documents

# Tag every user document with IS_MATRICULA = "SI" / "NO" based on its screen name.
users = load_users(db[CollectionName])

IS_MATRICULA = "NO"

for user in progress_bar(users):
    IS_MATRICULA = "SI" if is_Matricula(user['screen_name']) else "NO"

    try:
        db[CollectionName].update_one(
            {'_id': user['_id']},
            {'$set': {'IS_MATRICULA': IS_MATRICULA}},
            upsert=False,
        )
    except Exception as error:
        # Best effort: a failed update is logged and the loop moves on.
        logging.error(error)
        logging.error("Fatal exception inserting users in MongoDB")

In [None]:
################################
# GET MATRICULA % BY COMMUNITY #
################################

def load_communities(collection):
    """Count documents per (community, IS_MATRICULA) pair.

    Only documents that have a `community` field are considered.

    Keyword arguments:
    collection -- MongoDB collection (only its `aggregate` method is used)

    Returns a list of documents shaped like
    {'_id': {'community': ..., 'IS_MATRICULA': ...}, 'count': <int>}.
    Progress messages are printed to stdout.
    """
    match_stage = {'$match': {'community': {'$exists': True}}}
    project_stage = {'$project': {'community': True, 'IS_MATRICULA': True}}
    group_stage = {
        '$group': {
            '_id': {'community': '$community', 'IS_MATRICULA': '$IS_MATRICULA'},
            'count': {'$sum': 1},
        }
    }

    print("Query", end=" ")
    cursor = collection.aggregate(
        [match_stage, project_stage, group_stage],
        allowDiskUse=True,
    )
    print("OK; List", end=" ")
    combinations = list(cursor)
    print("OK; Total combinations:", len(combinations))
    return combinations

communities = load_communities(db[CollectionName])
communities_df = pd.DataFrame(communities)

# Flatten the compound `_id` group key into regular columns next to the counts.
communities_df2 = pd.concat([pd.json_normalize(communities_df['_id']), communities_df['count']], axis=1)

totalUsers = communities_df2['count'].sum()

print("TOTAL USERS: " + str(totalUsers))

communities_df3 = communities_df2.groupby(['community', 'IS_MATRICULA']).agg({'count': 'sum'})
# Share (%) of SI / NO inside each community. `transform('sum')` broadcasts the
# per-community total back onto every row and keeps the
# (community, IS_MATRICULA) index. The previous
# `apply(lambda x: ... float(x.sum()))` called float() on a one-element Series
# (deprecated) and, on pandas >= 2.0, prepended a second `community` index
# level, which makes the `sort_values('community')` below ambiguous.
communities_df4 = 100 * communities_df3 / communities_df3.groupby(level=0).transform('sum')

print(communities_df3.sort_values('community'))
print(communities_df4.sort_values('community'))

In [None]:
##################################
# GET MATRICULA BY YEAR CREATION #
##################################

def load_communities(collection):
    """Count documents per (CREATION_YEAR, IS_MATRICULA) pair.

    Only documents that have a `CREATION_YEAR` field are considered.

    Keyword arguments:
    collection -- MongoDB collection (only its `aggregate` method is used)

    Returns a list of documents shaped like
    {'_id': {'CREATION_YEAR': ..., 'IS_MATRICULA': ...}, 'count': <int>}.
    Progress messages are printed to stdout.
    """
    match_stage = {'$match': {'CREATION_YEAR': {'$exists': True}}}
    project_stage = {'$project': {'CREATION_YEAR': True, 'IS_MATRICULA': True}}
    group_stage = {
        '$group': {
            '_id': {'CREATION_YEAR': '$CREATION_YEAR', 'IS_MATRICULA': '$IS_MATRICULA'},
            'count': {'$sum': 1},
        }
    }

    print("Query", end=" ")
    cursor = collection.aggregate(
        [match_stage, project_stage, group_stage],
        allowDiskUse=True,
    )
    print("OK; List", end=" ")
    combinations = list(cursor)
    print("OK; Total combinations:", len(combinations))
    return combinations

communities = load_communities(db[CollectionName])
communities_df = pd.DataFrame(communities)

# Spread the compound `_id` key into its own columns, next to the counts.
communities_df2 = pd.concat(
    [pd.json_normalize(communities_df['_id']), communities_df['count']],
    axis=1,
)

totalUsers = communities_df2['count'].sum()
print(f"TOTAL USERS: {totalUsers}")

# Absolute counts per (creation year, matrícula flag).
communities_df3 = (
    communities_df2
    .groupby(['CREATION_YEAR', 'IS_MATRICULA'])
    .agg({'count': 'sum'})
)

print(communities_df3.sort_values('CREATION_YEAR'))

In [None]:
##############################################
# GET MATRICULA YEAR CREATION % BY COMMUNITY #
##############################################

def load_matricula(collection):
    """Count "matrícula" documents per (community, CREATION_YEAR) pair.

    Only documents whose `IS_MATRICULA` field equals 'SI' are considered.

    Keyword arguments:
    collection -- MongoDB collection (only its `aggregate` method is used)

    Returns a list of documents shaped like
    {'_id': {'community': ..., 'CREATION_YEAR': ...}, 'count': <int>}.
    Progress messages are printed to stdout.
    """
    match_stage = {'$match': {'IS_MATRICULA': 'SI'}}
    project_stage = {'$project': {'community': True, 'CREATION_YEAR': True}}
    group_stage = {
        '$group': {
            '_id': {'community': '$community', 'CREATION_YEAR': '$CREATION_YEAR'},
            'count': {'$sum': 1},
        }
    }

    print("Query", end=" ")
    cursor = collection.aggregate(
        [match_stage, project_stage, group_stage],
        allowDiskUse=True,
    )
    print("OK; List", end=" ")
    combinations = list(cursor)
    print("OK; Total combinations:", len(combinations))
    return combinations

# NOTE: the `bots*` names are reused here, but the rows are "matrícula" accounts
# (load_matricula matches IS_MATRICULA == 'SI').
bots = load_matricula(db[CollectionName])
bots_df = pd.DataFrame(bots)

# Spread the compound `_id` key into its own columns, next to the counts.
bots_df2 = pd.concat(
    [pd.json_normalize(bots_df['_id']), bots_df['count']],
    axis=1,
)

totalBots = bots_df2['count'].sum()
print(f"TOTAL USERS: {totalBots}")

# Absolute counts per (community, creation year), largest first.
bots_df3 = (
    bots_df2
    .groupby(['community', 'CREATION_YEAR'])
    .agg({'count': 'sum'})
)

print(bots_df3.sort_values('count', ascending=False).to_string())

In [None]:
###############################
# GET LOCATION % BY COMMUNITY #
###############################

def load_communities(collection):
    """Count documents per (community, location) pair.

    Only documents that have a `community` field are considered.

    Keyword arguments:
    collection -- MongoDB collection (only its `aggregate` method is used)

    Returns a list of documents shaped like
    {'_id': {'community': ..., 'location': ...}, 'count': <int>}.
    Progress messages are printed to stdout.
    """
    match_stage = {'$match': {'community': {'$exists': True}}}
    project_stage = {'$project': {'community': True, 'location': True}}
    group_stage = {
        '$group': {
            '_id': {'community': '$community', 'location': '$location'},
            'count': {'$sum': 1},
        }
    }

    print("Query", end=" ")
    cursor = collection.aggregate(
        [match_stage, project_stage, group_stage],
        allowDiskUse=True,
    )
    print("OK; List", end=" ")
    combinations = list(cursor)
    print("OK; Total combinations:", len(combinations))
    return combinations

communities = load_communities(db[CollectionName])
communities_df = pd.DataFrame(communities)

# Spread the compound `_id` key into its own columns, next to the counts.
communities_df2 = pd.concat(
    [pd.json_normalize(communities_df['_id']), communities_df['count']],
    axis=1,
)

totalUsers = communities_df2['count'].sum()
print(f"TOTAL USERS: {totalUsers}")

# Absolute counts per (community, location).
communities_df3 = (
    communities_df2
    .groupby(['community', 'location'])
    .agg({'count': 'sum'})
)

# Only the 50 most frequent pairs are printed.
print(
    communities_df3
    .sort_values('count', ascending=False)
    .head(50)
    .to_string()
)

In [None]:
##################################
# CREATE FOLLOWERS QUALITY LABEL #
##################################

def load_users(user_collection):
    """Fetch `_id` and `followers_count` for every user document that has a `followers_count` field.

    Keyword arguments:
    user_collection -- MongoDB Users' Collection (only its `aggregate` method is used)

    Returns the aggregation result materialised as a list of documents.
    Progress messages are printed to stdout.
    """
    has_followers_count = {'$match': {'followers_count': {'$exists': True}}}
    keep_id_and_followers = {'$project': {'_id': True, 'followers_count': True}}

    print("Query", end=" ")
    cursor = user_collection.aggregate(
        [has_followers_count, keep_id_and_followers],
        allowDiskUse=True,
    )
    print("OK; List", end=" ")
    documents = list(cursor)
    print("OK; Total users:", len(documents))
    return documents

# Tag every user document with a FOLLOWERS_LABEL bucket derived from followers_count.
users = load_users(db[CollectionName])

FOLLOWERS_LABEL = "LOW"

for user in progress_bar(users):
    # Buckets: more than 99 followers -> HIGH, more than 9 -> MED, otherwise LOW.
    if user['followers_count'] > 99:
        FOLLOWERS_LABEL = "HIGH"
    elif user['followers_count'] > 9:
        FOLLOWERS_LABEL = "MED"
    else:
        FOLLOWERS_LABEL = "LOW"

    try:
        db[CollectionName].update_one(
            {'_id': user['_id']},
            {'$set': {'FOLLOWERS_LABEL': FOLLOWERS_LABEL}},
            upsert=False,
        )
    except Exception as error:
        # Best effort: a failed update is logged and the loop moves on.
        logging.error(error)
        logging.error("Fatal exception inserting users in MongoDB")

In [None]:
#####################################
# GET FOLLOWER LABEL % BY COMMUNITY #
#####################################

def load_communities(collection):
    """Count documents per (community, FOLLOWERS_LABEL) pair.

    Only documents that have a `community` field are considered.

    Keyword arguments:
    collection -- MongoDB collection (only its `aggregate` method is used)

    Returns a list of documents shaped like
    {'_id': {'community': ..., 'FOLLOWERS_LABEL': ...}, 'count': <int>}.
    Progress messages are printed to stdout.
    """
    match_stage = {'$match': {'community': {'$exists': True}}}
    project_stage = {'$project': {'community': True, 'FOLLOWERS_LABEL': True}}
    group_stage = {
        '$group': {
            '_id': {'community': '$community', 'FOLLOWERS_LABEL': '$FOLLOWERS_LABEL'},
            'count': {'$sum': 1},
        }
    }

    print("Query", end=" ")
    cursor = collection.aggregate(
        [match_stage, project_stage, group_stage],
        allowDiskUse=True,
    )
    print("OK; List", end=" ")
    combinations = list(cursor)
    print("OK; Total combinations:", len(combinations))
    return combinations

communities = load_communities(db[CollectionName])
communities_df = pd.DataFrame(communities)

# Spread the compound `_id` key into its own columns, next to the counts.
communities_df2 = pd.concat(
    [pd.json_normalize(communities_df['_id']), communities_df['count']],
    axis=1,
)

totalUsers = communities_df2['count'].sum()
print(f"TOTAL USERS: {totalUsers}")

# Absolute counts per (community, followers bucket).
communities_df3 = (
    communities_df2
    .groupby(['community', 'FOLLOWERS_LABEL'])
    .agg({'count': 'sum'})
)

print(communities_df3.sort_values('community'))

In [None]:
######################
# USERS BY COMMUNITY #
######################

def load_communities(collection):
    """Count documents per community.

    Only documents that have a `community` field are considered.

    Keyword arguments:
    collection -- MongoDB collection (only its `aggregate` method is used)

    Returns a list of documents shaped like
    {'_id': {'community': ...}, 'count': <int>}.
    Progress messages are printed to stdout.
    """
    match_stage = {'$match': {'community': {'$exists': True}}}
    project_stage = {'$project': {'community': True}}
    group_stage = {
        '$group': {
            '_id': {'community': '$community'},
            'count': {'$sum': 1},
        }
    }

    print("Query", end=" ")
    cursor = collection.aggregate(
        [match_stage, project_stage, group_stage],
        allowDiskUse=True,
    )
    print("OK; List", end=" ")
    combinations = list(cursor)
    print("OK; Total combinations:", len(combinations))
    return combinations

communities = load_communities(db[CollectionName])
communities_df = pd.DataFrame(communities)

# Spread the `_id` key into its own column, next to the counts.
communities_df2 = pd.concat(
    [pd.json_normalize(communities_df['_id']), communities_df['count']],
    axis=1,
)

totalUsers = communities_df2['count'].sum()
print(f"TOTAL USERS: {totalUsers}")

# Users per community, largest community first.
communities_df3 = (
    communities_df2
    .groupby(['community'])
    .agg({'count': 'sum'})
)

print(communities_df3.sort_values('count', ascending=False))

In [None]:
#############################
# INTERACTIONS BY COMMUNITY #
#############################

# NOTE(review): this rebinds the notebook-wide CollectionName (set to "Users" in
# the parameters cell). Any earlier cell re-run after this one will read from and
# write to "Campanya-Interactions" instead.
CollectionName = "Campanya-Interactions"

def load_communities(collection):
    """Count documents per `ACTIVE_community` value.

    Only documents that have an `ACTIVE_community` field are considered.

    Keyword arguments:
    collection -- MongoDB collection (only its `aggregate` method is used)

    Returns a list of documents shaped like
    {'_id': {'ACTIVE_community': ...}, 'count': <int>}.
    Progress messages are printed to stdout.
    """
    match_stage = {'$match': {'ACTIVE_community': {'$exists': True}}}
    project_stage = {'$project': {'ACTIVE_community': True}}
    group_stage = {
        '$group': {
            '_id': {'ACTIVE_community': '$ACTIVE_community'},
            'count': {'$sum': 1},
        }
    }

    print("Query", end=" ")
    cursor = collection.aggregate(
        [match_stage, project_stage, group_stage],
        allowDiskUse=True,
    )
    print("OK; List", end=" ")
    combinations = list(cursor)
    print("OK; Total combinations:", len(combinations))
    return combinations

communities = load_communities(db[CollectionName])
communities_df = pd.DataFrame(communities)

# Spread the `_id` key into its own column, next to the counts.
communities_df2 = pd.concat(
    [pd.json_normalize(communities_df['_id']), communities_df['count']],
    axis=1,
)

# NOTE(review): the label below says USERS, but the rows come from the
# interactions collection -- the figure is presumably a number of interactions.
totalUsers = communities_df2['count'].sum()
print(f"TOTAL USERS: {totalUsers}")

# Documents per ACTIVE_community, largest first.
communities_df3 = (
    communities_df2
    .groupby(['ACTIVE_community'])
    .agg({'count': 'sum'})
)

print(communities_df3.sort_values('count', ascending=False))