In [None]:
import json
import logging
import os
import re
import time
from collections import Counter
from collections.abc import MutableMapping

import community
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import seaborn as sns
import tweepy
from fastprogress import master_bar, progress_bar
from pymongo import MongoClient


#Logger
# Root logger: INFO and above are written to a log file. basicConfig does
# nothing when the root logger already has handlers, so re-running this cell
# never attaches a second file handler.
logging.basicConfig(filename='Anàlisi-GetBotsByCommunity.log', format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p', level=logging.INFO)
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')

# Mirror the records on the console. A console handler that is already attached
# to the root logger (e.g. from a previous run of this cell) is reused instead
# of stacking a new one, which would print every record several times.
# The exact type check is deliberate: FileHandler subclasses StreamHandler and
# must not be mistaken for the console handler.
ch = next((h for h in logging.getLogger().handlers if type(h) is logging.StreamHandler), None)
if ch is None:
    ch = logging.StreamHandler()
    logging.getLogger().addHandler(ch)
ch.setLevel(logging.INFO)
ch.setFormatter(formatter)

In [None]:
##############
# PARAMETERS #
##############

# Credentials are read from the environment so they never need to be typed
# into (or saved with) the notebook. The 'XXX' fallbacks are the original
# placeholders, so behaviour is unchanged when the variables are not set.
client = MongoClient(
    username=os.environ.get('MONGO_USERNAME', 'XXX'),
    password=os.environ.get('MONGO_PASSWORD', 'XXX'),
)

# Database and collection that every cell below reads from and writes to.
DatabaseName = "Hashtags"
CollectionName = "Users"

db = client[DatabaseName]

In [None]:
#################
# CREATE BOT DF #
#################

def load_users(user_collection):
    """Fetch the users that have a botometer_score field.

    Keyword arguments:
    user_collection -- MongoDB Users' Collection (must expose ``aggregate``)

    Returns the aggregation result as a list; each document is projected to
    ``_id`` and ``botometer_score.cap.universal``. Progress is printed to stdout.
    """
    # Stage 1: keep only documents where a botometer_score was stored.
    only_scored = {'$match': {'botometer_score': {'$exists': True}}}
    # Stage 2: return just the identifier and the nested score value.
    id_and_score = {'$project': {'_id': True, 'botometer_score.cap.universal': True}}

    print("Query", end=" ")
    cursor = user_collection.aggregate([only_scored, id_and_score], allowDiskUse=True)
    print("OK; List", end=" ")
    # Materialise the cursor so the result can be counted and re-iterated.
    fetched = list(cursor)
    print("OK; Total users:", len(fetched))
    return fetched

def flatten(d, parent_key='', sep='_'):
    """Collapse a nested mapping into a single-level dict.

    Keyword arguments:
    d -- mapping whose values may themselves be mappings
    parent_key -- prefix put in front of every key of ``d`` (empty for the top level)
    sep -- string placed between a prefix and the key that follows it

    Example: {'a': {'b': 1}} becomes {'a_b': 1}. Nested mappings that are
    empty contribute no keys.
    """
    flat = {}
    for key, value in d.items():
        # Top-level keys are kept as they are; nested keys carry their path.
        full_key = parent_key + sep + key if parent_key else key
        if not isinstance(value, MutableMapping):
            flat[full_key] = value
            continue
        # Recurse into the sub-mapping, using the path so far as its prefix.
        flat.update(flatten(value, full_key, sep=sep))
    return flat

# Flattened user documents; the labelling cell below iterates over this list
# and looks up the 'botometer_score_cap_universal' key.
users = [flatten(u) for u in load_users(db[CollectionName])]

# Pick the two needed fields by name and rename them explicitly. Assigning
# df.columns positionally raises unless the frame has exactly two columns, and
# silently depends on their order.
df_users = pd.DataFrame(users, columns=['_id', 'botometer_score_cap_universal'])
df_users = df_users.rename(columns={'_id': 'uid', 'botometer_score_cap_universal': 'universal'})
# Users without a usable score must not influence the percentiles.
df_users = df_users[df_users['universal'].notna()]
print("OK; Cleaning Error Botscores. Total users:", len(df_users))

# Thresholds used for labelling: scores <= p75 count as humans, >= p95 as bots.
p75 = np.percentile(df_users.universal,75)
p95 = np.percentile(df_users.universal,95)
print("Limits [p75, p95]: [" + str(p75) + ", "+ str(p95) + "]")

print('Humans:',len(df_users[df_users.universal<=p75]))
print('Bots:',len(df_users[df_users.universal>=p95]))

In [None]:
####################
# ASSIGN BOT LABEL #
####################

# Store an IS_BOT label on the MongoDB document of every loaded user.
for user in progress_bar(users):
    label = "UNKNOWN"
    score = user.get('botometer_score_cap_universal')
    # pd.notna is False for a missing key (None), a stored null and NaN. Such
    # users stay "UNKNOWN": comparing None with a float would raise TypeError
    # and abort the loop, and NaN would otherwise fall through to
    # "UNDETERMINED" although it was excluded from the percentiles.
    if pd.notna(score):
        if score <= p75:
            label = "HUMAN"
        elif score >= p95:
            label = "BOT"
        else:
            label = "UNDETERMINED"

    try:
        # upsert=False: only existing documents are touched, none are created.
        db[CollectionName].update_one(
                                    {'_id': user['_id']},
                                    {'$set': {'IS_BOT': label}},
                                    upsert=False,
                                  )
    except Exception as e:
        # Best effort: log the failure and carry on with the next user.
        logging.error(e)
        logging.error("Fatal exception inserting users in MongoDB")

In [None]:
##########################
# GET BOT % BY COMMUNITY #
##########################

def load_communities(collection):
    """Count users per (community, IS_BOT) combination.

    Keyword arguments:
    collection -- MongoDB Users' Collection (must expose ``aggregate``)

    Only documents that have a ``community`` field are counted. Returns the
    aggregation result as a list. Progress is printed to stdout.
    """
    has_community = {'$match': {'community': {'$exists': True}}}
    keep_fields = {'$project': {'community': True, 'IS_BOT': True}}
    # One output document per distinct pair, with the number of users in it.
    count_pairs = {
        '$group': {
            '_id': {'community': '$community', 'IS_BOT': '$IS_BOT'},
            'count': {'$sum': 1},
        }
    }
    stages = [has_community, keep_fields, count_pairs]

    print("Query", end=" ")
    cursor = collection.aggregate(stages, allowDiskUse=True)
    print("OK; List", end=" ")
    combos = list(cursor)
    print("OK; Total combinations:", len(combos))
    return combos

communities = load_communities(db[CollectionName])
communities_df = pd.DataFrame(communities)

# Every '_id' is a dict holding the grouping keys; expand it into real columns
# and put the matching count next to them.
communities_df2 = pd.concat([pd.json_normalize(communities_df['_id']), communities_df['count']], axis=1)

totalUsers = communities_df2['count'].sum()

print("TOTAL USERS: " + str(totalUsers))

# NOTE(review): groupby drops rows whose key is missing (NaN) by default, so
# users without an IS_BOT value are counted in TOTAL USERS but not in the
# percentages below -- confirm this is intended.
communities_df3 = communities_df2.groupby(['community', 'IS_BOT']).agg({'count': 'sum'})
# Share of each label inside its community. transform('sum') returns the
# per-community total aligned to the original index, which avoids calling
# float() on a one-element Series and does not depend on how groupby.apply
# labels its result (that differs between pandas versions).
communities_df4 = 100 * communities_df3 / communities_df3.groupby(level=0).transform('sum')

print(communities_df4.sort_values('community'))

In [None]:
#####################################
# GET MATRICULA BOTS % BY COMMUNITY #
#####################################

def load_communities(collection):
    """Count users labelled 'BOT' per (community, IS_MATRICULA) combination.

    Keyword arguments:
    collection -- MongoDB Users' Collection (must expose ``aggregate``)

    Only documents that have a ``community`` field and IS_BOT == 'BOT' are
    counted. Returns the aggregation result as a list. Progress is printed
    to stdout.
    """
    bots_with_community = {
        '$match': {
            'community': {'$exists': True},
            'IS_BOT': 'BOT',
        }
    }
    keep_fields = {'$project': {'community': True, 'IS_MATRICULA': True}}
    # One output document per distinct pair, with the number of users in it.
    count_pairs = {
        '$group': {
            '_id': {'community': '$community', 'IS_MATRICULA': '$IS_MATRICULA'},
            'count': {'$sum': 1},
        }
    }
    stages = [bots_with_community, keep_fields, count_pairs]

    print("Query", end=" ")
    cursor = collection.aggregate(stages, allowDiskUse=True)
    print("OK; List", end=" ")
    combos = list(cursor)
    print("OK; Total combinations:", len(combos))
    return combos

communities = load_communities(db[CollectionName])
communities_df = pd.DataFrame(communities)

# Every '_id' is a dict holding the grouping keys; expand it into real columns
# and put the matching count next to them.
communities_df2 = pd.concat([pd.json_normalize(communities_df['_id']), communities_df['count']], axis=1)

totalUsers = communities_df2['count'].sum()

print("TOTAL USERS: " + str(totalUsers))

# NOTE(review): groupby drops rows whose key is missing (NaN) by default, so
# users without an IS_MATRICULA value are counted in TOTAL USERS but not in
# the percentages below -- confirm this is intended.
communities_df3 = communities_df2.groupby(['community', 'IS_MATRICULA']).agg({'count': 'sum'})
# Share of each IS_MATRICULA value inside its community. transform('sum')
# returns the per-community total aligned to the original index, which avoids
# calling float() on a one-element Series and does not depend on how
# groupby.apply labels its result (that differs between pandas versions).
communities_df4 = 100 * communities_df3 / communities_df3.groupby(level=0).transform('sum')

print(communities_df4.sort_values('community'))

In [None]:
#######################################
# GET MATRICULA BOTS BY YEAR CREATION #
#######################################

def load_communities(collection):
    """Count users labelled 'BOT' per (CREATION_YEAR, IS_MATRICULA) combination.

    Keyword arguments:
    collection -- MongoDB Users' Collection (must expose ``aggregate``)

    Only documents that have a ``CREATION_YEAR`` field and IS_BOT == 'BOT' are
    counted. Returns the aggregation result as a list. Progress is printed
    to stdout.
    """
    bots_with_year = {
        '$match': {
            'CREATION_YEAR': {'$exists': True},
            'IS_BOT': 'BOT',
        }
    }
    keep_fields = {'$project': {'CREATION_YEAR': True, 'IS_MATRICULA': True}}
    # One output document per distinct pair, with the number of users in it.
    count_pairs = {
        '$group': {
            '_id': {'CREATION_YEAR': '$CREATION_YEAR', 'IS_MATRICULA': '$IS_MATRICULA'},
            'count': {'$sum': 1},
        }
    }
    stages = [bots_with_year, keep_fields, count_pairs]

    print("Query", end=" ")
    cursor = collection.aggregate(stages, allowDiskUse=True)
    print("OK; List", end=" ")
    combos = list(cursor)
    print("OK; Total combinations:", len(combos))
    return combos

communities = load_communities(db[CollectionName])
communities_df = pd.DataFrame(communities)

# Expand the '_id' dicts into key columns and attach the counts by row index.
communities_df2 = pd.json_normalize(communities_df['_id']).join(communities_df['count'])

totalUsers = communities_df2['count'].sum()

print(f"TOTAL USERS: {totalUsers!s}")

# Absolute number of users for every (CREATION_YEAR, IS_MATRICULA) pair.
communities_df3 = communities_df2.groupby(['CREATION_YEAR', 'IS_MATRICULA'])[['count']].sum()

print(communities_df3.sort_values('CREATION_YEAR'))

In [None]:
###################################################
# GET MATRICULA BOTS YEAR CREATION % BY COMMUNITY #
###################################################

def load_matricula(collection):
    """Count users with IS_MATRICULA == 'SI' and IS_BOT == 'BOT' per
    (community, CREATION_YEAR) combination.

    Keyword arguments:
    collection -- MongoDB Users' Collection (must expose ``aggregate``)

    Returns the aggregation result as a list. Progress is printed to stdout.
    """
    matricula_bots = {'$match': {'IS_MATRICULA': 'SI', 'IS_BOT': 'BOT'}}
    keep_fields = {'$project': {'community': True, 'CREATION_YEAR': True}}
    # One output document per distinct pair, with the number of users in it.
    count_pairs = {
        '$group': {
            '_id': {'community': '$community', 'CREATION_YEAR': '$CREATION_YEAR'},
            'count': {'$sum': 1},
        }
    }
    stages = [matricula_bots, keep_fields, count_pairs]

    print("Query", end=" ")
    cursor = collection.aggregate(stages, allowDiskUse=True)
    print("OK; List", end=" ")
    combos = list(cursor)
    print("OK; Total combinations:", len(combos))
    return combos

bots = load_matricula(db[CollectionName])
bots_df = pd.DataFrame(bots)

# Expand the '_id' dicts into key columns and attach the counts by row index.
bots_df2 = pd.json_normalize(bots_df['_id']).join(bots_df['count'])

totalBots = bots_df2['count'].sum()

print(f"TOTAL USERS: {totalBots!s}")

# Absolute number of users for every (community, CREATION_YEAR) pair.
bots_df3 = bots_df2.groupby(['community', 'CREATION_YEAR'])[['count']].sum()

# Largest groups first; to_string() prints every row without truncation.
print(bots_df3.sort_values('count', ascending=False).to_string())

In [None]:
##########################################
# GET BOTS FOLLOWER LABEL % BY COMMUNITY #
##########################################

def load_communities(collection):
    """Count users labelled 'BOT' per (community, FOLLOWERS_LABEL) combination.

    Keyword arguments:
    collection -- MongoDB Users' Collection (must expose ``aggregate``)

    Only documents that have a ``community`` field and IS_BOT == 'BOT' are
    counted. Returns the aggregation result as a list. Progress is printed
    to stdout.
    """
    bots_with_community = {
        '$match': {
            'community': {'$exists': True},
            'IS_BOT': 'BOT',
        }
    }
    keep_fields = {'$project': {'community': True, 'FOLLOWERS_LABEL': True}}
    # One output document per distinct pair, with the number of users in it.
    count_pairs = {
        '$group': {
            '_id': {'community': '$community', 'FOLLOWERS_LABEL': '$FOLLOWERS_LABEL'},
            'count': {'$sum': 1},
        }
    }
    stages = [bots_with_community, keep_fields, count_pairs]

    print("Query", end=" ")
    cursor = collection.aggregate(stages, allowDiskUse=True)
    print("OK; List", end=" ")
    combos = list(cursor)
    print("OK; Total combinations:", len(combos))
    return combos

communities = load_communities(db[CollectionName])
communities_df = pd.DataFrame(communities)

# Expand the '_id' dicts into key columns and attach the counts by row index.
communities_df2 = pd.json_normalize(communities_df['_id']).join(communities_df['count'])

totalUsers = communities_df2['count'].sum()

print(f"TOTAL USERS: {totalUsers!s}")

# Absolute number of users for every (community, FOLLOWERS_LABEL) pair.
communities_df3 = communities_df2.groupby(['community', 'FOLLOWERS_LABEL'])[['count']].sum()

print(communities_df3.sort_values('community'))