In [None]:
import json
import logging
import os
import re
import time
from collections import Counter

import community
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import seaborn as sns
import tweepy
from fastprogress import master_bar, progress_bar
from pymongo import MongoClient

#Logger
# File logging for the whole notebook. logging.basicConfig does nothing when the
# root logger already has handlers, so re-running this cell does not add a second
# file handler.
logging.basicConfig(filename='Anàlisi-GetUsers.log', format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p', level=logging.INFO)

# Console echo of the same records. The handler is tagged with a name so that
# re-executing the cell reuses it instead of stacking one more StreamHandler on
# the root logger (which would print every message once per cell execution).
CONSOLE_HANDLER_NAME = 'getusers-console'
root_logger = logging.getLogger()
ch = next((h for h in root_logger.handlers if h.get_name() == CONSOLE_HANDLER_NAME), None)
if ch is None:
    ch = logging.StreamHandler()
    ch.set_name(CONSOLE_HANDLER_NAME)
    root_logger.addHandler(ch)
ch.setLevel(logging.INFO)
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
ch.setFormatter(formatter)

In [None]:
##############
# PARAMETERS #
##############

# Credentials are read from the environment so real secrets never have to be
# written into the notebook. The literal placeholders stay as the fallback, so
# the cell behaves exactly as before when the variables are not set.
client = MongoClient(
    username=os.environ.get('MONGO_USERNAME', 'XXX'),
    password=os.environ.get('MONGO_PASSWORD', 'XXX'),
)

# Database and tweet collection queried by the cells below
DatabaseName = "Streaming"
CollectionName = "Hashtags"

In [None]:
###########################################
# GET ACTIVE USERS FROM TWEET COLLECTION  #
###########################################

def get_active_users(tweet_collection):
    """
    Returns one document per tweet author found in the collection.

    Tweets are grouped by the embedded 'user.id'; every other profile field is
    picked with the '$first' or '$last' accumulator listed below.

    Keyword arguments:
    tweet_collection -- MongoDB Tweets' Collection
    """
    source = '$user.'

    # (output field, accumulator) pairs, kept in the order the fields must
    # appear in the grouped documents
    accumulators = (
        ('screen_name', '$last'),
        ('name', '$last'),
        ('verified', '$first'),
        ('followers_count', '$last'),
        ('friends_count', '$last'),
        ('created_at', '$first'),
        ('location', '$last'),
        ('description', '$last'),
    )

    grouping = {'_id': source + 'id'}
    for field, operator in accumulators:
        grouping[field] = {operator: source + field}

    stages = [
        {'$project': {'user': 1}},   # keep only the embedded user object
        {'$group': grouping},
    ]

    cursor = tweet_collection.aggregate(stages, allowDiskUse=True)
    users = list(cursor)
    print(len(users),'users with tweets extracted!')
    return users

# Open the configured database and collect the authors of every stored tweet
db = client.get_database(DatabaseName)
active_users = get_active_users(db.get_collection(CollectionName))

In [None]:
###############################
# INSERT USERS TO MONGO DB #
###############################

# Target collection for the users held in active_users
new_collection = db['Previous-Users']

for user in progress_bar(active_users):
    try:
        # Upsert keyed on the document _id: $setOnInsert only writes when the
        # document is created, so users already stored are left untouched.
        new_collection.update_one(
                                    {'_id': user['_id']},
                                    {'$setOnInsert': user},
                                    upsert=True
                                  )
    except Exception:
        # Best effort: record which user failed, together with the traceback,
        # and carry on with the remaining users instead of aborting the load.
        logging.exception("Fatal exception inserting users in MongoDB (user _id=%r)", user.get('_id'))

In [None]:
############################################
# GET PASSIVE USERS FROM TWEET COLLECTION  #
############################################

def get_passive_users_RT(tweet_collection):
    """
    Returns one document per user embedded in the 'retweeted_status' of a tweet.

    Only tweets that have a 'retweeted_status' field are considered; they are
    grouped by 'retweeted_status.user.id'.

    Keyword arguments:
    tweet_collection -- MongoDB Tweets' Collection
    """
    source = '$retweeted_status.user.'

    # (output field, accumulator) pairs, kept in the order the fields must
    # appear in the grouped documents
    accumulators = (
        ('screen_name', '$last'),
        ('name', '$last'),
        ('verified', '$first'),
        ('followers_count', '$last'),
        ('friends_count', '$last'),
        ('created_at', '$first'),
        ('location', '$last'),
        ('description', '$last'),
    )

    grouping = {'_id': source + 'id'}
    for field, operator in accumulators:
        grouping[field] = {operator: source + field}

    stages = [
        {'$match': {'retweeted_status': {'$exists': True}}},
        {'$project': {'retweeted_status.user': 1}},
        {'$group': grouping},
    ]

    cursor = tweet_collection.aggregate(stages, allowDiskUse=True)
    users = list(cursor)
    print(len(users),'users with RT tweets extracted!')
    return users

def get_passive_users_quote(tweet_collection):
    """
    Returns one document per user embedded in the 'quoted_status' of a tweet.

    Only tweets that have a 'quoted_status' field are considered; they are
    grouped by 'quoted_status.user.id'.

    Keyword arguments:
    tweet_collection -- MongoDB Tweets' Collection
    """
    source = '$quoted_status.user.'

    # (output field, accumulator) pairs, kept in the order the fields must
    # appear in the grouped documents
    accumulators = (
        ('screen_name', '$last'),
        ('name', '$last'),
        ('verified', '$first'),
        ('followers_count', '$last'),
        ('friends_count', '$last'),
        ('created_at', '$first'),
        ('location', '$last'),
        ('description', '$last'),
    )

    grouping = {'_id': source + 'id'}
    for field, operator in accumulators:
        grouping[field] = {operator: source + field}

    stages = [
        {'$match': {'quoted_status': {'$exists': True}}},
        {'$project': {'quoted_status.user': 1}},
        {'$group': grouping},
    ]

    cursor = tweet_collection.aggregate(stages, allowDiskUse=True)
    users = list(cursor)
    print(len(users),'users with quote tweets extracted!')
    return users

def get_passive_users_reply(tweet_collection):
    """
    Returns one document per user that received a reply.

    The previous pipeline only had a '$match' stage, so it returned whole reply
    tweets (whose '_id' is the tweet document id) instead of users. The tweets
    are now grouped by the replied-to account.

    In the Twitter v1.1 tweet payload a reply carries the replied-to account
    only as 'in_reply_to_user_id' and 'in_reply_to_screen_name', so the
    returned documents contain just '_id' and 'screen_name'.

    Keyword arguments:
    tweet_collection -- MongoDB Tweets' Collection
    """

    pipeline = [
    {
        '$match': {
            'in_reply_to_status_id': {
                "$ne": None
            },
            # a reply without a target user id cannot be keyed as a user
            'in_reply_to_user_id': {
                "$ne": None
            }
        }
    },
    {
        '$project': {
            'in_reply_to_user_id': 1,
            'in_reply_to_screen_name': 1
        }
    },
    {
        '$group': {
            '_id': '$in_reply_to_user_id',
            'screen_name': {
                '$last': '$in_reply_to_screen_name'
            }
        }
    }
    ]

    users = list(tweet_collection.aggregate(pipeline,allowDiskUse=True))
    print(len(users),'users with reply tweets extracted!')
    return users

# Run the three extractors in order (retweets, quotes, replies) against the
# tweet collection, then merge their results into a single list
passive_users_RT, passive_users_quote, passive_users_reply = (
    extract(db[CollectionName])
    for extract in (get_passive_users_RT, get_passive_users_quote, get_passive_users_reply)
)
passive_users = [*passive_users_RT, *passive_users_quote, *passive_users_reply]

In [None]:
################################################
# INSERT USERS INTO MONGODB IF NOT PRESENT YET #
################################################

# Target collection for the users held in passive_users
# NOTE(review): another cell of this notebook writes active_users to
# 'Previous-Users' while these go to 'Users' — confirm the split is intended.
new_collection = db['Users']

for user in progress_bar(passive_users):
    try:
        # Upsert keyed on the document _id: $setOnInsert only writes when the
        # document is created, so users already stored are left untouched and
        # duplicates inside passive_users are harmless.
        new_collection.update_one(
                                    {'_id': user['_id']},
                                    {'$setOnInsert': user},
                                    upsert=True
                                  )
    except Exception:
        # Best effort: record which user failed, together with the traceback,
        # and carry on with the remaining users instead of aborting the load.
        logging.exception("Fatal exception inserting users in MongoDB (user _id=%r)", user.get('_id'))