In [None]:
# data handeling
import pandas as pd
import time
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime
from wordcloud import WordCloud, STOPWORDS 

# tweepy stuff
import tweepy
from AppCred import BEARER_TOKEN, CONSUMER_KEY, CONSUMER_SECRET, ACCESS_TOKEN, ACCESS_TOKEN_SECRET

import requests
import json
from os.path import exists
import itertools

In [None]:
# assign the values accordingly
consumer_key = CONSUMER_KEY
consumer_secret = CONSUMER_SECRET
access_token = ACCESS_TOKEN
access_token_secret = ACCESS_TOKEN_SECRET
  
# authorization of consumer key and consumer secret
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
  
# set access to user's access key and access secret 
auth.set_access_token(access_token, access_token_secret)
  
# calling the api 
api = tweepy.API(auth, wait_on_rate_limit= True)

# calling the client
client = tweepy.Client(bearer_token = BEARER_TOKEN,
                        consumer_key = CONSUMER_KEY,
                        consumer_secret = CONSUMER_SECRET,
                        access_token = ACCESS_TOKEN,
                        access_token_secret = ACCESS_TOKEN_SECRET,
                        #return_type=dict, 
                        wait_on_rate_limit=True) 

In [None]:
def paginate(iterable, page_size):
    while True:
        i1, i2 = itertools.tee(iterable)
        iterable, page = (itertools.islice(i1, page_size, None),
                list(itertools.islice(i2, page_size)))
        if len(page) == 0:
            break
        yield page


In [None]:
# reading in the data set
d_people = pd.read_excel('immersion.xlsx', sheet_name='People', index_col=0)

d_departments = pd.read_excel('immersion.xlsx', sheet_name='Institutes', index_col=0)

screen_names = list(d_people['Twitter_handle']) + list(d_departments['Twitter_handle'])

print(sum([i != '-' for i in screen_names]))


# Scraping Following

In [None]:
if exists('d_following.csv'):
    d_following = pd.read_csv('d_following.csv', index_col=0)
else:
    d_following = pd.DataFrame({'account': [], 'following' : []})
    d_following.to_csv('d_following.csv')

restricted_following = []

for i, screen_name in enumerate(screen_names):
    if screen_name != '-':
        if screen_name not in list(d_following.account):

            name = screen_name.split("@")[1]
            print(f'Scraping information from {name}: {i} of {len(screen_names)}')

            try:
                following_ids = api.get_friend_ids(screen_name=name, count = 5000)

                following_names = []

                for page in paginate(following_ids, 100):
                    try:
                        results = api.lookup_users(user_id=page)
                        for result in results:
                            following_names.append(result.screen_name)
                    except:
                        pass

                account = [screen_name] * len(following_names)

                d_temp = pd.DataFrame({'account': account, 'following' : following_names})

                d_following = pd.concat([d_following, d_temp],axis=0)

                d_following.to_csv('d_following.csv')
                print('saved')

            except:
                restricted_following.append(screen_name)

## Special treatment for club 5000

In [None]:
if exists('d_following5000.csv'):
    d_following5000 = pd.read_csv('d_following5000.csv', index_col=0)
else:
    d_following5000 = pd.DataFrame({'account': [], 'following' : []})
    d_following5000.to_csv('d_following5000.csv')

restricted_followers = []
# [' @rebadlernissen', ' @tiagopeixoto', ' @Jan_Vogler']
for i, screen_name in enumerate([' @rebadlernissen', ' @tiagopeixoto', ' @Jan_Vogler']):
    if screen_name != '-':
        if screen_name not in list(d_following5000.account):
            print(f'Scraping information from {screen_name.split("@")[1]}: {i+1} of {3}')
            ids = []

            for fid in tweepy.Cursor(api.get_friend_ids, screen_name=screen_name.split("@")[1], count=100).pages():
                ids.append(fid)

            l_followers = []
            for id in ids:
                    try:
                        results = api.lookup_users(user_id=id)
                        for result in results:
                            following_names.append(result.screen_name)
                    except:
                        pass
            account = [screen_name] * len(l_followers)
            d_temp = pd.DataFrame({'account': account, 'following' : l_followers})
            d_following5000 = pd.concat([d_following5000, d_temp],axis=0)
            d_following5000.to_csv('d_following5000.csv')
            print('saved')

In [None]:
c5000 = [' @rebadlernissen', ' @tiagopeixoto', ' @Jan_Vogler']

pd.concat([d_following[~d_following.account.isin(c5000)], d_following5000],axis=0).to_csv('d_following_all.csv')

# Scraping Followers

In [None]:
if exists('d_followers.csv'):
    d_followers = pd.read_csv('d_followers.csv', index_col=0)
else:
    d_followers = pd.DataFrame({'account': [], 'followers' : []})
    d_followers.to_csv('d_followers.csv')

restricted_followers = []

for i, screen_name in enumerate(screen_names):
    if screen_name != '-':
        if screen_name not in list(d_followers.account):

            name = screen_name.split("@")[1]
            print(f'Scraping information from {name}: {i} of {len(screen_names)}')

            try:
                follower_ids = api.get_follower_ids(screen_name=name, count = 5000)

                follower_names = []

                for page in paginate(follower_ids, 100):
                    try:
                        results = api.lookup_users(user_id=page)
                        for result in results:
                            follower_names.append(result.screen_name)
                    except:
                        pass

                account = [screen_name] * len(follower_names)

                d_temp = pd.DataFrame({'account': account, 'followers' : follower_names})

                d_followers = pd.concat([d_followers, d_temp],axis=0)

                d_followers.to_csv('d_followers.csv')
                print('saved')
                #time.sleep(60*2.5)

            except:
                restricted_followers.append(screen_name)

In [None]:
d_followers.head()

In [None]:
restricted_followers

In [None]:
d_tweets = pd.DataFrame()
restricted_tweets = []

for i, screen_name in enumerate(screen_names):
    #time.sleep(60*2.5)
    if screen_name != '-':
        print(f'Scraping information from {screen_name}: {i} of {len(screen_names)}')

        try:
            # scraping tweets
            paginator = tweepy.Paginator(
                client.get_users_tweets,                                # The method we want to call 
                client.get_user(username = screen_name.split('@')[1])[0]['id'],   # Arguments passed to the method - the search query
                expansions=['author_id', 'in_reply_to_user_id'], ### get reply from this - add below
                tweet_fields=["public_metrics", "created_at", 'geo', 'context_annotations', 'referenced_tweets'], 
                user_fields=['username', 'location'],
                max_results=100                                         # Arguments passed to the method - how many tweets per page
                #limit=20                                               # Argument passed to the paginator - how many pages to retrieve
                )

            d_tweet = pd.DataFrame()

            for tweet in paginator:
                data = tweet.data
                df_meta = pd.DataFrame(data)

                for i in range(len(df_meta)):
                    if df_meta.text[i][0:2] == 'RT':
                        rt = df_meta.text[i].split(':',1)
                        rt[1] = api.get_status(id=df_meta.id[i], tweet_mode = 'extended')._json['retweeted_status']['full_text']
                        df_meta.text[i] = ': '.join(rt)

                df_public_metrics = pd.DataFrame()
                    # extracting more public metrics (likes, retweets, etc.), which is stored in dictionaries
                for public_metric in df_meta.public_metrics:
                        # storing the public metrics
                    df_public_metrics = pd.concat([df_public_metrics, pd.DataFrame([public_metric])] ,ignore_index=True)
                    # collecting the text and the public metrics
                df_meta = pd.concat([df_meta.text, df_meta.id, df_meta.created_at, df_public_metrics], axis=1)
                    # saving the creator of the tweets
                df_meta = df_meta.assign(account = 'BlokAnders')
                    # storing the information from each paginator
                d_tweet = pd.concat([d_tweet, df_meta], ignore_index=True)
                # storing the information from each account
            d_tweets = pd.concat([d_tweets, d_tweet], ignore_index=True)
        except:
            restricted_tweets.append(screen_name)

# sleeps after 

In [None]:
#d_tweets.to_csv('d_tweets.csv')

In [115]:
d_tweets = pd.DataFrame()

paginator = tweepy.Paginator(
    client.get_users_tweets,                                # The method we want to call 
    client.get_user(username = 'BlokAnders')[0]['id'],   # Arguments passed to the method - the search query
    expansions=['author_id', 'in_reply_to_user_id'], ### get reply from this - add below
    tweet_fields=["public_metrics", "created_at", 'geo', 'context_annotations', 'referenced_tweets'], 
    user_fields=['username', 'location'],
    max_results=100                                         # Arguments passed to the method - how many tweets per page
    #limit=20                                               # Argument passed to the paginator - how many pages to retrieve
    )

d_tweet = pd.DataFrame()

for tweet in paginator:
    data = tweet.data
    df_meta = pd.DataFrame(data)

    for i in range(len(df_meta)):
        if df_meta.text[i][0:2] == 'RT':
            rt = df_meta.text[i].split(':',1)
            rt[1] = api.get_status(id=df_meta.id[i], tweet_mode = 'extended')._json['retweeted_status']['full_text']
            df_meta.text[i] = ': '.join(rt)


    df_public_metrics = pd.DataFrame()
        # extracting more public metrics (likes, retweets, etc.), which is stored in dictionaries
    for public_metric in df_meta.public_metrics:
            # storing the public metrics
        df_public_metrics = pd.concat([df_public_metrics, pd.DataFrame([public_metric])] ,ignore_index=True)
        # collecting the text and the public metrics
    df_meta = pd.concat([df_meta.text, df_meta.id, df_meta.created_at, df_public_metrics], axis=1)
        # saving the creator of the tweets
    df_meta = df_meta.assign(account = 'BlokAnders')
        # storing the information from each paginator
    d_tweet = pd.concat([d_tweet, df_meta], ignore_index=True)
    # storing the information from each account
d_tweets = pd.concat([d_tweets, d_tweet], ignore_index=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_meta.text[i] = ': '.join(rt)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_meta.text[i] = ': '.join(rt)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_meta.text[i] = ': '.join(rt)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_meta.text[i] = ': '.join(rt)
A value is trying to be set on a copy of

In [None]:
# old version

#if exists('d_followers.csv'):
#    d_followers = pd.read_csv('d_followers.csv', index_col=0)
#else:
#    d_followers = pd.DataFrame({'account': [], 'followers' : [], 'locations' : []})
#    d_followers.to_csv('d_followers.csv')
#
#restricted_followers = []
#
#for i, screen_name in enumerate(screen_names):
#    if screen_name != '-':
#        if screen_name not in list(d_followers.account):
#            print(f'Scraping information from {screen_name.split("@")[1]}: {i} of {len(screen_names)}')
#
#            try:
#                ids = []
#
#                for fid in tweepy.Cursor(api.get_followers, screen_name=screen_name.split("@")[1], count=100).pages():
#                    ids.append(fid)
#
#                l_followers = []
#                l_locations = []
#
#                for id in ids:
#                    for i in id:
#                        user_info = jsonify_tweepy(i)
#                        l_followers.append(user_info['screen_name'])
#                        l_locations.append(user_info['location'])
#                account = [screen_name] * len(l_followers)
#
#                d_temp = pd.DataFrame({'account': account, 'followers' : l_followers, 'locations' : l_locations})
#
#                d_followers = pd.concat([d_followers, d_temp],axis=0)
#
#                d_followers.to_csv('d_followers.csv')
#                print('saved')
#                #time.sleep(60*2.5)
#
#            except:
#                restricted_followers.append(screen_name)