In [None]:
import tweepy
import time
import os
import json
import pandas as pd
import numpy as np
import csv
from tqdm import tqdm
import pickle
from textblob import TextBlob
from langdetect import detect
import re
from nltk.corpus import stopwords

In [None]:
def get_following_df_from_files(accounts_path):
    """Load every ``following.json`` found one level below ``accounts_path``.

    Each entry returned by ``os.listdir(accounts_path)`` is treated as an
    account folder; ``<accounts_path>/<entry>/following.json`` is parsed with
    ``pd.read_json``.

    Args:
        accounts_path: Directory whose entries are per-account folders.

    Returns:
        A list of DataFrames, one per account whose file could be read.
        Accounts whose file is missing or unreadable are reported with
        ``print`` and left out of the list.
    """
    frames = []
    for entry in os.listdir(accounts_path):
        following_file = os.path.join(accounts_path, entry, 'following.json')
        try:
            frames.append(pd.read_json(following_file))
        except Exception as err:
            # Best effort: report the problem and move on to the next account.
            print(err)
    return frames

def load_trained_model(path: str):
    """Unpickle and return the object stored at ``path``.

    The file is opened in a ``with`` block so the handle is closed even when
    unpickling fails.

    Args:
        path: Location of the pickle file.

    Returns:
        Whatever object the pickle file contains.

    Raises:
        OSError: If the file cannot be opened.
        pickle.UnpicklingError: If the content is not a valid pickle.
    """
    # SECURITY: pickle.load can execute arbitrary code; only load trusted files.
    with open(path, 'rb') as model_file:
        return pickle.load(model_file)

def clean_text(text: str):
    """Strip Twitter-specific noise from ``text`` and return the result.

    The patterns are removed one after another, in this order: @mentions,
    the ``#`` symbol (the hashtag word itself is kept), ``RT`` followed by
    whitespace, and http/https URLs.
    """
    # NOTE(review): the mention pattern has no underscore in its character
    # class, so '@foo_bar' leaves '_bar' behind. Left as-is here.
    removal_patterns = (
        r'@[A-Za-z0-9]+',      # @ mention
        r'#',                  # '#' symbol only
        r'RT[\s]+',            # RT followed by space
        r'https?:\/\/\S+',     # http / https links
    )
    for pattern in removal_patterns:
        text = re.sub(pattern, '', text)
    return text

def filter_en_accounts(text: str):
    """Return True when ``text`` should be kept as an English description.

    Falsy text (empty string, ``None``) is kept, and so is any text for which
    ``detect`` raises an ordinary exception. Only text positively detected as
    a language other than ``'en'`` is rejected.

    Args:
        text: Description to run through language detection.

    Returns:
        ``True`` to keep the account, ``False`` when a non-English language
        was detected.
    """
    try:
        if text:
            return detect(text) == 'en'
        return True
    except Exception:
        # Narrower than a bare ``except`` so that KeyboardInterrupt and
        # SystemExit still stop a long-running ``apply``.
        return True

def filter_already_downloaded_accounts(acc_name: str, all_acc):
    """Return True when ``acc_name`` is not contained in ``all_acc``.

    Intended as a row predicate: ``True`` keeps the account, ``False`` drops it.
    """
    already_downloaded = acc_name in all_acc
    return not already_downloaded
            
def get_tweets_for_all_accounts(api, anti_save_dir, pro_save_dir, neutral_save_dir, accounts,
                                vectorizer_path=r'C:\Users\psrub\Documents\Python\Twitter\models\vectorizer_1000_tweets_final_updated_after_1_iteration.h5',
                                svm_path=r'C:\Users\psrub\Documents\Python\Twitter\models\svm_1000_tweets_final_updated_after_1_iteration.h5'):
    """Download, classify and store the tweets of every account in ``accounts``.

    For each account the tweets are fetched with ``get_account_tweets``, stop
    words are removed from ``clean_text``, the texts are joined into a single
    document, and that document is vectorized and classified. The full tweet
    frame is then written to ``<class dir>/<account>/tweets.json``, where the
    class directory depends on the predicted class:
    0 -> ``anti_save_dir``, 1 -> ``neutral_save_dir``, 2 -> ``pro_save_dir``.

    Accounts are skipped when:
      * ``tweets.json`` already exists for them in any of the three directories,
      * no tweets could be downloaded,
      * ``'en'`` is not among the most frequent tweet languages,
      * the classifier returns a class other than 0, 1 or 2.

    Any exception raised while processing one account is printed and the loop
    moves on to the next account.

    Args:
        api: Client object passed through to ``get_account_tweets``.
        anti_save_dir: Output directory for class 0 accounts.
        pro_save_dir: Output directory for class 2 accounts.
        neutral_save_dir: Output directory for class 1 accounts.
        accounts: Iterable of screen names.
        vectorizer_path: Pickle file of the fitted vectorizer.
        svm_path: Pickle file of the fitted classifier.

    Returns:
        None. Results are written to disk.
    """
    # A set gives constant-time membership tests in the per-word filter below.
    english_stopwords = set(stopwords.words("english"))
    pretrained_vectorizer = load_trained_model(vectorizer_path)
    pretrained_svm = load_trained_model(svm_path)

    # predicted class -> (label used in the log message, target directory)
    class_targets = {
        0: ('anti', anti_save_dir),
        1: ('neutral', neutral_save_dir),
        2: ('pro', pro_save_dir),
    }

    for acc in tqdm(accounts):

        already_saved = any(
            os.path.exists(os.path.join(base_dir, acc, 'tweets.json'))
            for base_dir in (anti_save_dir, pro_save_dir, neutral_save_dir)
        )
        if already_saved:
            print(f'Account: {acc} skipped - already downloaded')
            continue

        try:
            df = get_account_tweets(api, acc)
            if df.empty:
                # get_account_tweets returns an empty frame when the download
                # failed; report that instead of a KeyError on 'clean_text'.
                print(f'Account: {acc} skipped - no tweets downloaded')
                continue

            df['filtered_text'] = df['clean_text'].apply(
                lambda x: ' '.join(word for word in x.split() if word not in english_stopwords))

            # mode() may return several values on a tie; accept the account
            # when English is one of them.
            if 'en' not in set(df['language'].mode()):
                print(f'Account: {acc} skipped as not english language')
                continue

            account_text = ' '.join(df['filtered_text'])
            tweets_vectorized = pretrained_vectorizer.transform([account_text])
            predicted_class = pretrained_svm.predict(tweets_vectorized)[0]

            if predicted_class not in class_targets:
                print(f'Account: {acc} skipped - unexpected class {predicted_class}')
                continue

            label, target_dir = class_targets[predicted_class]
            print(f'Account: {acc} was clasified as {label} hub')
            save_path = os.path.join(target_dir, acc)
            # exist_ok: a directory left behind without tweets.json (or a
            # missing parent directory) must not prevent saving.
            os.makedirs(save_path, exist_ok=True)
            df.to_json(os.path.join(save_path, 'tweets.json'))

        except Exception as ex:
            print(ex)
            continue
            

def get_account_tweets(api, acc_name: str, max_tweets=1000):
    """Download up to ``max_tweets`` timeline tweets of ``acc_name``.

    The timeline is requested in pages of 200 through ``api.user_timeline``
    (retweets included, extended tweet mode). Each following page is requested
    with ``max_id`` set just below the oldest tweet id of the previous page.
    Paging stops when a page comes back empty or ``max_tweets`` is reached;
    surplus tweets from the last page are dropped.

    Args:
        api: Object exposing ``user_timeline``.
        acc_name: Screen name of the account.
        max_tweets: Upper bound on the number of tweets returned.

    Returns:
        DataFrame with columns ``id``, ``created_at``, ``favorite_count``,
        ``retweet_count``, ``language``, ``text``, ``clean_text`` and
        ``account``. If anything raises, the exception is printed and a
        DataFrame without columns is returned.
    """
    try:
        all_tweets = []
        max_id = None
        while len(all_tweets) < max_tweets:
            page_kwargs = dict(screen_name=acc_name, count=200, include_rts=True, tweet_mode='extended')
            if max_id is not None:
                page_kwargs['max_id'] = max_id
            page = api.user_timeline(**page_kwargs)
            if not page:
                break
            all_tweets.extend(page)
            # Next page must start strictly below the oldest tweet seen so far.
            max_id = page[-1].id - 1

        # The last page may overshoot the limit; keep only the newest max_tweets.
        del all_tweets[max_tweets:]

        final_tweets = [
            [tweet.id_str, tweet.created_at, tweet.favorite_count, tweet.retweet_count, tweet.lang, tweet.full_text]
            for tweet in all_tweets
        ]

        print(f'Downloaded {len(final_tweets)} tweets for account {acc_name}')
        df = pd.DataFrame(final_tweets, columns=["id", "created_at", "favorite_count", "retweet_count", "language", "text"])
        df['clean_text'] = df['text'].apply(clean_text)
        df['account'] = acc_name
        return df

    except Exception as ex:
        print(ex)
        return pd.DataFrame()
    
    
def itemgetter(*items):
    """Build a callable that looks up ``items`` on the object it is given.

    With exactly one item the callable returns ``obj[item]``. With any other
    number of items (including none) it returns a tuple holding ``obj[item]``
    for each item, in the order given.
    """
    if len(items) != 1:
        def g(obj):
            return tuple([obj[key] for key in items])
        return g

    single = items[0]

    def g(obj):
        return obj[single]
    return g

def get_following_ids(api, screen_name):
    """Collect the ids yielded by a ``tweepy.Cursor`` over ``api.friends_ids``.

    Args:
        api: Object exposing ``friends_ids``.
        screen_name: Screen name whose followings are requested.

    Returns:
        List of the ids yielded by the cursor. If a ``tweepy.TweepError`` is
        raised the error is printed and the ids collected up to that point are
        returned.
    """
    print('Getting Following ids of', screen_name)
    collected_ids = []
    try:
        id_cursor = tweepy.Cursor(
            api.friends_ids,
            screen_name=screen_name,
            wait_on_rate_limit=True,
            count=5000,
            lang=['en'],
        )
        # Append one id at a time so a mid-iteration error keeps earlier ids.
        for friend_id in id_cursor.items():
            collected_ids.append(friend_id)
    except tweepy.TweepError as err:
        print(err)

    print(f'Fetched number of following ids for {screen_name} : {len(collected_ids)}')
    return collected_ids

def get_followers_ids(screen_name, max_number):
    """Collect at most ``max_number`` ids from a cursor over ``api.followers_ids``.

    Uses the module-level ``api`` object (this function takes no ``api``
    parameter). A single cursor is created and ``max_number`` is passed as
    the item limit, so ids are not fetched twice and an account without
    followers cannot make the function loop forever.

    Args:
        screen_name: Screen name whose followers are requested.
        max_number: Maximum number of ids to return; values <= 0 return an
            empty list without calling the API.

    Returns:
        List of ids. If a ``tweepy.TweepError`` is raised, the error is
        printed, the function sleeps for 60 seconds and then returns the ids
        collected so far.
    """
    print('Getting Followers ids of', screen_name)
    followers_ids = []
    if max_number <= 0:
        # A limit of 0 means "no limit" to Cursor.items, so guard it here.
        print(f'Fetched number of followers ids for {screen_name} : {len(followers_ids)}')
        return followers_ids

    try:
        users_id = tweepy.Cursor(api.followers_ids, screen_name = screen_name, wait_on_rate_limit = True, count = 5000, lang=['en'])
        for user_id in users_id.items(max_number):
            followers_ids.append(user_id)

    except tweepy.TweepError as e:
        print('Going to sleep: ', e)
        time.sleep(60)

    print(f'Fetched number of followers ids for {screen_name} : {len(followers_ids)}')
    return followers_ids


def get_following(screen_name):
    """Collect the user objects yielded by a cursor over ``api.friends``.

    Uses the module-level ``api`` object (this function takes no ``api``
    parameter). The ``try`` block wraps the cursor iteration, which is where
    API errors are raised, rather than only the ``append`` call.

    Args:
        screen_name: Screen name whose followings are requested.

    Returns:
        List of the objects yielded by the cursor. If a ``tweepy.TweepError``
        is raised, the error is printed, the function sleeps for 60 seconds
        and then returns the users collected so far.
    """
    # NOTE(review): count=5000 is kept from the original; confirm the page
    # size accepted by the friends/list endpoint.
    print('Getting Following list of ', screen_name)
    following = []
    try:
        users = tweepy.Cursor(api.friends, screen_name = screen_name, wait_on_rate_limit = True, count=5000, lang=['en'])
        for user in users.items():
            following.append(user)
    except tweepy.TweepError as e:
        print("Going to sleep:", e)
        time.sleep(60)
    print(f'Fetched number of followings for {screen_name} : {len(following)}')
    return following


def _user_record(user):
    """Return the profile fields of a user object that are stored on disk."""
    return {
        'ID': user.id,
        'Screen_Name': user.screen_name,
        'Description': user.description,
        'StatusesCount': user.statuses_count,
        'Follower_Count': user.followers_count,
        'Following_Count': user.friends_count,
    }


def get_data(api, save_dir, accounts, max_followings=20000):
    """Store profile data and the followings list of each account.

    For every account two files are maintained in ``<save_dir>/<account>/``:

    * ``account.json`` - the account's own profile record. If the file already
      exists it is loaded instead of being fetched again, so a run that was
      interrupted before ``following.json`` was written can be resumed.
    * ``following.json`` - a list of profile records, one per followed user,
      obtained via ``get_following_ids`` and ``api.lookup_users`` in chunks of
      100 ids.

    Accounts are skipped (with a printed message) when the profile cannot be
    obtained, when ``Following_Count`` exceeds ``max_followings``, or when
    ``following.json`` already exists. A failing chunk or user record is
    printed and skipped; the remaining ones are still processed.

    Args:
        api: Object exposing ``get_user`` and ``lookup_users``.
        save_dir: Directory that receives one sub-directory per account.
        accounts: Iterable of screen names.
        max_followings: Accounts following more users than this are not
            expanded.

    Returns:
        None. Results are written to disk.
    """
    for acc in tqdm(accounts):

        save_path = os.path.join(save_dir, acc)
        account_file = os.path.join(save_path, 'account.json')
        following_file = os.path.join(save_path, 'following.json')

        # Also creates save_dir itself when it does not exist yet.
        os.makedirs(save_path, exist_ok=True)

        # ACCOUNT DATA
        try:
            if os.path.exists(account_file):
                with open(account_file, 'r') as f:
                    account_record = json.load(f)
                account_cached = True
            else:
                acc_data = api.get_user(screen_name = acc, wait_on_rate_limit = True)
                account_record = _user_record(acc_data)
                account_cached = False
        except Exception as ex:
            print(ex)
            continue

        if not account_cached:
            with open(account_file, 'w') as f:
                json.dump(account_record, f)

        if account_record['Following_Count'] > max_followings:
            print(f"Skipped {acc} because too many followings - {account_record['Following_Count']}")
            continue

        if os.path.exists(following_file):
            print(f'Account: {acc} skipped - already downloaded')
            continue

        # FOLLOWING DATA
        data = []
        following_ids = get_following_ids(api, acc)
        for i in range(0, len(following_ids), 100):
            try:
                chunk = following_ids[i:i+100]
                users_chunk = api.lookup_users(user_ids=chunk)
                for user in users_chunk:
                    try:
                        data.append(_user_record(user))
                    except Exception as ex:
                        print(ex)
                        continue
            except Exception as e:
                print(e)
                print('Something went wrong, skipping...')
                continue

        with open(following_file, 'w') as f:
            json.dump(data, f)

In [None]:
base_anti_hubs_followings_df = get_following_df_from_files(anti_accounts_path)
hand_picked_anti_hubs_followings_df = get_following_df_from_files(hand_picked_new_anti_hubs_path)

base_pro_hubs_followings_df = get_following_df_from_files(pro_accounts_path)
hand_picked_pro_hubs_followings_df = get_following_df_from_files(hand_picked_new_pro_hubs_path)

In [None]:
all_followings = base_anti_hubs_followings_df + hand_picked_anti_hubs_followings_df + base_pro_hubs_followings_df + hand_picked_pro_hubs_followings_df
all_followings = [acc for acc in all_followings if acc.shape[0] < 5000]
all_followings_df = pd.concat(all_followings, ignore_index=True)

In [None]:
all_followings_df

In [None]:
# NOTE: `all_accounts` (the screen names that are already downloaded) must be
# defined before this cell runs.
# drop duplicates
all_followings_df = all_followings_df.drop_duplicates(subset = ["Screen_Name"])
# filter only for hubs; .copy() so the column assignments below work on an
# independent frame instead of a slice of all_followings_df
only_hubs_df = all_followings_df[(all_followings_df['Follower_Count'] > 100000) & (all_followings_df['Follower_Count'] < 10000000) & (all_followings_df['StatusesCount'] > 2000)].copy()
# clean description for language detection
only_hubs_df['Description'] = only_hubs_df['Description'].apply(clean_text)
# regex=True: the pattern is a character class, not a literal string
only_hubs_df['Description'] = only_hubs_df.Description.str.replace('[^a-zA-Z0-9]', ' ', regex=True)
# take only en accounts
only_hubs_df = only_hubs_df[only_hubs_df['Description'].apply(filter_en_accounts)]
# remove already downloaded accounts (the helper requires `all_acc`)
only_hubs_df = only_hubs_df[only_hubs_df['Screen_Name'].apply(filter_already_downloaded_accounts, all_acc=all_accounts)]

In [None]:
only_hubs_df

In [None]:
new_hubs_accounts = only_hubs_df['Screen_Name'].tolist()

In [None]:
anti_save_dir_path = r'C:\Users\psrub\Documents\Python\Twitter\new_anti_hubs_auto_1'
pro_save_dir_path = r'C:\Users\psrub\Documents\Python\Twitter\new_pro_hubs_auto_1'

In [None]:
# Twitter API credentials are read from the environment instead of being
# stored in the notebook. A missing variable raises KeyError here.
# SECURITY: the keys that used to be hard-coded in this cell were exposed in
# the notebook and should be revoked and regenerated.
consumerKey = os.environ['TWITTER_CONSUMER_KEY']
consumerSecret = os.environ['TWITTER_CONSUMER_SECRET']
accessToken = os.environ['TWITTER_ACCESS_TOKEN']
accessTokenSecret = os.environ['TWITTER_ACCESS_TOKEN_SECRET']

auth = tweepy.OAuthHandler(consumerKey, consumerSecret)
auth.set_access_token(accessToken, accessTokenSecret)
# NOTE(review): wait_on_rate_limit is False here, while several helper
# functions pass wait_on_rate_limit=True per call - confirm this is intended.
api = tweepy.API(auth, wait_on_rate_limit=False, wait_on_rate_limit_notify=True)

In [None]:
# get_tweets_for_all_accounts(api, anti_save_dir, pro_save_dir, neutral_save_dir, accounts)
# needs the API client and a directory for neutral accounts as well.
neutral_save_dir_path = r'C:\Users\psrub\Documents\Python\Twitter\all_neutral_accounts'
get_tweets_for_all_accounts(api, anti_save_dir_path, pro_save_dir_path, neutral_save_dir_path, new_hubs_accounts)

In [None]:
# Now download the followings of the new pro and anti hubs, then start the next iteration.

In [None]:
new_anti_accounts = os.listdir(anti_save_dir_path)
# get_data takes the API client as its first argument
get_data(api, anti_save_dir_path, new_anti_accounts)

In [None]:
new_pro_accounts = os.listdir(pro_save_dir_path)
# get_data takes the API client as its first argument
get_data(api, pro_save_dir_path, new_pro_accounts)

In [None]:
all_accounts += new_pro_accounts
all_accounts += new_anti_accounts

In [None]:
len(all_accounts)

In [None]:
anti_hubs_followings_df = get_following_df_from_files(anti_save_dir_path)
pro_hubs_followings_df = get_following_df_from_files(pro_save_dir_path)

In [None]:
all_followings = anti_hubs_followings_df + pro_hubs_followings_df
all_followings = [acc for acc in all_followings if acc.shape[0] < 5000]
all_followings_df = pd.concat(all_followings, ignore_index=True)

In [None]:
all_followings_df

In [None]:
# drop duplicates
all_followings_df = all_followings_df.drop_duplicates(subset = ["Screen_Name"])
# filter only for hubs; .copy() so the column assignments below work on an
# independent frame instead of a slice of all_followings_df
only_hubs_df = all_followings_df[(all_followings_df['Follower_Count'] > 100000) & (all_followings_df['Follower_Count'] < 10000000) & (all_followings_df['StatusesCount'] > 2000)].copy()
# clean description for language detection
only_hubs_df['Description'] = only_hubs_df['Description'].apply(clean_text)
# regex=True: the pattern is a character class, not a literal string
only_hubs_df['Description'] = only_hubs_df.Description.str.replace('[^a-zA-Z0-9]', ' ', regex=True)
# take only en accounts
only_hubs_df = only_hubs_df[only_hubs_df['Description'].apply(filter_en_accounts)]
# remove already downloaded accounts (the helper requires `all_acc`)
only_hubs_df = only_hubs_df[only_hubs_df['Screen_Name'].apply(filter_already_downloaded_accounts, all_acc=all_accounts)]

In [None]:
only_hubs_df

In [None]:
new_hubs_accounts = only_hubs_df['Screen_Name'].tolist()

In [None]:
anti_save_dir_path = r'C:\Users\psrub\Documents\Python\Twitter\new_anti_hubs_auto_2'
pro_save_dir_path = r'C:\Users\psrub\Documents\Python\Twitter\new_pro_hubs_auto_2'
neutral_save_dir_path = r'C:\Users\psrub\Documents\Python\Twitter\all_neutral_accounts'

In [None]:
# Twitter API credentials are read from the environment instead of being
# stored in the notebook. A missing variable raises KeyError here.
# SECURITY: the keys that used to be hard-coded in this cell were exposed in
# the notebook and should be revoked and regenerated.
consumerKey = os.environ['TWITTER_CONSUMER_KEY']
consumerSecret = os.environ['TWITTER_CONSUMER_SECRET']
accessToken = os.environ['TWITTER_ACCESS_TOKEN']
accessTokenSecret = os.environ['TWITTER_ACCESS_TOKEN_SECRET']

auth = tweepy.OAuthHandler(consumerKey, consumerSecret)
auth.set_access_token(accessToken, accessTokenSecret)
# NOTE(review): wait_on_rate_limit is False here, while several helper
# functions pass wait_on_rate_limit=True per call - confirm this is intended.
api = tweepy.API(auth, wait_on_rate_limit=False, wait_on_rate_limit_notify=True)

In [None]:
# the API client is the first argument of get_tweets_for_all_accounts
get_tweets_for_all_accounts(api, anti_save_dir_path, pro_save_dir_path, neutral_save_dir_path, new_hubs_accounts)

In [None]:
new_anti_accounts = os.listdir(anti_save_dir_path)
# get_data takes the API client as its first argument
get_data(api, anti_save_dir_path, new_anti_accounts)

In [None]:
new_pro_accounts = os.listdir(pro_save_dir_path)
# get_data takes the API client as its first argument
get_data(api, pro_save_dir_path, new_pro_accounts)

In [None]:
new_neutral_accounts = os.listdir(neutral_save_dir_path)

In [None]:
all_accounts += new_pro_accounts
all_accounts += new_anti_accounts
all_accounts += new_neutral_accounts

In [None]:
len(all_accounts)

In [None]:
anti_hubs_followings_df = get_following_df_from_files(anti_save_dir_path)
pro_hubs_followings_df = get_following_df_from_files(pro_save_dir_path)

In [None]:
all_followings = anti_hubs_followings_df + pro_hubs_followings_df
all_followings = [acc for acc in all_followings if acc.shape[0] < 5000]
all_followings_df = pd.concat(all_followings, ignore_index=True)

In [None]:
all_followings_df

In [None]:
# drop duplicates
all_followings_df = all_followings_df.drop_duplicates(subset = ["Screen_Name"])
# filter only for hubs; .copy() so the column assignments below work on an
# independent frame instead of a slice of all_followings_df
only_hubs_df = all_followings_df[(all_followings_df['Follower_Count'] > 100000) & (all_followings_df['Follower_Count'] < 10000000) & (all_followings_df['StatusesCount'] > 2000)].copy()
# clean description for language detection
only_hubs_df['Description'] = only_hubs_df['Description'].apply(clean_text)
# regex=True: the pattern is a character class, not a literal string
only_hubs_df['Description'] = only_hubs_df.Description.str.replace('[^a-zA-Z0-9]', ' ', regex=True)
# take only en accounts
only_hubs_df = only_hubs_df[only_hubs_df['Description'].apply(filter_en_accounts)]
# remove already downloaded accounts (the helper requires `all_acc`)
only_hubs_df = only_hubs_df[only_hubs_df['Screen_Name'].apply(filter_already_downloaded_accounts, all_acc=all_accounts)]

In [None]:
only_hubs_df

In [None]:
new_hubs_accounts = only_hubs_df['Screen_Name'].tolist()

In [None]:
anti_save_dir_path = r'C:\Users\psrub\Documents\Python\Twitter\new_anti_hubs_auto_3'
pro_save_dir_path = r'C:\Users\psrub\Documents\Python\Twitter\new_pro_hubs_auto_3'
neutral_save_dir_path = r'C:\Users\psrub\Documents\Python\Twitter\all_neutral_accounts'

In [None]:
# the API client is the first argument of get_tweets_for_all_accounts
get_tweets_for_all_accounts(api, anti_save_dir_path, pro_save_dir_path, neutral_save_dir_path, new_hubs_accounts)

In [None]:
new_anti_accounts = os.listdir(anti_save_dir_path)
# get_data takes the API client as its first argument
get_data(api, anti_save_dir_path, new_anti_accounts)

In [None]:
new_pro_accounts = os.listdir(pro_save_dir_path)
# get_data takes the API client as its first argument
get_data(api, pro_save_dir_path, new_pro_accounts)

In [None]:
## all downloaded accounts

# ANTI = BASE HUBS + HAND PICKED HUBS
anti_accounts_path = r'C:\Users\psrub\Documents\Python\Twitter\anti_scientific_data_2'
hand_picked_new_anti_hubs_path = r'C:\Users\psrub\Documents\Python\Twitter\hand_picked_new_anti_hubs'

# PRO = BASE HUBS + HAND PICKED HUBS
pro_accounts_path = r'C:\Users\psrub\Documents\Python\Twitter\pro_scientific_data_2'
hand_picked_new_pro_hubs_path = r'C:\Users\psrub\Documents\Python\Twitter\hand_picked_new_pro_hubs'

with open(r'C:\Users\psrub\Documents\Python\Twitter\files\neutral_accounts.txt', 'r') as f:
    base_neutral_accounts = f.read().split(',')
new_neutral_accounts_path = r'C:\Users\psrub\Documents\Python\Twitter\all_neutral_accounts'

anti_save_dir_path_1 = r'C:\Users\psrub\Documents\Python\Twitter\new_anti_hubs_auto_1'
pro_save_dir_path_1 = r'C:\Users\psrub\Documents\Python\Twitter\new_pro_hubs_auto_1'

anti_save_dir_path_2 = r'C:\Users\psrub\Documents\Python\Twitter\new_anti_hubs_auto_2'
pro_save_dir_path_2 = r'C:\Users\psrub\Documents\Python\Twitter\new_pro_hubs_auto_2'

anti_save_dir_path_3 = r'C:\Users\psrub\Documents\Python\Twitter\new_anti_hubs_auto_3'
pro_save_dir_path_3= r'C:\Users\psrub\Documents\Python\Twitter\new_pro_hubs_auto_3'



# all base accounts which are already downloaded

all_accounts = os.listdir(anti_accounts_path) + os.listdir(hand_picked_new_anti_hubs_path) + os.listdir(pro_accounts_path) \
+ os.listdir(hand_picked_new_pro_hubs_path) + os.listdir(new_neutral_accounts_path) + base_neutral_accounts \
+ os.listdir(anti_save_dir_path_1) + os.listdir(pro_save_dir_path_1) + os.listdir(anti_save_dir_path_2) \
+ os.listdir(pro_save_dir_path_2) + os.listdir(anti_save_dir_path_3) + os.listdir(pro_save_dir_path_3)

all_accounts = [re.sub(r'@','', acc) for acc in all_accounts]

print(len(all_accounts))

In [None]:
anti_hubs_followings_df = get_following_df_from_files(anti_save_dir_path_3)
pro_hubs_followings_df = get_following_df_from_files(pro_save_dir_path_3)

In [None]:
all_followings = anti_hubs_followings_df + pro_hubs_followings_df
all_followings = [acc for acc in all_followings if acc.shape[0] < 5000]
all_followings_df = pd.concat(all_followings, ignore_index=True)

In [None]:
all_followings_df

In [None]:
# drop duplicates
all_followings_df = all_followings_df.drop_duplicates(subset = ["Screen_Name"])
# filter only for hubs; .copy() so the column assignments below work on an
# independent frame instead of a slice of all_followings_df
only_hubs_df = all_followings_df[(all_followings_df['Follower_Count'] > 100000) & (all_followings_df['Follower_Count'] < 10000000) & (all_followings_df['StatusesCount'] > 2000)].copy()
# clean description for language detection
only_hubs_df['Description'] = only_hubs_df['Description'].apply(clean_text)
# regex=True: the pattern is a character class, not a literal string
only_hubs_df['Description'] = only_hubs_df.Description.str.replace('[^a-zA-Z0-9]', ' ', regex=True)
# take only en accounts
only_hubs_df = only_hubs_df[only_hubs_df['Description'].apply(filter_en_accounts)]
# remove already downloaded accounts (the helper requires `all_acc`)
only_hubs_df = only_hubs_df[only_hubs_df['Screen_Name'].apply(filter_already_downloaded_accounts, all_acc=all_accounts)]

In [None]:
only_hubs_df

In [None]:
new_hubs_accounts = only_hubs_df['Screen_Name'].tolist()

In [None]:
anti_save_dir_path_4 = r'C:\Users\psrub\Documents\Python\Twitter\new_anti_hubs_auto_4'
pro_save_dir_path_4 = r'C:\Users\psrub\Documents\Python\Twitter\new_pro_hubs_auto_4'

In [None]:
# the API client is the first argument of get_tweets_for_all_accounts
get_tweets_for_all_accounts(api, anti_save_dir_path_4, pro_save_dir_path_4, neutral_save_dir_path, new_hubs_accounts)

In [None]:
new_anti_accounts = os.listdir(anti_save_dir_path_4)
# get_data takes the API client as its first argument
get_data(api, anti_save_dir_path_4, new_anti_accounts)

In [None]:
new_pro_accounts = os.listdir(pro_save_dir_path_4)
# get_data takes the API client as its first argument
get_data(api, pro_save_dir_path_4, new_pro_accounts)

In [None]:
## all downloaded accounts

# ANTI = BASE HUBS + HAND PICKED HUBS
anti_accounts_path = r'C:\Users\psrub\Documents\Python\Twitter\anti_scientific_data_2'
hand_picked_new_anti_hubs_path = r'C:\Users\psrub\Documents\Python\Twitter\hand_picked_new_anti_hubs'

# PRO = BASE HUBS + HAND PICKED HUBS
pro_accounts_path = r'C:\Users\psrub\Documents\Python\Twitter\pro_scientific_data_2'
hand_picked_new_pro_hubs_path = r'C:\Users\psrub\Documents\Python\Twitter\hand_picked_new_pro_hubs'

with open(r'C:\Users\psrub\Documents\Python\Twitter\files\neutral_accounts.txt', 'r') as f:
    base_neutral_accounts = f.read().split(',')
new_neutral_accounts_path = r'C:\Users\psrub\Documents\Python\Twitter\all_neutral_accounts'

anti_save_dir_path_1 = r'C:\Users\psrub\Documents\Python\Twitter\new_anti_hubs_auto_1'
pro_save_dir_path_1 = r'C:\Users\psrub\Documents\Python\Twitter\new_pro_hubs_auto_1'

anti_save_dir_path_2 = r'C:\Users\psrub\Documents\Python\Twitter\new_anti_hubs_auto_2'
pro_save_dir_path_2 = r'C:\Users\psrub\Documents\Python\Twitter\new_pro_hubs_auto_2'

anti_save_dir_path_3 = r'C:\Users\psrub\Documents\Python\Twitter\new_anti_hubs_auto_3'
pro_save_dir_path_3= r'C:\Users\psrub\Documents\Python\Twitter\new_pro_hubs_auto_3'

anti_save_dir_path_4 = r'C:\Users\psrub\Documents\Python\Twitter\new_anti_hubs_auto_4'
pro_save_dir_path_4= r'C:\Users\psrub\Documents\Python\Twitter\new_pro_hubs_auto_4'


# all base accounts which are already downloaded

all_accounts = os.listdir(anti_accounts_path) + os.listdir(hand_picked_new_anti_hubs_path) + os.listdir(pro_accounts_path) \
+ os.listdir(hand_picked_new_pro_hubs_path) + os.listdir(new_neutral_accounts_path) + base_neutral_accounts \
+ os.listdir(anti_save_dir_path_1) + os.listdir(pro_save_dir_path_1) + os.listdir(anti_save_dir_path_2) \
+ os.listdir(pro_save_dir_path_2) + os.listdir(anti_save_dir_path_3) + os.listdir(pro_save_dir_path_3) \
+ os.listdir(anti_save_dir_path_4) + os.listdir(pro_save_dir_path_4)

all_accounts = [re.sub(r'@','', acc) for acc in all_accounts]

print(len(all_accounts))

In [None]:
anti_hubs_followings_df = get_following_df_from_files(anti_save_dir_path_4)
pro_hubs_followings_df = get_following_df_from_files(pro_save_dir_path_4)

In [None]:
all_followings = anti_hubs_followings_df + pro_hubs_followings_df
all_followings = [acc for acc in all_followings if acc.shape[0] < 5000]
all_followings_df = pd.concat(all_followings, ignore_index=True)

In [None]:
all_followings_df

In [None]:
# drop duplicates
all_followings_df = all_followings_df.drop_duplicates(subset = ["Screen_Name"])
# filter only for hubs; .copy() so the column assignments below work on an
# independent frame instead of a slice of all_followings_df
only_hubs_df = all_followings_df[(all_followings_df['Follower_Count'] > 100000) & (all_followings_df['Follower_Count'] < 10000000) & (all_followings_df['StatusesCount'] > 2000)].copy()
# clean description for language detection
only_hubs_df['Description'] = only_hubs_df['Description'].apply(clean_text)
# regex=True: the pattern is a character class, not a literal string
only_hubs_df['Description'] = only_hubs_df.Description.str.replace('[^a-zA-Z0-9]', ' ', regex=True)
# take only en accounts
only_hubs_df = only_hubs_df[only_hubs_df['Description'].apply(filter_en_accounts)]
# remove already downloaded accounts
only_hubs_df = only_hubs_df[only_hubs_df['Screen_Name'].apply(filter_already_downloaded_accounts, all_acc=all_accounts)]

In [None]:
only_hubs_df

In [None]:
new_hubs_accounts = only_hubs_df['Screen_Name'].tolist()

In [None]:
anti_save_dir_path_5 = r'C:\Users\psrub\Documents\Python\Twitter\new_anti_hubs_auto_5'
pro_save_dir_path_5 = r'C:\Users\psrub\Documents\Python\Twitter\new_pro_hubs_auto_5'

In [None]:
# the API client is the first argument of get_tweets_for_all_accounts
get_tweets_for_all_accounts(api, anti_save_dir_path_5, pro_save_dir_path_5, neutral_save_dir_path, new_hubs_accounts)

In [None]:
new_anti_accounts = os.listdir(anti_save_dir_path_5)
# get_data takes the API client as its first argument
get_data(api, anti_save_dir_path_5, new_anti_accounts)

In [None]:
new_pro_accounts = os.listdir(pro_save_dir_path_5)
# get_data takes the API client as its first argument
get_data(api, pro_save_dir_path_5, new_pro_accounts)

In [None]:
## all downloaded accounts

# ANTI = BASE HUBS + HAND PICKED HUBS
anti_accounts_path = r'C:\Users\psrub\Documents\Python\Twitter\anti_scientific_data_2'
hand_picked_new_anti_hubs_path = r'C:\Users\psrub\Documents\Python\Twitter\hand_picked_new_anti_hubs'

# PRO = BASE HUBS + HAND PICKED HUBS
pro_accounts_path = r'C:\Users\psrub\Documents\Python\Twitter\pro_scientific_data_2'
hand_picked_new_pro_hubs_path = r'C:\Users\psrub\Documents\Python\Twitter\hand_picked_new_pro_hubs'

with open(r'C:\Users\psrub\Documents\Python\Twitter\files\neutral_accounts.txt', 'r') as f:
    base_neutral_accounts = f.read().split(',')
new_neutral_accounts_path = r'C:\Users\psrub\Documents\Python\Twitter\all_neutral_accounts'

anti_save_dir_path_1 = r'C:\Users\psrub\Documents\Python\Twitter\new_anti_hubs_auto_1'
pro_save_dir_path_1 = r'C:\Users\psrub\Documents\Python\Twitter\new_pro_hubs_auto_1'

anti_save_dir_path_2 = r'C:\Users\psrub\Documents\Python\Twitter\new_anti_hubs_auto_2'
pro_save_dir_path_2 = r'C:\Users\psrub\Documents\Python\Twitter\new_pro_hubs_auto_2'

anti_save_dir_path_3 = r'C:\Users\psrub\Documents\Python\Twitter\new_anti_hubs_auto_3'
pro_save_dir_path_3 = r'C:\Users\psrub\Documents\Python\Twitter\new_pro_hubs_auto_3'

anti_save_dir_path_4 = r'C:\Users\psrub\Documents\Python\Twitter\new_anti_hubs_auto_4'
pro_save_dir_path_4 = r'C:\Users\psrub\Documents\Python\Twitter\new_pro_hubs_auto_4'

anti_save_dir_path_5 = r'C:\Users\psrub\Documents\Python\Twitter\new_anti_hubs_auto_5'
pro_save_dir_path_5 = r'C:\Users\psrub\Documents\Python\Twitter\new_pro_hubs_auto_5'


# all base accounts which are already downloaded

all_accounts = os.listdir(anti_accounts_path) + os.listdir(hand_picked_new_anti_hubs_path) + os.listdir(pro_accounts_path) \
+ os.listdir(hand_picked_new_pro_hubs_path) + os.listdir(new_neutral_accounts_path) + base_neutral_accounts \
+ os.listdir(anti_save_dir_path_1) + os.listdir(pro_save_dir_path_1) + os.listdir(anti_save_dir_path_2) \
+ os.listdir(pro_save_dir_path_2) + os.listdir(anti_save_dir_path_3) + os.listdir(pro_save_dir_path_3) \
+ os.listdir(anti_save_dir_path_4) + os.listdir(pro_save_dir_path_4) \
+ os.listdir(anti_save_dir_path_5) + os.listdir(pro_save_dir_path_5)

all_accounts = [re.sub(r'@','', acc) for acc in all_accounts]

print(len(all_accounts))

In [None]:
anti_hubs_followings_df = get_following_df_from_files(anti_save_dir_path_5)
pro_hubs_followings_df = get_following_df_from_files(pro_save_dir_path_5)

In [None]:
all_followings = anti_hubs_followings_df + pro_hubs_followings_df
all_followings = [acc for acc in all_followings if acc.shape[0] < 5000]
all_followings_df = pd.concat(all_followings, ignore_index=True)

In [None]:
all_followings_df

In [None]:
# drop duplicates
all_followings_df = all_followings_df.drop_duplicates(subset = ["Screen_Name"])
# filter only for hubs; .copy() so the column assignments below work on an
# independent frame instead of a slice of all_followings_df
only_hubs_df = all_followings_df[(all_followings_df['Follower_Count'] > 100000) & (all_followings_df['Follower_Count'] < 10000000) & (all_followings_df['StatusesCount'] > 2000)].copy()
# clean description for language detection
only_hubs_df['Description'] = only_hubs_df['Description'].apply(clean_text)
# regex=True: the pattern is a character class, not a literal string
only_hubs_df['Description'] = only_hubs_df.Description.str.replace('[^a-zA-Z0-9]', ' ', regex=True)
# take only en accounts
only_hubs_df = only_hubs_df[only_hubs_df['Description'].apply(filter_en_accounts)]
# remove already downloaded accounts
only_hubs_df = only_hubs_df[only_hubs_df['Screen_Name'].apply(filter_already_downloaded_accounts, all_acc=all_accounts)]

In [None]:
only_hubs_df

In [None]:
new_hubs_accounts = only_hubs_df['Screen_Name'].tolist()

In [None]:
anti_save_dir_path_6 = r'C:\Users\psrub\Documents\Python\Twitter\new_anti_hubs_auto_6'
pro_save_dir_path_6 = r'C:\Users\psrub\Documents\Python\Twitter\new_pro_hubs_auto_6'

In [None]:
# the API client is the first argument of get_tweets_for_all_accounts
get_tweets_for_all_accounts(api, anti_save_dir_path_6, pro_save_dir_path_6, neutral_save_dir_path, new_hubs_accounts)

In [None]:
new_anti_accounts = os.listdir(anti_save_dir_path_6)
# get_data takes the API client as its first argument
get_data(api, anti_save_dir_path_6, new_anti_accounts)

In [None]:
new_pro_accounts = os.listdir(pro_save_dir_path_6)
# get_data takes the API client as its first argument
get_data(api, pro_save_dir_path_6, new_pro_accounts)

In [None]:
## all downloaded accounts

# ANTI = BASE HUBS + HAND PICKED HUBS
anti_accounts_path = r'C:\Users\psrub\Documents\Python\Twitter\anti_scientific_data_2'
hand_picked_new_anti_hubs_path = r'C:\Users\psrub\Documents\Python\Twitter\hand_picked_new_anti_hubs'

# PRO = BASE HUBS + HAND PICKED HUBS
pro_accounts_path = r'C:\Users\psrub\Documents\Python\Twitter\pro_scientific_data_2'
hand_picked_new_pro_hubs_path = r'C:\Users\psrub\Documents\Python\Twitter\hand_picked_new_pro_hubs'

with open(r'C:\Users\psrub\Documents\Python\Twitter\files\neutral_accounts.txt', 'r') as f:
    base_neutral_accounts = f.read().split(',')
new_neutral_accounts_path = r'C:\Users\psrub\Documents\Python\Twitter\all_neutral_accounts'

anti_save_dir_path_1 = r'C:\Users\psrub\Documents\Python\Twitter\new_anti_hubs_auto_1'
pro_save_dir_path_1 = r'C:\Users\psrub\Documents\Python\Twitter\new_pro_hubs_auto_1'

anti_save_dir_path_2 = r'C:\Users\psrub\Documents\Python\Twitter\new_anti_hubs_auto_2'
pro_save_dir_path_2 = r'C:\Users\psrub\Documents\Python\Twitter\new_pro_hubs_auto_2'

anti_save_dir_path_3 = r'C:\Users\psrub\Documents\Python\Twitter\new_anti_hubs_auto_3'
pro_save_dir_path_3 = r'C:\Users\psrub\Documents\Python\Twitter\new_pro_hubs_auto_3'

anti_save_dir_path_4 = r'C:\Users\psrub\Documents\Python\Twitter\new_anti_hubs_auto_4'
pro_save_dir_path_4 = r'C:\Users\psrub\Documents\Python\Twitter\new_pro_hubs_auto_4'

anti_save_dir_path_5 = r'C:\Users\psrub\Documents\Python\Twitter\new_anti_hubs_auto_5'
pro_save_dir_path_5 = r'C:\Users\psrub\Documents\Python\Twitter\new_pro_hubs_auto_5'

anti_save_dir_path_6 = r'C:\Users\psrub\Documents\Python\Twitter\new_anti_hubs_auto_6'
pro_save_dir_path_6 = r'C:\Users\psrub\Documents\Python\Twitter\new_pro_hubs_auto_6'


# all base accounts which are already downloaded

all_accounts = os.listdir(anti_accounts_path) + os.listdir(hand_picked_new_anti_hubs_path) + os.listdir(pro_accounts_path) \
+ os.listdir(hand_picked_new_pro_hubs_path) + os.listdir(new_neutral_accounts_path) + base_neutral_accounts \
+ os.listdir(anti_save_dir_path_1) + os.listdir(pro_save_dir_path_1) + os.listdir(anti_save_dir_path_2) \
+ os.listdir(pro_save_dir_path_2) + os.listdir(anti_save_dir_path_3) + os.listdir(pro_save_dir_path_3) \
+ os.listdir(anti_save_dir_path_4) + os.listdir(pro_save_dir_path_4) \
+ os.listdir(anti_save_dir_path_5) + os.listdir(pro_save_dir_path_5) \
+ os.listdir(anti_save_dir_path_6) + os.listdir(pro_save_dir_path_6)

all_accounts = [re.sub(r'@','', acc) for acc in all_accounts]

print(len(all_accounts))

In [None]:
anti_hubs_followings_df = get_following_df_from_files(anti_save_dir_path_6)
pro_hubs_followings_df = get_following_df_from_files(pro_save_dir_path_6)

In [None]:
all_followings = anti_hubs_followings_df + pro_hubs_followings_df
all_followings = [acc for acc in all_followings if acc.shape[0] < 5000]
all_followings_df = pd.concat(all_followings, ignore_index=True)

In [None]:
all_followings_df

In [None]:
# drop duplicates
all_followings_df = all_followings_df.drop_duplicates(subset = ["Screen_Name"])
# filter only for hubs; .copy() so the column assignments below work on an
# independent frame instead of a slice of all_followings_df
only_hubs_df = all_followings_df[(all_followings_df['Follower_Count'] > 100000) & (all_followings_df['Follower_Count'] < 10000000) & (all_followings_df['StatusesCount'] > 2000)].copy()
# clean description for language detection
only_hubs_df['Description'] = only_hubs_df['Description'].apply(clean_text)
# regex=True: the pattern is a character class, not a literal string
only_hubs_df['Description'] = only_hubs_df.Description.str.replace('[^a-zA-Z0-9]', ' ', regex=True)
# take only en accounts
only_hubs_df = only_hubs_df[only_hubs_df['Description'].apply(filter_en_accounts)]
# remove already downloaded accounts
only_hubs_df = only_hubs_df[only_hubs_df['Screen_Name'].apply(filter_already_downloaded_accounts, all_acc=all_accounts)]

In [None]:
# Show the filtered hub candidates as this cell's output.
only_hubs_df

In [None]:
# Screen names of the hub candidates that survived all filters, as a plain list.
new_hubs_accounts = only_hubs_df['Screen_Name'].tolist()

In [None]:
# Target folders for iteration 7 (one for the anti side, one for the pro side).
anti_save_dir_path_7 = r'C:\Users\psrub\Documents\Python\Twitter\new_anti_hubs_auto_7'
pro_save_dir_path_7 = r'C:\Users\psrub\Documents\Python\Twitter\new_pro_hubs_auto_7'

In [None]:
# Process the new hub candidates, using the iteration-7 folders and the neutral folder as targets.
# NOTE(review): the definition at the top of this notebook takes `api` as its first
# parameter, while this call passes only four arguments — confirm which definition is
# live when this cell runs, and where `neutral_save_dir_path` is defined.
get_tweets_for_all_accounts(anti_save_dir_path_7, pro_save_dir_path_7, neutral_save_dir_path, new_hubs_accounts)

In [None]:
# List the entries now present in the iteration-7 anti folder and hand them to get_data.
# NOTE(review): get_data is defined outside this section; later in the notebook it is
# called with an `api` object as first argument — confirm the expected signature.
new_anti_accounts = os.listdir(anti_save_dir_path_7)
get_data(anti_save_dir_path_7, new_anti_accounts)

In [None]:
# List the entries now present in the iteration-7 pro folder and hand them to get_data.
# NOTE(review): get_data is defined outside this section; later in the notebook it is
# called with an `api` object as first argument — confirm the expected signature.
new_pro_accounts = os.listdir(pro_save_dir_path_7)
get_data(pro_save_dir_path_7, new_pro_accounts)

In [None]:
## all downloaded accounts

# ANTI = BASE HUBS + HAND PICKED HUBS
anti_accounts_path = r'C:\Users\psrub\Documents\Python\Twitter\anti_scientific_data_2'
hand_picked_new_anti_hubs_path = r'C:\Users\psrub\Documents\Python\Twitter\hand_picked_new_anti_hubs'

# PRO = BASE HUBS + HAND PICKED HUBS
pro_accounts_path = r'C:\Users\psrub\Documents\Python\Twitter\pro_scientific_data_2'
hand_picked_new_pro_hubs_path = r'C:\Users\psrub\Documents\Python\Twitter\hand_picked_new_pro_hubs'

# NEUTRAL = comma-separated names from a text file + one folder of accounts
with open(r'C:\Users\psrub\Documents\Python\Twitter\files\neutral_accounts.txt', 'r') as neutral_file:
    base_neutral_accounts = neutral_file.read().split(',')
new_neutral_accounts_path = r'C:\Users\psrub\Documents\Python\Twitter\all_neutral_accounts'

# Output folders of the automated iterations 1-7.
anti_save_dir_path_1 = r'C:\Users\psrub\Documents\Python\Twitter\new_anti_hubs_auto_1'
pro_save_dir_path_1 = r'C:\Users\psrub\Documents\Python\Twitter\new_pro_hubs_auto_1'

anti_save_dir_path_2 = r'C:\Users\psrub\Documents\Python\Twitter\new_anti_hubs_auto_2'
pro_save_dir_path_2 = r'C:\Users\psrub\Documents\Python\Twitter\new_pro_hubs_auto_2'

anti_save_dir_path_3 = r'C:\Users\psrub\Documents\Python\Twitter\new_anti_hubs_auto_3'
pro_save_dir_path_3 = r'C:\Users\psrub\Documents\Python\Twitter\new_pro_hubs_auto_3'

anti_save_dir_path_4 = r'C:\Users\psrub\Documents\Python\Twitter\new_anti_hubs_auto_4'
pro_save_dir_path_4 = r'C:\Users\psrub\Documents\Python\Twitter\new_pro_hubs_auto_4'

anti_save_dir_path_5 = r'C:\Users\psrub\Documents\Python\Twitter\new_anti_hubs_auto_5'
pro_save_dir_path_5 = r'C:\Users\psrub\Documents\Python\Twitter\new_pro_hubs_auto_5'

anti_save_dir_path_6 = r'C:\Users\psrub\Documents\Python\Twitter\new_anti_hubs_auto_6'
pro_save_dir_path_6 = r'C:\Users\psrub\Documents\Python\Twitter\new_pro_hubs_auto_6'

anti_save_dir_path_7 = r'C:\Users\psrub\Documents\Python\Twitter\new_anti_hubs_auto_7'
pro_save_dir_path_7 = r'C:\Users\psrub\Documents\Python\Twitter\new_pro_hubs_auto_7'


# Names of every account that is already on disk. Each directory below is
# listed with os.listdir; base_neutral_accounts is a plain list of names and
# is spliced in between the two directory groups so the order of the result
# is the same as listing everything left to right.
_dirs_before_neutral_list = (
    anti_accounts_path,
    hand_picked_new_anti_hubs_path,
    pro_accounts_path,
    hand_picked_new_pro_hubs_path,
    new_neutral_accounts_path,
)
_dirs_after_neutral_list = (
    anti_save_dir_path_1, pro_save_dir_path_1,
    anti_save_dir_path_2, pro_save_dir_path_2,
    anti_save_dir_path_3, pro_save_dir_path_3,
    anti_save_dir_path_4, pro_save_dir_path_4,
    anti_save_dir_path_5, pro_save_dir_path_5,
    anti_save_dir_path_6, pro_save_dir_path_6,
    anti_save_dir_path_7, pro_save_dir_path_7,
)

all_accounts = (
    [entry for directory in _dirs_before_neutral_list for entry in os.listdir(directory)]
    + base_neutral_accounts
    + [entry for directory in _dirs_after_neutral_list for entry in os.listdir(directory)]
)

# Strip every '@' so the names compare equal to plain screen names.
all_accounts = [re.sub(r'@', '', entry) for entry in all_accounts]

print(len(all_accounts))

In [None]:
# Load the per-account following tables stored under the iteration-7 hub folders
# (anti first, then pro). Each result is a list of DataFrames, one per account.
anti_hubs_followings_df, pro_hubs_followings_df = [
    get_following_df_from_files(hubs_dir)
    for hubs_dir in (anti_save_dir_path_7, pro_save_dir_path_7)
]

In [None]:
# Combine both stances, keep only the per-account tables with fewer than
# 5000 rows, and stack what is left into one frame.
all_followings = [
    followings
    for followings in anti_hubs_followings_df + pro_hubs_followings_df
    if followings.shape[0] < 5000
]
all_followings_df = pd.concat(all_followings, ignore_index=True)

In [None]:
# Show the stacked followings table as this cell's output.
all_followings_df

In [None]:
# Keep one row per followed account.
all_followings_df = all_followings_df.drop_duplicates(subset=["Screen_Name"])

# "Hub" = between 100k and 10M followers (exclusive) and more than 2000 statuses.
is_hub = (
    (all_followings_df['Follower_Count'] > 100000)
    & (all_followings_df['Follower_Count'] < 10000000)
    & (all_followings_df['StatusesCount'] > 2000)
)
# .copy() makes the selection an independent frame, so the column assignment
# below does not write into a slice of all_followings_df (SettingWithCopyWarning).
only_hubs_df = all_followings_df[is_hub].copy()

# Clean the description for language detection: clean_text removes mentions,
# '#', 'RT ' and links; afterwards every non-alphanumeric character becomes a space.
# regex=True is explicit because pandas >= 2.0 treats the pattern as a literal
# string by default, which would silently skip this replacement.
only_hubs_df['Description'] = (
    only_hubs_df['Description']
    .apply(clean_text)
    .str.replace('[^a-zA-Z0-9]', ' ', regex=True)
)

# Keep only accounts whose description is detected as English
# (filter_en_accounts also keeps empty or undetectable descriptions).
only_hubs_df = only_hubs_df[only_hubs_df['Description'].apply(filter_en_accounts)]
# Remove accounts that are already downloaded.
only_hubs_df = only_hubs_df[only_hubs_df['Screen_Name'].apply(filter_already_downloaded_accounts, all_acc=all_accounts)]

In [None]:
# Show the filtered hub candidates as this cell's output.
only_hubs_df

In [None]:
# Screen names of the hub candidates that survived all filters, as a plain list.
new_hubs_accounts = only_hubs_df['Screen_Name'].tolist()

In [None]:
# Target folders for iteration 8 (one for the anti side, one for the pro side).
anti_save_dir_path_8 = r'C:\Users\psrub\Documents\Python\Twitter\new_anti_hubs_auto_8'
pro_save_dir_path_8 = r'C:\Users\psrub\Documents\Python\Twitter\new_pro_hubs_auto_8'

In [None]:
# Process the new hub candidates, using the iteration-8 folders and the neutral folder as targets.
# NOTE(review): the definition at the top of this notebook takes `api` as its first
# parameter, while this call passes only four arguments — confirm which definition is
# live when this cell runs, and where `neutral_save_dir_path` is defined.
get_tweets_for_all_accounts(anti_save_dir_path_8, pro_save_dir_path_8, neutral_save_dir_path, new_hubs_accounts)

In [None]:
# List the entries now present in the iteration-8 anti folder and hand them to get_data.
# NOTE(review): get_data is defined outside this section; later in the notebook it is
# called with an `api` object as first argument — confirm the expected signature.
new_anti_accounts = os.listdir(anti_save_dir_path_8)
get_data(anti_save_dir_path_8, new_anti_accounts)

In [None]:
# List the entries now present in the iteration-8 pro folder and hand them to get_data.
# NOTE(review): get_data is defined outside this section; later in the notebook it is
# called with an `api` object as first argument — confirm the expected signature.
new_pro_accounts = os.listdir(pro_save_dir_path_8)
get_data(pro_save_dir_path_8, new_pro_accounts)

In [None]:
# Load the per-account following tables stored under the iteration-7 hub folders
# (anti first, then pro). Each result is a list of DataFrames, one per account.
# NOTE(review): the preceding cells work on the iteration-8 folders, yet this
# cell reads iteration 7 again — confirm whether the *_8 paths were intended.
anti_hubs_followings_df, pro_hubs_followings_df = [
    get_following_df_from_files(hubs_dir)
    for hubs_dir in (anti_save_dir_path_7, pro_save_dir_path_7)
]

In [None]:
## all downloaded accounts

# ANTI = BASE HUBS + HAND PICKED HUBS
base_anti_accounts_path = r'E:\Twitter_data\AUTOMATED_ITERATION_3\base_anti_hubs'
hand_picked_new_anti_hubs_path = r'E:\Twitter_data\AUTOMATED_ITERATION_3\hand_picked_base_anti_hubs'

# PRO = BASE HUBS + HAND PICKED HUBS
base_pro_accounts_path = r'E:\Twitter_data\AUTOMATED_ITERATION_3\base_pro_hubs'
hand_picked_new_pro_hubs_path = r'E:\Twitter_data\AUTOMATED_ITERATION_3\hand_picked_base_pro_hubs'


# Output folders of the automated iterations 1-5.
anti_save_dir_path_1 = r'E:\Twitter_data\AUTOMATED_ITERATION_2\new_anti_hubs_1'
pro_save_dir_path_1 = r'E:\Twitter_data\AUTOMATED_ITERATION_2\new_pro_hubs_1'

anti_save_dir_path_2 = r'E:\Twitter_data\AUTOMATED_ITERATION_2\new_anti_hubs_2'
pro_save_dir_path_2 = r'E:\Twitter_data\AUTOMATED_ITERATION_2\new_pro_hubs_2'

anti_save_dir_path_3 = r'E:\Twitter_data\AUTOMATED_ITERATION_2\new_anti_hubs_3'
pro_save_dir_path_3 = r'E:\Twitter_data\AUTOMATED_ITERATION_2\new_pro_hubs_3'

anti_save_dir_path_4 = r'E:\Twitter_data\AUTOMATED_ITERATION_2\new_anti_hubs_4'
pro_save_dir_path_4 = r'E:\Twitter_data\AUTOMATED_ITERATION_2\new_pro_hubs_4'

anti_save_dir_path_5 = r'E:\Twitter_data\AUTOMATED_ITERATION_2\new_anti_hubs_5'
pro_save_dir_path_5 = r'E:\Twitter_data\AUTOMATED_ITERATION_2\new_pro_hubs_5'


# all base accounts which are already downloaded
#
# The base folders are referenced through the names defined in THIS cell
# (base_anti_accounts_path / base_pro_accounts_path). The previous version used
# anti_accounts_path / pro_accounts_path, which are only defined by an earlier
# cell and point at a different location, so the result depended on kernel state.
all_accounts = os.listdir(base_anti_accounts_path) + os.listdir(hand_picked_new_anti_hubs_path) + os.listdir(base_pro_accounts_path) \
+ os.listdir(hand_picked_new_pro_hubs_path)  + os.listdir(anti_save_dir_path_1) + os.listdir(pro_save_dir_path_1) \
+ os.listdir(anti_save_dir_path_2) + os.listdir(pro_save_dir_path_2) + os.listdir(anti_save_dir_path_3) \
+ os.listdir(pro_save_dir_path_3) + os.listdir(anti_save_dir_path_4) + os.listdir(pro_save_dir_path_4) \
+ os.listdir(anti_save_dir_path_5) + os.listdir(pro_save_dir_path_5)

# Strip every '@' so the names compare equal to plain screen names.
all_accounts = [re.sub(r'@','', acc) for acc in all_accounts]

print(len(all_accounts))

In [None]:
# Following tables of the seed hubs: base + hand-picked, anti side first.
base_anti_hubs_followings_df, hand_picked_anti_hubs_followings_df = [
    get_following_df_from_files(hubs_dir)
    for hubs_dir in (base_anti_accounts_path, hand_picked_new_anti_hubs_path)
]

base_pro_hubs_followings_df, hand_picked_pro_hubs_followings_df = [
    get_following_df_from_files(hubs_dir)
    for hubs_dir in (base_pro_accounts_path, hand_picked_new_pro_hubs_path)
]

# Following tables of the hubs found in automated iteration 2.
anti_hubs_followings_df, pro_hubs_followings_df = [
    get_following_df_from_files(hubs_dir)
    for hubs_dir in (anti_save_dir_path_2, pro_save_dir_path_2)
]


# Only the iteration-2 tables are merged here; per-account tables with
# 5000 rows or more are left out before stacking.
all_followings = [
    followings
    for followings in anti_hubs_followings_df + pro_hubs_followings_df
    if followings.shape[0] < 5000
]
all_followings_df = (
    pd.concat(all_followings, ignore_index=True)
    .drop_duplicates(subset=["Screen_Name"])
)
print(all_followings_df.shape)

In [None]:
# BASE ACCOUNTS
# ANTI = BASE HUBS + HAND PICKED HUBS
base_anti_accounts_path = r'E:\Twitter_data\BASE_HUBS\anti_scientific_data_2'
hand_picked_new_anti_hubs_path = r'E:\Twitter_data\BASE_HUBS\hand_picked_new_anti_hubs'

# PRO = BASE HUBS + HAND PICKED HUBS
base_pro_accounts_path = r'E:\Twitter_data\BASE_HUBS\pro_scientific_data_2'
hand_picked_new_pro_hubs_path = r'E:\Twitter_data\BASE_HUBS\hand_picked_new_pro_hubs'

# NEUTRAL = comma-separated names from a text file + one folder of accounts
with open(r'C:\Users\psrub\Documents\Python\Twitter\files\neutral_accounts.txt', 'r') as neutral_file:
    base_neutral_accounts = neutral_file.read().split(',')
new_neutral_accounts_path = r'E:\Twitter_data\BASE_HUBS\neutral_data_2'

# Folder entries first (in the order below), then the names from the text file;
# every '@' is removed so the names compare equal to plain screen names.
_base_account_dirs = (
    base_anti_accounts_path,
    hand_picked_new_anti_hubs_path,
    base_pro_accounts_path,
    hand_picked_new_pro_hubs_path,
    new_neutral_accounts_path,
)
all_base_accounts = [
    entry for directory in _base_account_dirs for entry in os.listdir(directory)
] + base_neutral_accounts
all_base_accounts = [re.sub(r'@', '', entry) for entry in all_base_accounts]

In [None]:
def _twitter_credential(env_name):
    """Return the Twitter API credential stored in environment variable ``env_name``.

    Raises RuntimeError when the variable is missing or empty, so a
    misconfigured run fails before any request is made.
    """
    value = os.environ.get(env_name)
    if not value:
        raise RuntimeError(
            f'Environment variable {env_name} must be set to a Twitter API credential'
        )
    return value


def _select_new_hub_accounts(followings, known_accounts):
    """Return screen names of hub accounts that have not been downloaded yet.

    Parameters
    ----------
    followings : list of pandas.DataFrame
        Per-account following tables as returned by get_following_df_from_files.
        The columns Screen_Name, Follower_Count, StatusesCount and Description
        are read.
    known_accounts : collection of str
        Names of accounts that are already on disk; they are excluded.

    Returns
    -------
    list of str
        Screen names that pass all filters; empty when nothing qualifies.
    """
    # drop accounts which have more than 5000 followings
    followings = [acc for acc in followings if acc.shape[0] < 5000]
    if not followings:
        # pd.concat raises ValueError on an empty list.
        return []

    all_followings_df = pd.concat(followings, ignore_index=True)
    all_followings_df = all_followings_df.drop_duplicates(subset=["Screen_Name"])

    # "Hub" = between 100k and 10M followers (exclusive) and more than 2000 statuses.
    is_hub = (
        (all_followings_df['Follower_Count'] > 100000)
        & (all_followings_df['Follower_Count'] < 10000000)
        & (all_followings_df['StatusesCount'] > 2000)
    )
    # .copy() so the column assignment below does not write into a slice.
    only_hubs_df = all_followings_df[is_hub].copy()
    if only_hubs_df.empty:
        return []

    # Clean the description for language detection. Missing descriptions become
    # '' (clean_text only accepts strings). regex=True is explicit because
    # pandas >= 2.0 treats the pattern as a literal string by default.
    only_hubs_df['Description'] = (
        only_hubs_df['Description']
        .fillna('')
        .apply(clean_text)
        .str.replace('[^a-zA-Z0-9]', ' ', regex=True)
    )

    # take only en accounts; astype(bool) keeps the mask boolean even when empty
    only_hubs_df = only_hubs_df[
        only_hubs_df['Description'].apply(filter_en_accounts).astype(bool)
    ]
    # remove already downloaded accounts
    only_hubs_df = only_hubs_df[
        only_hubs_df['Screen_Name']
        .apply(filter_already_downloaded_accounts, all_acc=known_accounts)
        .astype(bool)
    ]

    return only_hubs_df['Screen_Name'].tolist()


def get_full_data_iteration():
    """Run the automated hub-discovery loop for at most 19 rounds.

    Round 1 takes the following tables of the base and hand-picked hubs;
    every later round takes the tables of the hubs stored by the previous
    round. Each round selects new hub candidates, passes them to
    get_tweets_for_all_accounts with the round's anti/pro folders and the
    neutral folder, and then calls get_data on whatever ended up in the
    anti/pro folders. The loop stops early once a round leaves both folders
    empty.

    Twitter credentials are read from the environment variables
    TWITTER_CONSUMER_KEY, TWITTER_CONSUMER_SECRET, TWITTER_ACCESS_TOKEN and
    TWITTER_ACCESS_TOKEN_SECRET instead of being stored in the notebook.

    Raises
    ------
    RuntimeError
        If one of the credential environment variables is not set.
    """
    auth = tweepy.OAuthHandler(
        _twitter_credential('TWITTER_CONSUMER_KEY'),
        _twitter_credential('TWITTER_CONSUMER_SECRET'),
    )
    auth.set_access_token(
        _twitter_credential('TWITTER_ACCESS_TOKEN'),
        _twitter_credential('TWITTER_ACCESS_TOKEN_SECRET'),
    )
    # NOTE(review): wait_on_rate_limit_notify appears to have been removed in
    # tweepy 4.x — confirm against the installed version.
    api = tweepy.API(auth, wait_on_rate_limit=False, wait_on_rate_limit_notify=True)

    # BASE ACCOUNTS
    # ANTI = BASE HUBS + HAND PICKED HUBS
    base_anti_accounts_path = r'E:\Twitter_data\AUTOMATED_ITERATION_3\base_anti_hubs'
    hand_picked_new_anti_hubs_path = r'E:\Twitter_data\AUTOMATED_ITERATION_3\hand_picked_base_anti_hubs'

    # PRO = BASE HUBS + HAND PICKED HUBS
    base_pro_accounts_path = r'E:\Twitter_data\AUTOMATED_ITERATION_3\base_pro_hubs'
    hand_picked_new_pro_hubs_path = r'E:\Twitter_data\AUTOMATED_ITERATION_3\hand_picked_base_pro_hubs'

    with open(r'C:\Users\psrub\Documents\Python\Twitter\files\neutral_accounts.txt', 'r') as f:
        base_neutral_accounts = f.read().split(',')
    new_neutral_accounts_path = r'E:\Twitter_data\AUTOMATED_ITERATION_3\all_neutral_accounts'

    base_hub_dirs = [
        base_anti_accounts_path,
        hand_picked_new_anti_hubs_path,
        base_pro_accounts_path,
        hand_picked_new_pro_hubs_path,
    ]

    all_base_accounts = base_neutral_accounts[:]
    for directory in base_hub_dirs + [new_neutral_accounts_path]:
        all_base_accounts.extend(os.listdir(directory))
    # A set makes the per-account "already downloaded" lookup constant time.
    all_base_accounts = {re.sub(r'@', '', acc) for acc in all_base_accounts}

    # PATHS
    all_accounts_path = r'E:\Twitter_data\AUTOMATED_ITERATION_3'

    neutral_accounts_path = r'E:\Twitter_data\AUTOMATED_ITERATION_3\all_neutral_accounts'
    anti_accounts_path_base = r'E:\Twitter_data\AUTOMATED_ITERATION_3\new_anti_hubs_'
    pro_accounts_path_base = r'E:\Twitter_data\AUTOMATED_ITERATION_3\new_pro_hubs_'

    for i in range(1, 20):

        # Re-scan the run directory every round so accounts stored by earlier
        # rounds are excluded. Plain files are skipped: os.listdir would raise
        # on them.
        all_new_accounts = set()
        for acc_dir in os.listdir(all_accounts_path):
            acc_dir_path = os.path.join(all_accounts_path, acc_dir)
            if os.path.isdir(acc_dir_path):
                all_new_accounts.update(
                    re.sub(r'@', '', acc) for acc in os.listdir(acc_dir_path)
                )

        all_accounts = all_base_accounts | all_new_accounts

        # Round 1 expands from the seed hubs, later rounds from the previous
        # round's output. The order is kept because drop_duplicates keeps the
        # first occurrence of each screen name.
        if i == 1:
            source_dirs = base_hub_dirs
        else:
            source_dirs = [
                f'{anti_accounts_path_base}{i - 1}',
                f'{pro_accounts_path_base}{i - 1}',
            ]

        all_followings = []
        for source_dir in source_dirs:
            all_followings.extend(get_following_df_from_files(source_dir))

        new_hubs_accounts = _select_new_hub_accounts(all_followings, all_accounts)

        anti_save_dir = f'{anti_accounts_path_base}{i}'
        pro_save_dir = f'{pro_accounts_path_base}{i}'
        os.makedirs(anti_save_dir, exist_ok=True)
        os.makedirs(pro_save_dir, exist_ok=True)

        get_tweets_for_all_accounts(api, anti_save_dir, pro_save_dir, neutral_accounts_path, new_hubs_accounts)

        new_anti_accounts = os.listdir(anti_save_dir)
        new_pro_accounts = os.listdir(pro_save_dir)

        # end of iterations: nothing new was stored, so the next round would
        # have no following tables to expand from
        if len(new_anti_accounts) == 0 and len(new_pro_accounts) == 0:
            break

        get_data(api, anti_save_dir, new_anti_accounts)
        get_data(api, pro_save_dir, new_pro_accounts)

In [None]:
# Run the full iterative hub-discovery loop, then print a completion marker.
get_full_data_iteration()
print('end of processing')