# <center>Twitter dataset downloader</center>

## <center>IMPORTS</center>

In [1]:
import re
from typing import Dict, List

import bs4
from lxml import html
import numpy as np
import requests
import twint
import pandas as pd
from collections import Counter
from itertools import chain
from IPython.core.display import HTML, display
from tqdm.autonotebook import tqdm, trange



In [2]:
display(HTML("<style>.container { width:100% !important; }</style>"))

## Downloading usernames
> usernames are downloaded from https://fanpagelist.com/. Users are nicely categorized into interesting for us groups.
> Defined categories can be found on the page of course.
> There is only one problem. Counts of users for main categories:

<table><tr><th>Category</th><th>Count</th></tr><tr><td>news/politics</td><td>2</td></tr><tr><td>sports-teams</td><td>171</td></tr><tr><td>tv-shows/cw</td><td>4</td></tr><tr><td>news</td><td>79</td></tr><tr><td>athletes/tennis</td><td>94</td></tr><tr><td>tv-shows/fox</td><td>30</td></tr><tr><td>tv-shows/amc</td><td>5</td></tr><tr><td>tv-shows/travel-channel</td><td>2</td></tr><tr><td>tv-shows/bravo</td><td>2</td></tr><tr><td>organizations/nonprofits</td><td>10</td></tr><tr><td>tv-shows/discovery</td><td>3</td></tr><tr><td>athletes</td><td>181</td></tr><tr><td>brands/dining</td><td>31</td></tr><tr><td>athletes/pro-wrestling</td><td>153</td></tr><tr><td>brands/media</td><td>81</td></tr><tr><td>games</td><td>27</td></tr><tr><td>tv-shows/nickelodeon</td><td>3</td></tr><tr><td>facebook</td><td>1</td></tr><tr><td>musicians</td><td>200</td></tr><tr><td>tv-hosts</td><td>47</td></tr><tr><td>athletes/nba</td><td>20</td></tr><tr><td>tv-shows/mtv</td><td>2</td></tr><tr><td>organizations</td><td>191</td></tr><tr><td>products</td><td>152</td></tr><tr><td>brands/retail</td><td>186</td></tr><tr><td>organizations/colleges</td><td>45</td></tr><tr><td>tv-shows/tlc</td><td>1</td></tr><tr><td>athletes/wwe-divas</td><td>8</td></tr><tr><td>brands/food</td><td>12</td></tr><tr><td>athletes/soccer</td><td>82</td></tr><tr><td>movies</td><td>84</td></tr><tr><td>corporate_brands</td><td>53</td></tr><tr><td>organizations/tourism</td><td>6</td></tr><tr><td>politicians</td><td>41</td></tr><tr><td>brands/technology</td><td>64</td></tr><tr><td>actors</td><td>141</td></tr><tr><td>celebrities</td><td>27</td></tr><tr><td>tv-shows</td><td>129</td></tr><tr><td>athletes/wwe-nxt</td><td>10</td></tr><tr><td>tv-shows/comedy-central</td><td>6</td></tr><tr><td>athletes/olympics</td><td>970</td></tr><tr><td>tv-shows/cbs</td><td>9</td></tr><tr><td>foursquare_brands</td><td>30</td></tr><tr><td>athletes/other-sports</td><td>1</td></tr><tr><td>tv-shows/abc</td><td>10</td></tr></table>

> So as you can see, if we want to use these username dataset, we have to use only general groups to prevent bullshit results.
> Of course we can find somehow twitter nicks of people listed on Wikipedia, but I think it would be tedious.

In [3]:
def get_twitter_urls(category: str, min_no_of_users: int) -> List[str]:
    urls, page_no = list(), 0
    users_per_page = 20 # I think it's the count of users per page, but not sure. Whatever, it's just for progress bar xD
    
    with tqdm(desc=f'Downloding page of {category}', total=int(np.ceil(min_no_of_users / users_per_page)), leave=False) as ubar:
        while len(urls) < min_no_of_users:
            page_no += 1
            page = requests.get(f'https://fanpagelist.com/category/{category}/view/list/sort/followers/page{page_no}')
            ubar.update(1)
            if not page.ok:
                break
            
            twitter_url_xpath = '//li[@class="ranking_results"]//a[@class="clicky_ignore"]/@href'
            tweeter_urls = html.fromstring(page.content).xpath(twitter_url_xpath)
            if not tweeter_urls:
                break
            urls.extend(tweeter_urls)
            ubar.set_postfix(len=len(urls))
    return urls

In [4]:
def add_nicknames_from_urls(urls: List[str], usernames: Dict[str, str], category: str):
    username_re = re.compile(r'.*screen_name=([\w\d-]+)')
    with tqdm(urls, desc='Extracting usernames', leave=False) as ubar:
        for url in ubar:
            matched = username_re.match(url)
            if not matched:
                continue
            matched = matched.group(1)
#             if all(matched not in group for group in usernames.values()):
            usernames[category].append(matched)
            ubar.set_postfix(username=matched)

In [11]:
def summary_html(summary: Dict[str, str], title: str) -> str:            
    display(title)
    html = f'<table><tr><th>Category</th><th>Count</th></tr>'
    for k, v in summary.items():
        if len(v):
            html += f'<tr><td>{k}</td><td>{len(v)}</td></tr>'
    html += '</table>'
    return html

In [10]:
def download_category(*categories: List[str], min_no_of_users: int = 100) -> List[str]:
    usernames = {category: list() for category in categories}
    
    with tqdm(categories, desc='Getting category users', leave=False) as cbar:
        for category in cbar:
            cbar.set_postfix(category=category)
            urls = get_twitter_urls(category, min_no_of_users)
            if not urls:
                raise Exception(f'Cannot download html for category {category}')
            add_nicknames_from_urls(urls, usernames, category)
        
    html = summary_html(usernames, 'Summary')
    
    return usernames, html

In [7]:
def find_categories_on_main_page(url: str) -> List[str]:
    tweeter_urls = html.fromstring(requests.get(url).content).xpath('//@href')
    cats = []
    for cat in tqdm(tweeter_urls, 'Extract categories', leave=False):
        matched = re.match(r'.*category/(.*)', cat)
        if not matched:
            continue
        matched = matched.groups()[0]
        cats.append(matched)
    return list(filter(lambda cat: not cat.startswith(r'top_users') and not re.findall('view', cat), cats))

categories = [find_categories_on_main_page(f'https://fanpagelist.com/category/{cat}') for cat in find_categories_on_main_page('https://fanpagelist.com/')]
subcats_tmp = []
for subcat in categories:
    subcats_tmp.extend([cat[:-1] for cat in subcat])

HBox(children=(IntProgress(value=0, description='Extract categories', max=262, style=ProgressStyle(description…



HBox(children=(IntProgress(value=0, description='Extract categories', max=287, style=ProgressStyle(description…



HBox(children=(IntProgress(value=0, description='Extract categories', max=264, style=ProgressStyle(description…



HBox(children=(IntProgress(value=0, description='Extract categories', max=278, style=ProgressStyle(description…



HBox(children=(IntProgress(value=0, description='Extract categories', max=268, style=ProgressStyle(description…



HBox(children=(IntProgress(value=0, description='Extract categories', max=261, style=ProgressStyle(description…



HBox(children=(IntProgress(value=0, description='Extract categories', max=249, style=ProgressStyle(description…



HBox(children=(IntProgress(value=0, description='Extract categories', max=280, style=ProgressStyle(description…



HBox(children=(IntProgress(value=0, description='Extract categories', max=265, style=ProgressStyle(description…



HBox(children=(IntProgress(value=0, description='Extract categories', max=286, style=ProgressStyle(description…



HBox(children=(IntProgress(value=0, description='Extract categories', max=284, style=ProgressStyle(description…



HBox(children=(IntProgress(value=0, description='Extract categories', max=287, style=ProgressStyle(description…



HBox(children=(IntProgress(value=0, description='Extract categories', max=289, style=ProgressStyle(description…



HBox(children=(IntProgress(value=0, description='Extract categories', max=248, style=ProgressStyle(description…



HBox(children=(IntProgress(value=0, description='Extract categories', max=286, style=ProgressStyle(description…



HBox(children=(IntProgress(value=0, description='Extract categories', max=289, style=ProgressStyle(description…



HBox(children=(IntProgress(value=0, description='Extract categories', max=286, style=ProgressStyle(description…



HBox(children=(IntProgress(value=0, description='Extract categories', max=289, style=ProgressStyle(description…



HBox(children=(IntProgress(value=0, description='Extract categories', max=278, style=ProgressStyle(description…



HBox(children=(IntProgress(value=0, description='Extract categories', max=279, style=ProgressStyle(description…



HBox(children=(IntProgress(value=0, description='Extract categories', max=286, style=ProgressStyle(description…



HBox(children=(IntProgress(value=0, description='Extract categories', max=268, style=ProgressStyle(description…



HBox(children=(IntProgress(value=0, description='Extract categories', max=264, style=ProgressStyle(description…



HBox(children=(IntProgress(value=0, description='Extract categories', max=268, style=ProgressStyle(description…



HBox(children=(IntProgress(value=0, description='Extract categories', max=261, style=ProgressStyle(description…



HBox(children=(IntProgress(value=0, description='Extract categories', max=278, style=ProgressStyle(description…



HBox(children=(IntProgress(value=0, description='Extract categories', max=279, style=ProgressStyle(description…



HBox(children=(IntProgress(value=0, description='Extract categories', max=264, style=ProgressStyle(description…



HBox(children=(IntProgress(value=0, description='Extract categories', max=264, style=ProgressStyle(description…



HBox(children=(IntProgress(value=0, description='Extract categories', max=264, style=ProgressStyle(description…



HBox(children=(IntProgress(value=0, description='Extract categories', max=280, style=ProgressStyle(description…



HBox(children=(IntProgress(value=0, description='Extract categories', max=261, style=ProgressStyle(description…



HBox(children=(IntProgress(value=0, description='Extract categories', max=287, style=ProgressStyle(description…



HBox(children=(IntProgress(value=0, description='Extract categories', max=287, style=ProgressStyle(description…



HBox(children=(IntProgress(value=0, description='Extract categories', max=264, style=ProgressStyle(description…



HBox(children=(IntProgress(value=0, description='Extract categories', max=249, style=ProgressStyle(description…



HBox(children=(IntProgress(value=0, description='Extract categories', max=264, style=ProgressStyle(description…



HBox(children=(IntProgress(value=0, description='Extract categories', max=268, style=ProgressStyle(description…



HBox(children=(IntProgress(value=0, description='Extract categories', max=285, style=ProgressStyle(description…



HBox(children=(IntProgress(value=0, description='Extract categories', max=278, style=ProgressStyle(description…



HBox(children=(IntProgress(value=0, description='Extract categories', max=280, style=ProgressStyle(description…



HBox(children=(IntProgress(value=0, description='Extract categories', max=268, style=ProgressStyle(description…



HBox(children=(IntProgress(value=0, description='Extract categories', max=286, style=ProgressStyle(description…



HBox(children=(IntProgress(value=0, description='Extract categories', max=278, style=ProgressStyle(description…



HBox(children=(IntProgress(value=0, description='Extract categories', max=284, style=ProgressStyle(description…



HBox(children=(IntProgress(value=0, description='Extract categories', max=268, style=ProgressStyle(description…



HBox(children=(IntProgress(value=0, description='Extract categories', max=283, style=ProgressStyle(description…



HBox(children=(IntProgress(value=0, description='Extract categories', max=287, style=ProgressStyle(description…



HBox(children=(IntProgress(value=0, description='Extract categories', max=278, style=ProgressStyle(description…



HBox(children=(IntProgress(value=0, description='Extract categories', max=308, style=ProgressStyle(description…



HBox(children=(IntProgress(value=0, description='Extract categories', max=286, style=ProgressStyle(description…



HBox(children=(IntProgress(value=0, description='Extract categories', max=264, style=ProgressStyle(description…



HBox(children=(IntProgress(value=0, description='Extract categories', max=264, style=ProgressStyle(description…



HBox(children=(IntProgress(value=0, description='Extract categories', max=285, style=ProgressStyle(description…



HBox(children=(IntProgress(value=0, description='Extract categories', max=268, style=ProgressStyle(description…



In [59]:
considered_groups = ['politicians', 'athletes', 'actors', 'musicians', 'celebrities']
summary, summary_in_html = download_category(*considered_groups, min_no_of_users=5000)

HBox(children=(IntProgress(value=0, description='Getting category users', max=5, style=ProgressStyle(descripti…

HBox(children=(IntProgress(value=0, description='Downloding page of politicians', max=250, style=ProgressStyle…

HBox(children=(IntProgress(value=0, description='Extracting usernames', max=41, style=ProgressStyle(descriptio…

HBox(children=(IntProgress(value=0, description='Downloding page of athletes', max=250, style=ProgressStyle(de…

HBox(children=(IntProgress(value=0, description='Extracting usernames', max=200, style=ProgressStyle(descripti…

HBox(children=(IntProgress(value=0, description='Downloding page of actors', max=250, style=ProgressStyle(desc…

HBox(children=(IntProgress(value=0, description='Extracting usernames', max=156, style=ProgressStyle(descripti…

HBox(children=(IntProgress(value=0, description='Downloding page of musicians', max=250, style=ProgressStyle(d…

HBox(children=(IntProgress(value=0, description='Extracting usernames', max=200, style=ProgressStyle(descripti…

HBox(children=(IntProgress(value=0, description='Downloding page of celebrities', max=250, style=ProgressStyle…

HBox(children=(IntProgress(value=0, description='Extracting usernames', max=200, style=ProgressStyle(descripti…



'Summary'

In [61]:
HTML(summary_html(dict(sorted(summary.items(), key=lambda kv: len(kv[1]), reverse=True)), 'Summary'))

'Summary'

Category,Count
athletes,200
musicians,200
celebrities,200
actors,156
politicians,41


In [107]:
groups = {}
groups = {k: v for k, v in summary.items() if k in considered_groups}
display(HTML(summary_html(groups, 'Before deletion')))

# merger
groups['politicians'].extend(list(pd.read_csv('dataset/politicians.csv', header=0).screen_name))

# filtering duplicates and exporting to files
filtered_groups = {}
for group, users in groups.items():
    filtered_groups[group] = set(users) - set(chain(*[v for k, v in groups.items() if k is not group]))
    with open(f'dataset/nicknames/{group}.txt', mode='w') as f:
        f.write('\n'.join(filtered_groups[group]))
        
display(HTML(summary_html(filtered_groups, 'After deletion')))

'Before deletion'

Category,Count
politicians,6721
athletes,200
actors,156
musicians,200
celebrities,200


'After deletion'

Category,Count
politicians,702
athletes,163
actors,108
musicians,108
celebrities,45
