# Code for scraping data from the Lumen database at Harvard University, using their API
## Final objective: To get details of the Twitter accounts against which the Government of India took action between May 2019 and October 2021

In [93]:
import requests
import time
import random
import textract
import validators
import os
import pandas as pd
from bs4 import BeautifulSoup

## Step 1
Collecting links to the notices sent by the government to Twitter, by scraping the first 5 search result pages of the Lumen Database.

In [53]:
# Collect the relative links ("extension urls") of individual notices from
# the Lumen search result pages for "twitter government india".

# Browser-like User-Agent sent with every search request. Adjacent string
# literals are used instead of a backslash continuation inside one literal,
# which would embed the next line's indentation in the header value.
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/92.0.4515.159 Safari/537.36',
}

# Number of search result pages to scrape (pages are numbered from 1).
n_search_pages = 5

extension_urls = []
for i in range(1, n_search_pages + 1):
    # Build the URL from adjacent literals so that no stray whitespace ends
    # up between '?' and the first query parameter.
    url = ('https://www.lumendatabase.org/notices/search?'
           'date_received_facet=1602734400000.0..1634270400000.0'
           '&page=' + str(i) + '&term=twitter+government+india')

    # Random pause between requests to avoid hammering the server.
    time.sleep(random.uniform(1, 5))
    reqs = requests.get(url, headers=headers)
    if not reqs.ok:
        # Do not parse an error page as if it were a result page.
        print('search page', i, 'returned HTTP status', reqs.status_code)
        continue
    soup = BeautifulSoup(reqs.text, 'html.parser')

    # href=True skips anchors without an href attribute, for which
    # link.get('href') would be None and the `in` test would raise.
    for link in soup.find_all('a', href=True):
        a = link['href']
        # Keep links to individual notices, not further search links.
        if 'notices' in a and 'search' not in a:
            extension_urls.append(a)
         
        
         
    

## Step 2
From the notice URLs acquired above, we use HTML tags and key words to find the specific URLs from which the documents that actually contain the account details can be downloaded.

In [55]:
# Visit every notice page and collect the URLs of its attached documents
# (links whose href contains 'file_uploads').

# NOTE(review): `auth_token` is not assigned in any cell visible in this
# notebook. It must hold a Lumen API token before this cell runs; prefer
# loading it from an environment variable over hard-coding it here.
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36',
    # Send the actual token, not the literal text "auth_token".
    "X-Authentication-Token": auth_token}
download_urls = []

base_url_1 = 'https://www.lumendatabase.org'
base_url_2 = '.html?authentication_token=' + auth_token
for ext in extension_urls:
    url = base_url_1 + str(ext) + base_url_2
    # Random pause between requests to avoid hammering the server.
    time.sleep(random.uniform(1, 5))
    # The second positional argument of requests.get is `params`; passing
    # 'html.parser' there would append it to the query string, so only the
    # URL and the headers are passed.
    r = requests.get(url, headers=headers)
    if not r.ok:
        # Do not parse an error page as if it were a notice page.
        print('notice', ext, 'returned HTTP status', r.status_code)
        continue
    soup = BeautifulSoup(r.text, 'html.parser')

    # href=True skips anchors without an href attribute, for which
    # link.get('href') would be None and the `in` test would raise.
    for link in soup.find_all('a', href=True):
        a = link['href']
        if 'file_uploads' in a:
            download_urls.append(base_url_1 + str(a))
         


## Step 3
Extracting the date of each PDF from the Unix epoch timestamp at the end of its URL and converting it to the regular YYYY-MM-DD format.

In [113]:
# The last 10 characters of every download URL are read as a Unix epoch
# (seconds) and rendered as a UTC calendar date. The format string ends
# with a space, so each entry carries one trailing blank.
dates = [
    time.strftime("%Y-%m-%d ", time.gmtime(int(download_url[-10:])))
    for download_url in download_urls
]

## Step 4
Cleaning the dates obtained above by stripping the trailing whitespace from each date string.

In [114]:
# Same dates as in `dates`, with trailing whitespace removed.
dates_cleaned = [str(raw_date).rstrip() for raw_date in dates]
    

## Step 5
Now that we have the URLs of the documents containing the account details, we run a curl command with the required arguments through Python's subprocess module to download each document, and save it with its corresponding date as the filename.

In [164]:
import subprocess

# Download every document with curl and save it as '<date>.pdf' in the
# current working directory.
# `auth_token` is the same variable the notice-page cell uses to build its
# URLs; the header must carry its value, not the literal text "auth_token".

# Fail before any download starts if the two lists are out of sync, rather
# than with an IndexError part-way through.
if len(download_urls) != len(dates_cleaned):
    raise ValueError('download_urls and dates_cleaned differ in length; '
                     're-run the date extraction cells first')

n = len(download_urls)
written_files = set()
for i in range(n):
    target = str(dates_cleaned[i]) + '.pdf'
    if target in written_files:
        # Files are named by date only, so a second document with the
        # same date replaces the first one.
        print('warning:', target, 'is overwritten by', download_urls[i])
    written_files.add(target)

    bash = ['curl', '-H', "User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36",
            '-H', "X-Authentication-Token: " + auth_token,
            download_urls[i], '-o', target]

    # subprocess.call returns curl's exit status; report failures instead
    # of ignoring them.
    status = subprocess.call(bash)
    if status != 0:
        print('curl exited with status', status, 'for', download_urls[i])


## Step 6
Once all the documents are downloaded, we use a Python script to extract text from them, specifically the URLs of the Twitter accounts that the government took action against.


In [172]:
# Maps a document name (file name without '.pdf') to the list of URLs found
# in the "List of Content Actioned in Jurisdiction" section of that document.
accounts = {}


directory = "/Users/pranathiiyer/Desktop/Comp Analysis of Social " \
            "Processes/Archive/"
for filename in os.listdir(directory):
    try:
        raw = textract.process(os.path.join(directory, filename))
        # Decode the extracted bytes instead of parsing their repr(), so
        # that real line breaks (not the two characters '\' 'n') separate
        # the entries.
        text = raw.decode('utf-8', errors='replace') if isinstance(raw, bytes) else str(raw)

        # Keep only the text between the section heading and the next
        # 'Requested' marker; a missing heading raises IndexError, which
        # is reported below.
        section = (text.split('List of Content Actioned in Jurisdiction')[1]
                   .split('Requested')[0])

        # The same key is used for lookup and for storage, so every URL of
        # a document is kept, not just the last one.
        key = filename.replace('.pdf', "")
        for element in section.splitlines():
            element = element.strip()
            if validators.url(element):
                accounts.setdefault(key, []).append(element)
    except Exception as exc:
        # Best effort: skip files that cannot be read or lack the section,
        # but say which file was skipped and why.
        print('invalid file', filename, repr(exc))
        



invalid file
invalid file


## Step 7
Finally we load the account details along with the dates that action was requested against them, in a dataframe.

In [170]:
# One row per document: the date taken from its file name, and the list of
# URLs collected for it in `accounts`.
cols = ['Date', 'Accounts whose contents were actioned']
date_col, actioned_col = cols
df = pd.DataFrame({
    date_col: list(accounts.keys()),
    actioned_col: list(accounts.values()),
})
# Widen the URL column so that long links stay readable in the rendered table.
df.style.set_properties(subset=[actioned_col], **{'width': '500px'})


Unnamed: 0,Date,Accounts whose contents were actioned
0,2019-08-27,"('https://twitter.com/syedAliGelani_/status/1160881631474978817',)"
1,2021-01-29,"('https://twitter.com/NainaMughal22',)"
2,2021-04-24,"('https://twitter.com/TSP_JALE/status/1384754443393011719',)"
3,2019-08-22,"('https://twitter.com/Yasir_Musafir',)"
4,2021-03-29,"('https://twitter.com/HarcharanBajwa2',)"
5,2021-08-20,"('https://twitter.com/faizan0008/status/1378768683061760002',)"
6,2019-08-23,"('https://twitter.com/kashmir787',)"
7,2019-05-10,"('https://twitter.com/aism_e',)"
8,2019-07-03,"('https://MundaKoshur.com',)"
9,2019-07-17,"('https://twitter.com/tankarvind/status/1147715749038792704',)"
