<h1>Ditchley S2DS project August 2020 - Code Pipeline</h1>
    <h2>Team: Adam Hawken, Luca Lamoni, Elizabeth Nicholson, Robert Webster</h2>

In [None]:
# TODO: add a graphical representation of the pipeline here, e.g. ![]() in a markdown cell

<h3>Section 0: Working directory and graph DB setup</h3>
    <h4>0.1: Modules and working directory setup</h4>

In [42]:
# Import modules and set up working directory
import sys
import os
import time
import logging
import json
import csv
import threading
import queue
import asyncio 
import nest_asyncio
nest_asyncio.apply()
import twint
import pandas as pd


# Set up working directory
# The working directory should reflect the structure of the Github repository https://github.com/S2DSLondon/Aug20_Ditchley
# The repository root is taken from the DITCHLEY_REPO_ROOT environment variable so the notebook
# can run on any machine; when it is not set, the original author's path is used as before.
REPO_ROOT = os.environ.get('DITCHLEY_REPO_ROOT', 'C:/Users/Luca/Aug20_Ditchley/')
# Put the repository root on the import path so that the `src` package can be imported
sys.path.insert(1, REPO_ROOT)
from src.data import pipeline_setup
# Build the data directory and its sub-directories under the repository root
# (the recorded output shows the call is skipped when they already exist)
pipeline_setup.build_data_dir(REPO_ROOT)

Data directory & sub-directories already exist, skipping.


<h4>0.2: Initialize graph database</h4>

In [None]:
# import standard libraries
import numpy as np
import pandas as pd
from py2neo import Graph
from py2neo.data import Node, Relationship
from src.data import graphdb as gdb

# load / declare the database
graph = gdb.get_graph(new_graph = True)
# start with an empty graph: remove everything currently stored in the database
# (the stray bare `graph` expression that used to sit here was removed: it was not the
# last statement of the cell, so it neither displayed nor changed anything)
graph.delete_all()

<h3>Section 1: Getting journalist twitter handles according to a keyword</h3>
    <h4>The journalist scraping is performed at the web address https://www.journalism.co.uk/prof/?chunk=0&cmd=default</h4>

In [2]:
# Choose keyword and run the scraping function
from src.data import journalists as journos
# `keyword` selects the journalists and is reused below as the prefix of the csv file names that are written
keyword = 'cyber'
# Input: string / Output: list
journo_handles = journos.get_handles_by_keyword(keyword)
# Print how many handles were found, then display the list itself as the cell output
print(len(journo_handles))
journo_handles

12


['jennystrasburg',
 'dannsimmons',
 'LeoKelion',
 'gordoncorera',
 'joetidy',
 '_lucyingham',
 'dannyjpalmer',
 'SophiaFurber',
 'SCFGallagher',
 'MsHannahMurphy',
 'JesscaHaworth',
 'Ad_Nauseum74']

<h3>Section 2. Scrape user information and friend lists for each journalist in the list</h3>
    <h4>2.1: Scrape user information using the Twitter API</h4>

In [None]:
# Load twitter API credentials and return a tweepy API instance
import json
import tweepy
from src.data import api_tweepy as api

# Input: path of json file with credentials / Output: tweepy.api.API
# The credentials live in a json file, not in the notebook; the relative path is resolved
# against the current working directory.
tw_api = api.connect_API('../src/data/twitter_credentials.json')

In [None]:
# Scrape user information using the API
from src.data import api_user_tools as api_tools
from src.data import data_cleanup as dc

# Request the user information for the journalist handles collected in Section 1
# Input: tweepy.api.API,list / Output: list
api_users = api_tools.batch_request_user_info(tw_api,journo_handles)
# Convert the list returned by the API helper into a table
# Input: list / Output: DataFrame
df_api = dc.populate_user_df(api_users)
# Check
df_api.head()

In [None]:
# Save the dataframe as csv
# The file name is prefixed with `keyword` ('<keyword>_user_profiles.csv') and the index is not written
df_api.to_csv('../data/processed/'+keyword+'_user_profiles.csv', index = False)

<h4>2.2: Load user info into graph DB</h4>

In [None]:
# Neo4j import files need to be in a specific folder, however, the csv files saved above are in a different folder, to go around this problem on Windows machines it is
# possible to create a shortcut between the two folders

# load in user information
print('Loading in user information and drawing (Person) nodes')
# The file name is derived from `keyword` so that it matches the csv written in section 2.1
# ('<keyword>_user_profiles.csv'); it used to be hard-coded with a 'cybersecurity_' prefix,
# which does not match the file produced when keyword = 'cyber'.
fn_users = keyword + '_user_profiles.csv'
gdb.load_users(fn_users, graph)

<h4>2.3: Scrape user friend list using Twint</h4>

In [None]:
from src.data import twint_tools as tt

# Options handed to the scraping function:
#   'n_retries' = max number of scrape attempts
#   'suppress'  = hide critical Twint warnings
kwargs = dict(n_retries=5, suppress=False)

# Multi-threaded run. Arguments, in order: the function to run (_get_friends), the number of
# threads the queue is distributed over, the handles to process, then the extra args / kwargs.
# The single extra positional arg is the raw-data path prefix built from `keyword`.
tt.twint_in_queue(
    tt._get_friends,
    6,
    journo_handles,
    args=('../data/raw/'+keyword+'_',),
    kwargs=kwargs,
)

In [None]:
# Concatenate all the individual lists into one dataframe with journalist and its friends
friends_csv = tt.join_friends_csv(journo_handles,keyword)
# Known bug in join_friends_csv: the csv header value 'username' leaks in as the first
# friend name. Drop those rows here so the placeholder is not saved or loaded as a real handle.
# NOTE(review): this would also drop a genuine friend whose handle is literally 'username'.
friends_csv = friends_csv[friends_csv['friend'] != 'username']

In [None]:
# Save the dataframe as csv
# This '<keyword>_journalist_friends.csv' file is read back in section 7.1
friends_csv.to_csv('../data/processed/'+keyword+'_journalist_friends.csv', index=False)

<h4>2.4: Load friend information into DB</h4>

In [None]:
# load in friend information
print('Loading in friends info and drawing [FOLLOWS] edges')
# The file name is derived from `keyword` so that it matches the csv written in section 2.3
# ('<keyword>_journalist_friends.csv') instead of a hard-coded 'cybersecurity_' prefix.
fn_friends = keyword + '_journalist_friends.csv'
gdb.load_friends(fn_friends, graph)

<h3>Section 3. Loop over selected journalists' handles and scrape their tweets (3.1) and mentions (3.2) using Twint</h3>
    <h4>Section 3.1: Scrape tweets using Twint</h4>

In [None]:
from src.data import twint_tools as tt

# Options handed to the scraping function: the date range starts on 2020-08-01 and its
# second element is left as None; up to 5 scrape attempts; Twint warnings are not suppressed.
kwargs = dict(
    date_range=('2020-08-01 00:00:00', None),
    n_retries=5,
    suppress=False,
)

# Multi-threaded run of _search_tweets_by_user over the journalist handles with 3 threads;
# the single extra positional arg is the raw-data path prefix built from `keyword`.
tt.twint_in_queue(
    tt._search_tweets_by_user,
    3,
    journo_handles,
    args=('../data/raw/'+keyword+'_',),
    kwargs=kwargs,
)

In [None]:
# Join all the individual csv files into one dataframe
# (the same handles and keyword that were used for scraping in the previous cell are passed in)
cyber_test = tt.join_tweet_csv(journo_handles, keyword)
# Check
cyber_test.head()

In [None]:
# Save dataframe as csv ('<keyword>_journalist_tweets_twint.csv', index not written)
cyber_test.to_csv('../data/processed/'+keyword+'_journalist_tweets_twint.csv', index=False)

<h4>Section 3.2: Extract mentions from Twint dataset</h4>

In [None]:
from src.data import data_cleanup as dc
# From the twint dataset, extract mentions based on tweet id
# (the result is saved to a separate csv in the next cell)
mentions_twint  = dc.mentions_to_df(cyber_test)
# Check
mentions_twint.head()

In [None]:
# Save the dataframe as '<keyword>_mentions_twint.csv' (index not written)
mentions_twint.to_csv('../data/processed/' + keyword + '_mentions_twint.csv',index=False)

<h3>Section 4. Loop over selected journalists' handles and scrape their tweets (4.1) and mentions (4.2) using Twitter API</h3>
    <h4>Section 4.1: Scrape tweets using Twitter API</h4>

In [None]:
import json
import tweepy
from src.data import api_tweepy as api
# Load twitter API credentials and return a tweepy API instance
# (same call as in section 2.1 — presumably repeated so this section can be run on its own)
tw_api = api.connect_API('../src/data/twitter_credentials.json')

In [None]:
from src.data.api_tweet_tools import request_user_timeline, batch_request_user_timeline
# Request the timelines of the journalist handles with n_tweets=3200.
# '../data/processed/' is the third positional argument — presumably the `filepath`
# parameter that section 9 passes by keyword; confirm in api_tweet_tools.
# NOTE(review): the `.head()` below assumes the call returns a DataFrame — confirm.
cyber_test_api = batch_request_user_timeline(tw_api, journo_handles, '../data/processed/',  n_tweets=3200)

# Check
cyber_test_api.head()

<h4>Section 4.2: Extract mentions from API tweets</h4>

In [None]:
from src.data import data_cleanup as dc
# From the API dataset, extract mentions based on tweet id
# (the result is saved to a separate csv in the next cell)
mentions_api  = dc.mentions_to_df(cyber_test_api)
# Check
mentions_api.head()

In [None]:
# Save the API mentions as '<keyword>_mentions_api.csv' (index not written)
mentions_api.to_csv('../data/processed/' + keyword + '_mentions_api.csv',index=False)

<h3>Section 5. Data cleaning and standardization/LDA</h3>
     <h4>Section 5.1: Clean and standardize Twint dataset</h4>

In [None]:
# Standardise the twint output
from src.data import data_cleanup as dc

# Standardize Twint dataset for graph DB loading
# (input is the joined twint tweet dataframe built in section 3.1)
standard_tweet_twint = dc.clean_twint_dataframe(cyber_test)
# Check
standard_tweet_twint.head()

In [None]:
# Save the dataframe as '<keyword>_standard_tweets_twint.csv' (index not written)
standard_tweet_twint.to_csv('../data/processed/' + keyword + '_standard_tweets_twint.csv',index=False)

<h4>Section 5.2: Clean and standardize API dataset</h4>

In [None]:
# Standardise the API output
from src.data import data_cleanup as dc

# Standardize API dataset for graph DB loading
# (input is the API tweet dataframe obtained in section 4.1)
standard_tweet_api = dc.clean_API_dataframe(cyber_test_api)
# Check
standard_tweet_api.head()

In [None]:
# Save the dataframe as '<keyword>_standard_tweets_api.csv' (index not written)
standard_tweet_api.to_csv('../data/processed/' + keyword + '_standard_tweets_api.csv',index=False)

<h3>Section 6. Create graph database and import twitter data into it</h3>
    <h4>Section 6.1: Import modules and load graph database</h4>

In [None]:
# import standard libraries
import numpy as np
import pandas as pd
from py2neo import Graph
from py2neo.data import Node, Relationship
from src.data import graphdb as gdb

# load / declare the database
# NOTE(review): this repeats the get_graph(new_graph = True) call from section 0.2 —
# confirm whether new_graph = True discards what sections 2.2 and 2.4 already loaded.
graph = gdb.get_graph(new_graph = True)
# Display the graph object as the cell output
graph

<h4>Section 6.2: Load user info into graph DB</h4>

In [41]:
# Neo4j import files need to be in a specific folder, however, the csv files saved above are in a different folder, to go around this problem on Windows machines it is
# possible to create a shortcut between the two folders

# load in user information
# NOTE: `gdb` and `graph` come from the section 6.1 cell; the NameError recorded in this
# cell's output appears when that cell has not been run in the current kernel.
print('Loading in user information and drawing (Person) nodes')
# The file name is derived from `keyword` so that it matches the csv written in section 2.1
# ('<keyword>_user_profiles.csv') instead of a hard-coded 'cybersecurity_' prefix.
fn_users = keyword + '_user_profiles.csv'
gdb.load_users(fn_users, graph)

Loading in user information and drawing (Person) nodes


NameError: name 'gdb' is not defined

<h4>Section 6.2: Load friend information into DB</h4>

In [None]:
# load in friend information
print('Loading in friends info and drawing [FOLLOWS] edges')
# The file name is derived from `keyword` so that it matches the csv written in section 2.3
# ('<keyword>_journalist_friends.csv') instead of a hard-coded 'cybersecurity_' prefix.
fn_friends = keyword + '_journalist_friends.csv'
gdb.load_friends(fn_friends, graph)

<h4>Section 6.3: Load tweet data into DB</h4>

In [None]:
# load in tweet information from twint
print('Loading in tweets and drawing (Tweet) nodes')
# The file name is derived from `keyword` so that it matches the csv written in section 5.1
# ('<keyword>_standard_tweets_twint.csv') instead of a hard-coded 'cybersecurity_' prefix.
# The '/data/processed/' prefix is kept exactly as before.
fn_tweets = '/data/processed/' + keyword + '_standard_tweets_twint.csv'
gdb.load_tweets(fn_tweets, graph)

In [None]:
# load in tweet information from API
print('Loading in tweets and drawing (Tweet) nodes')
# The file name is derived from `keyword` so that it matches the csv written in section 5.2
# ('<keyword>_standard_tweets_api.csv') instead of a hard-coded 'cybersecurity_' prefix.
# The '/data/processed/' prefix is kept exactly as before.
fn_tweets = '/data/processed/' + keyword + '_standard_tweets_api.csv'
gdb.load_tweets(fn_tweets, graph)

<h4>Section 6.4: Draw edges between users and their tweets</h4>

In [None]:
# draw edges between users and their tweets
print('Drawing [POSTS] edges')
# Operates on the nodes already in `graph`; no file name is passed to this step
gdb.get_posts(graph)


<h4>Section 6.5: Load tweets' mentions</h4>

In [None]:
# load in mentions information
print('Loading in mentions and drawing [MENTIONS] edges')
# The file name is derived from `keyword` so that it matches the csv written in section 3.2
# ('<keyword>_mentions_twint.csv') instead of a hard-coded 'cybersecurity_' prefix.
fn_mentions = keyword + '_mentions_twint.csv'
gdb.load_mentions(fn_mentions, graph)

<h4>Section 6.6: Run page rank algorithm using [FOLLOWS] [MENTIONS] edges</h4>

In [None]:
# Run PageRank over the follower and mention edges
print('running page rank')
# Node labels and relationship types handed to the PageRank helper
nodelist, edgelist = ['Person','Tweet'], ['FOLLOWS','MENTIONS']
page_rank_friends_mentions = gdb.run_pagerank(
    nodelist,
    edgelist,
    graph,
)

<h4>Section 6.7: Get a weighted random sample from the journalists' friends</h4>

In [None]:
# Draw a weighted random sample of users from the PageRank result
# Sample size first, then the field(s) used as weights and one exponent per field
# (presumably each field is raised to its exponent before sampling — confirm in graphdb)
n_sample = 20
fields, exponents = ['rank'], [2]
sample = gdb.get_multiple_weighted_sample(
    page_rank_friends_mentions,
    n_sample,
    fields,
    exponents,
)

<h3>Section 7. Download friends user info</h3>
    <h4>7.1. Join journalists list with friends list</h4>

**TODO:** Maybe this step is not necessary? What if we join the dataframes of the friends' user lists and the initial journalists instead?

In [None]:
# We do this step to download an equal number of tweets from all users in order to carry out the topic analysis on the same tweet sample
df_friends_handles = pd.read_csv('../data/processed/'+keyword+'_journalist_friends.csv')
# Stack the unique journalist handles on top of their friends' handles and drop repeats.
# Both inputs are Series, so the result does not depend on how pandas aligns a named
# Series against a DataFrame column (the previous DataFrame + Series concat relied on that).
df_user_list_extended = (
    pd.concat(
        [pd.Series(df_friends_handles.screen_name.unique()), df_friends_handles.friend],
        ignore_index=True,
    )
    .drop_duplicates()
    .reset_index(drop=True)
    .to_frame(name='user')
)
# Plain Python list of every handle (journalists first, then friends)
# (the misspelled duplicate `freinds` was removed; nothing in the notebook reads it)
friends = df_user_list_extended.user.tolist()
#friends.remove('hoarsewisperer')

In [None]:
# Spot-check a single handle from the extended list
# NOTE(review): the index 140 is hard-coded; this raises IndexError when the list has fewer than 141 entries
friends[140]

<h4>7.2. Download the user information for the new extended list of users</h4>

In [None]:
#Load twitter API credentials and return a tweepy API instance
import json
import tweepy
from src.data import api_tweepy as api

# Input: path of json file with credentials / Output: tweepy.api.API
tw_api = api.connect_API('../src/data/twitter_credentials.json')

# Scrape user information using the API
from src.data import api_user_tools as api_tools
from src.data import data_cleanup as dc
#from src.data.api_tweet_tools import request_user_timeline, batch_request_user_timeline
# Same request as in section 2.1, but for the extended `friends` list built in section 7.1
# Input: tweepy.api.API,list / Output: list
api_users_friends = api_tools.batch_request_user_info(tw_api, friends)
# Input: list / Output: DataFrame
df_api_user_friends = dc.populate_user_df(api_users_friends)
# Save the dataframe as csv ('<keyword>_user_friends_profiles.csv', read back in section 10.2)
df_api_user_friends.to_csv('../data/processed/'+keyword+'_user_friends_profiles.csv', index = False)
# Check
df_api_user_friends.head()

<h3>Section 8. Subsample profiles based on follower/friends distribution</h3>

In [40]:
# NOTE(review): `df_users_profiles_metrics` is only created in section 10.2 further down, so this
# cell fails on a fresh top-to-bottom run — run section 10.2 first or move this cell after it.
# NOTE(review): graphdb is imported from `src.graph` here but from `src.data` in the other
# cells of this notebook — confirm which module path is current.
from src.graph import graphdb as g_db
# The recorded output of this call is the profiles table with a `chi2` column
g_db.get_chi2(df_users_profiles_metrics)

Unnamed: 0,user_id,screen_name,name,location,user_description,user_friends_n,user_followers_n,prof_created_at,favourites_count,verified,statuses_count,retweet_count,like_count,h-index_like&retweets,chi2
0,244169661,jennystrasburg,jenny strasburg,London,wsj reporter new mexican in london cyber crime...,2852,5583,2011-01-28 17:33:33,5002,True,8444,216,344,9,0.757568
2,21644992,dannsimmons,Dan Simmons,"London, England",bbc technology reporter on specialises in mobi...,295,12457,2009-02-23 10:53:15,834,True,2247,215,1019,8,0.590327
4,19883587,leokelion,Leo Kelion,London,technology desk editor of bbc news recently wo...,4774,11865,2009-02-01 23:40:21,3482,True,4161,1057,2415,9,1.350548
10,242355547,gordoncorera,Gordon Corera,,bbc security correspondent author russians amo...,1293,13706,2011-01-24 15:59:31,244,False,1792,4007,7260,38,0.139076
12,16527091,joetidy,Joe Tidy,DMs Open,cyber reporter for bbc news covering cyber sec...,2722,11002,2008-09-30 12:51:28,8447,True,13518,341,1134,16,0.629276
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18543,798873782211448833,notameadow,🌻 𝙼𝚎𝚊𝚍𝚘𝚠 0x1338 𝙴𝚕𝚕𝚒𝚜 🌻,"London, England",foil sealed for freshness of what i say here i...,1676,14213,2016-11-16 13:02:06,58596,False,44598,31,569,12,0.262938
18544,519989509,mozillasecurity,Mozilla Security,,official account of mozilla security,55,686,2012-03-10 01:36:23,23,False,68,348,472,10,3.488539
18545,804230797993443328,sudo_sudoka,Sudoka,The Matrix (Inside now),threat analyst bounty hunter ctf player securi...,151,385,2016-12-01 07:48:58,967,False,268,234,732,7,2.215415
18546,836272957,nordvpn,NordVPN,Privacy solutions,everyone deserves a secure private and unrestr...,3515,77283,2012-09-20 20:07:18,2634,True,19989,238,1195,25,1.576031


<h3>Section 9. Download 200 tweets for each user</h3>

In [None]:
# Preview the first 10 handles of the extended user list built in section 7.1
list(df_user_list_extended.user[:10])

In [None]:
#Load twitter API credentials and return a tweepy API instance
import json
import tweepy
from src.data import api_tweepy as api

# Input: path of json file with credentials / Output: tweepy.api.API
tw_api = api.connect_API('../src/data/twitter_credentials.json')

from src.data.api_tweet_tools import request_user_timeline, batch_request_user_timeline
# Earlier small-scale trial call, kept for reference:
#batch_request_user_timeline(tw_api, list(df_user_list_extended.user[:100]), filepath = '../data/processed/', api_delay = .3, n_tweets=2)
# Full run: n_tweets=200 for every handle in `friends`, with api_delay=0.1 and
# filepath='../data/raw/users_tweets/' — the folder that section 10.1 reads from.
# NOTE(review): this is the last expression of the cell, so any return value is displayed in full.
batch_request_user_timeline(tw_api, friends, filepath = '../data/raw/users_tweets/', api_delay = 0.1, n_tweets=200)

<h3>Section 10. Topic modelling and H-index calculation</h3>
    <h4>10.1. Tweet cleaning and standardization</h4>

In [None]:
# Clean the data
from src.data import data_cleanup as dc
import os
from tqdm.auto import tqdm
import numpy as np
import pandas as pd

# Raw tweet files downloaded in section 9, and the folder the cleaned copies are written to
src_dir = '../data/raw/users_tweets/'
dest_dir = '../data/cleaned/users_tweets/'

# exist_ok=True creates the folder only when it is missing, replacing the separate
# existence check (which could fail if the folder appeared between check and creation)
os.makedirs(dest_dir, exist_ok=True)

# Only regular files directly inside src_dir are processed; sub-directories are skipped
files = [file for file in os.listdir(src_dir) if os.path.isfile(os.path.join(src_dir, file))]

# Clean every raw csv and write the result under the same file name in dest_dir;
# tqdm wraps the list directly so the progress bar advances once per file
for file in tqdm(files, desc='Files'):
    raw_df = pd.read_csv(os.path.join(src_dir, file), low_memory=False)
    cleaned_df = dc.clean_API_dataframe(raw_df)
    cleaned_df.to_csv(os.path.join(dest_dir, file), index=False)

<h4>10.2. Calculate H-Index for each user (excluding RT)</h4>

In [3]:
from src.data import H_Index_tools as h_tools
import os
from tqdm.auto import tqdm
import numpy as np
import pandas as pd

# Input folder: the cleaned tweets written in section 10.1; output folder: the processed data folder
src_dir = '../data/cleaned/users_tweets/'
dest_dir = '../data/processed/'
# NOTE(review): presumably this writes the '<keyword>_like_rt_count_users.csv' and
# '<keyword>_h_index_users.csv' files that the next cell reads — confirm in H_Index_tools
h_tools.loop_csv_H_index(src_dir,dest_dir, keyword)

In [37]:
# Read the journalists' profiles and the friends' profiles, then stack them into one table
user_profiles = pd.read_csv('../data/processed/'+keyword+'_user_profiles.csv' )
user_friends_profiles = pd.read_csv('../data/processed/'+keyword+'_user_friends_profiles.csv' )
users_df = pd.concat([user_profiles,user_friends_profiles])

# Per-user metrics: the sums of likes and retweets, and the H-Index
like_rt_count_users = pd.read_csv('../data/processed/'+keyword+'_like_rt_count_users.csv' )
h_index_users = pd.read_csv('../data/processed/'+keyword+'_h_index_users.csv' )

# Merge 1: attach the like / retweet sums (inner join on the handle)
df_user_profiles_metrics = users_df.merge(like_rt_count_users, how='inner', on='screen_name')

# Merge 2: attach the H-Index, then keep the first row per handle
# (the journalists' rows are sometimes repeated)
df_users_profiles_metrics = (
    df_user_profiles_metrics
    .merge(h_index_users, how='inner', on='screen_name')
    .drop_duplicates(subset='screen_name', keep='first')
)

# Save final df
df_users_profiles_metrics.to_csv('../data/processed/'+keyword+'_user_profiles_metrics.csv', index = False)

<h4>10.3. Topic modelling</h4>