## Camille Hansen 
# A NETWORK MODELLING BASED SOCIAL BOT DETECTION   


# Set Up 

In a Jupyter notebook, shell commands can be run directly from a cell by prefixing them with '!'. 
I used this to install Tweepy, Pandas and python-igraph using "!pip install".

In [None]:
### Import Packages
import pickle

import igraph
import networkx as nx  # was misspelled "nextworkx"; the graph cells in this notebook call it through the `nx` alias
import numpy as np
import pandas as pd
import tweepy
#import twitter  #used for troubleshooting - ultimately tweepy was preferred for the twitter API

In [None]:
### Twitter Developer Credentials 
# The real keys are never written into the notebook (privacy and accountability).
# Each value is read from an environment variable when one is set; otherwise the
# original placeholder string is used, so by default the cell behaves as before.
import os

consumer_key = os.environ.get("TWITTER_CONSUMER_KEY", "consumer_key")
consumer_secret = os.environ.get("TWITTER_CONSUMER_SECRET", "consumer_secret_key")
access_token = os.environ.get("TWITTER_ACCESS_TOKEN", "access_token")
access_token_secret = os.environ.get("TWITTER_ACCESS_TOKEN_SECRET", "access_token_secret")

In [None]:
### Tweepy Authorisation 
# Build the tweepy OAuth handler from the application keys, then attach the access tokens.
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)

# API client used by every mining cell; both rate-limit options are switched on.
api_tweepy = tweepy.API(
    auth,
    wait_on_rate_limit=True,
    wait_on_rate_limit_notify=True,
)

# Data Mining 

## Mining Root User
Get Twitter data about the chosen root user.

In [None]:
# Working from the fake-accounts dataset: look the account up by its user id to check its current screen name.
# (The id is a unique value Twitter creates that won't change over time or be reused by new accounts.)
u = api_tweepy.get_user(id=123456) 
print(u) # read the screen_name from this output - it is the user's current screen name

In [None]:
# Assign the root user: store their screen_name in a variable that the mining cells pass to the API
screen_name = "screen_name" 

In [None]:
#### Only needs to be run once - DO NOT RUN WHEN STARTING A NEW SESSION 

# Fetch the root user's Twitter profile through tweepy (returned as a tweepy User object) and print it
user = api_tweepy.get_user(screen_name)
print(user)

## Mining Friends of Root User 
Forming a list of the root user's friends and storing it as a pickled file.
I need to store the data because the CPU- and time-heavy data mining process is not something I want to repeat. 
I chose pickle over CSV because pickle is a serialized way of storing a Pandas dataframe, meaning I am writing the exact representation of the dataframe to disk. CSV stores a comma-separated list, meaning some information may be lost when it is loaded back in. 

In [None]:
#### Only needs to be run once - DO NOT RUN WHEN STARTING A NEW SESSION 

# Page through the root user's friend ids with a tweepy Cursor and collect them all in one list
rootUser = list(tweepy.Cursor(api_tweepy.friends_ids, screen_name).items())

# Wrap the friend list in a single-column dataframe and persist it as a pickle.
# Note: the column is called 'screen_name' but it holds the values returned by friends_ids.
root_user_df = pd.DataFrame({'screen_name': rootUser})
root_user_df.to_pickle('screen_name_rootUser_df.pkl')

### Output 
A list of the root user's friends in a data frame   

In [None]:
#### When starting a new session - RUN IN CONSECUTIVE SESSIONS - reads the pre-generated pkl file 

# Reload the root user's friend list from the pickle written by the mining cell
# NOTE: only unpickle files you created yourself - loading an untrusted pickle can execute arbitrary code
root_user_df = pd.read_pickle('screen_name_rootUser_df.pkl')
root_user_df

## Mining Friends of Friends
Mining the data needed for a **1-step Neighbourhood** by extending the friends dataframe to include the friends of friends that appear in the root user's friend list. 

This data frame is shorter than the one above, as some users were restricted and had to be removed because their data could not be mined.  

In [None]:
#### Only needs to run until the fof data mining completes - DO NOT RUN WHEN STARTING A NEW SESSION 

# Friend list of the root user. It defines the neighbourhood AND the common column
# length of fof_df, so it must stay unchanged for the whole mining run.
user_List = root_user_df['screen_name'].values.tolist()
neighbourhood_ids = set(user_List)  # constant-time membership test for the filter below

# Work queue of accounts still to be mined. This has to be a real copy: a plain
# `user_Active = user_List` only aliases the same list, so removing finished accounts
# would also shrink the neighbourhood filter and the padding length.
user_Active = list(user_List)

# empty friends-of-friends dataframe: one column per account that could be mined
fof_df = pd.DataFrame()

# Iterate over a snapshot so that removing entries from user_Active cannot skip accounts.
for friend_id in list(user_Active):
    try:
        col_name = str(friend_id)  # prefix of the checkpoint file written for this account

        # get friend-of-friend IDs using tweepy, keeping only IDs that are in the neighbourhood
        friends_list = [
            fof
            for fof in tweepy.Cursor(api_tweepy.friends_ids, id=friend_id).items()
            if fof in neighbourhood_ids
        ]

        # pad shorter lists with '' so every column has len(user_List) rows
        # (a column of a different length raises a ValueError on assignment)
        friends_list.extend([''] * (len(user_List) - len(friends_list)))

        fof_df[friend_id] = friends_list  # add the list as a column headed by friend_id

        # Checkpoint after every column to protect against unexpected system crashes;
        # all pickled files before the most recent become redundant (and are deleted manually).
        fof_df.to_pickle(col_name + 'screen_name_fof_df.pkl')

    except tweepy.TweepError as e:  # skip accounts that are restricted and document when this happens
        print(e)
        print('problem at %s' % len(user_Active))
        print('Account %s' % friend_id)

    # Mined or skipped, this exact account is finished: remove it (not whatever sits at index 0).
    user_Active.remove(friend_id)
    print(len(user_Active))  # counting down accounts left - important as this process took over 6 days to run

In [None]:
# Load the final fof dataframe from 'final_fof_df.pkl' (this file is not written by any cell in this notebook;
# presumably it is the last checkpoint from the mining cell, renamed by hand - confirm)
fof_df_final = pd.read_pickle('final_fof_df.pkl') #the final fof dataframe 
# Save it again under the name used by the rest of the notebook, for ease of documentation
fof_df_final.to_pickle('screen_name_fof_df_final.pkl')  #renaming it for ease of documentation 

### Output 
A dataframe of the friends of friends  

In [None]:
#### When starting a new session - RUN IN CONSECUTIVE SESSIONS - reads the pre-generated pkl file 

# Reload the final friends-of-friends dataframe from disk
fof_df_final = pd.read_pickle('screen_name_fof_df_final.pkl') 
fof_df_final # show the pandas dataframe of the root user's friends of friends 

# Creating 1-Step-Neighbourhood

### Removing Restricted Accounts 

In [None]:
# Drop restricted accounts: keep only the friends of the root user whose own friend
# lists could be retrieved, i.e. those that appear as a column of fof_df_final.
user_List = root_user_df['screen_name'].values.tolist()

# column headings of fof_df_final = the accounts that were not restricted
allowed_friends = list(fof_df_final.columns.values)

# accounts present in both lists (a set intersection, so the original order is not kept)
updated_user = list(set(user_List) & set(allowed_friends))

# root user's friend list without the restricted accounts
updated_user_df = pd.DataFrame({'screen_name': updated_user})
updated_user_df # show to verify

### Forming the Complete DataFrame 

In [None]:
# Combine the root user's (filtered) friend list with the friends-of-friends columns,
# then replace any NaN with 0 so later list iteration never meets a missing value.
# NOTE(review): DataFrame.join defaults to a left join on the row index, so rows of
# fof_df_final beyond len(updated_user_df) are dropped - confirm that no friend column
# holds real values in those rows.
df_complete = updated_user_df.join(fof_df_final).fillna(value=0)

# Persist: pickle to reload in later sessions, CSV to inspect manually on a personal device
df_complete.to_pickle('screen_name_df_complete.pkl')
df_complete.to_csv('screen_name_df_complete.csv')
df_complete # show to verify

### Output 
A dataframe of the entire 1-step Twitter Neighbourhood

In [None]:
#### When starting a new session - RUN IN CONSECUTIVE SESSIONS - reads the pre-generated pkl file 

# Reload the complete 1-step neighbourhood dataframe from disk
df_complete  = pd.read_pickle('screen_name_df_complete.pkl') 
df_complete # show the dataframe of the root user's friends and their friends of friends

# Transforming Dataframe to an Edgelist 
Graphing was completed in Gephi. 
Gephi is a network graphing software that creates visualisations based on inputs such as an edgelist with the headings 'source', 'target' and 'weight'. 

In [None]:
# A function to create edge weights based on the number of common connections and on whether the connection is reciprocal.
# Each common friend adds to the weight, and a reciprocated relationship makes each common friend count for more (x3 instead of x2).
# These weights were not implemented in the Gephi graphs, as degree (number of connections) created a similar statistical definition and extreme variations in weight created less succinct community modelling.
# Improving the weight attribute will be the next step in improving this method of bot detection and is discussed in detail in the associated report.
def get_weight(col, value, df=None):
    """Return the weight of the edge from account `col` to account `value`.

    The weight is based on the number of friends the two accounts have in common:

    * 3 x (common friends) when `col` appears in `value`'s friend column
      (the relationship is reciprocated),
    * 2 x (common friends) otherwise,
    * never less than 1, because zero weights upset graphing.

    Padding entries ('' from the mining step and 0 from ``fillna``) are not
    real accounts and are ignored when counting common friends.

    Parameters
    ----------
    col : str or int
        Source column of the neighbourhood dataframe: 'screen_name' or an
        account id (a numeric string is converted to int).
    value : str or int
        Target account id (a numeric string is converted to int). '' means
        "no target".
    df : pandas.DataFrame, optional
        Neighbourhood dataframe to read the friend columns from. Defaults to
        the notebook-level ``df_complete``.

    Returns
    -------
    int
        The edge weight (>= 1).

    Notes
    -----
    If `value` has no column in the dataframe (e.g. a restricted account), the
    KeyError is printed and the target is treated as having no friends.
    """
    if df is None:
        df = df_complete

    # column labels are ints (account ids) except for the 'screen_name' column
    if col != 'screen_name':
        col = int(col)
    if value != 'screen_name' and value != '':
        value = int(value)

    # friends of account a (the source), without padding values
    a_list = [ele for ele in df[col].values.tolist() if ele != '' and ele != 0]

    # friends of account b (the target), without padding values
    b_list = []
    if value != '':
        try:
            b_list = [ele for ele in df[value].values.tolist() if ele != '' and ele != 0]
        except KeyError as e:
            print(e)

    # Count common friends on the filtered lists, so shared padding is not
    # mistaken for a shared friend.
    common = len(set(a_list) & set(b_list))
    multiplier = 3 if col in b_list else 2  # reciprocated relationships weigh more

    # remove 0 weights (upsets graphing) - applies to both branches
    return max(multiplier * common, 1)

In [None]:
# Build the Gephi edge list: one row per (source, target) pair stored in df_complete.
# Rows are collected in a plain list and turned into a dataframe once; appending to a
# dataframe row by row is quadratic and DataFrame.append no longer exists in pandas 2.
edge_rows = []
for col in df_complete:
    source = str(col)  # string ids, consistent with the edge list format desired by Gephi
    for value in df_complete[col].tolist():
        if value == "" or value == 0:  # skip padding ('') and former NaN values (0)
            continue
        target = str(value)
        # get_weight determines the weight of this edge
        edge_rows.append({'source': source, 'target': target, 'weight': get_weight(source, target)})

# explicit columns keep the headings even if no edge was found
df_edgelist = pd.DataFrame(edge_rows, columns=['source', 'target', 'weight'])

df_edgelist.to_csv('screen_name_EdgeList.csv') # export to csv for Gephi
df_edgelist.to_pickle('screen_name_Edgelist.pkl') # store as pickle to reread in python for networkx and igraph modelling
df_edgelist # visually verify (must be the last expression of the cell to be displayed)

### Output 
An edge list data frame of the source and target nodes with a generated weight 

In [None]:
#### When starting a new session - RUN IN CONSECUTIVE SESSIONS - reads the pre-generated pkl file 

# Reload the edge list into df_edgelist. (It was previously assigned to df_complete,
# which overwrote the neighbourhood dataframe and left df_edgelist undefined in a new session.)
df_edgelist = pd.read_pickle('screen_name_Edgelist.pkl') 
df_edgelist

This edgelist was then transferred to Gephi for network modelling. 

# Networkx and IGraph verification of statistical findings sourced from Gephi
### Statistical Graph Values exploited for Bot Detection 
Through studying the graph in Gephi I identified a trend between spambots, socialbots and genuine users.
The trend appeared in the Average Degree attribute and Clustering Triangles attribute. 
Here I verify those numbers in IGraph and Networkx to validate the findings. 
#### These values were identical to those in Gephi, verifying the findings across modelling software. This is unsurprising given that average degree and number of triangles are not based on the modelling position of the graph, but rather are the result of a statistical calculation; it is still worth testing across modelling software to further validate the results. 

In [None]:
# Load the same space-delimited edge list four times: as a directed and an undirected
# graph in networkx, and as a directed and an undirected graph in igraph.
edgelist_txt = 'screen_name_Edgelist2.txt'
weight_spec = (('weight', int),)  # third field of each line is read as an integer weight

# networkx reads from a binary file handle
with open(edgelist_txt, 'rb') as fh:
    gx_dir = nx.read_edgelist(fh, create_using=nx.DiGraph(), data=weight_spec, delimiter=' ')
with open(edgelist_txt, 'rb') as fh:
    gx_und = nx.read_edgelist(fh, create_using=nx.Graph(), data=weight_spec, delimiter=' ')

# igraph reads the file as text in NCOL format, keeping vertex names and weights
with open(edgelist_txt, 'r', encoding='utf-8') as fh:
    ig_dir = igraph.Graph.Read_Ncol(fh, names=True, directed=True, weights=True)
with open(edgelist_txt, 'r', encoding='utf-8') as fh:
    ig_und = igraph.Graph.Read_Ncol(fh, names=True, directed=False, weights=True)

In [None]:
def triangles(g):
    """Count, for every vertex of `g`, the number of triangles it belongs to.

    `g` must provide ``cliques(min=, max=)`` yielding vertex-index triples and
    ``vcount()`` returning the number of vertices (as the igraph graphs used in
    this notebook do). Returns a list with one count per vertex index; summing
    it counts every triangle three times.
    """
    three_cliques = g.cliques(min=3, max=3)
    per_vertex = [0] * g.vcount()
    for first, second, third in three_cliques:
        for vertex in (first, second, third):
            per_vertex[vertex] += 1
    return per_vertex

### Output 
Networkx-driven calculations of number of nodes, number of edges, number of triangles and average in-degree and out-degree 

In [None]:
#Networkx Attributes 
# The statistics are computed and printed explicitly: bare expressions in the middle of a
# cell are not displayed, and nx.info() is not available in NetworkX 3.x.
n_nodes = gx_dir.number_of_nodes()
n_edges = gx_dir.number_of_edges()
# In a directed graph every edge contributes one in-degree and one out-degree,
# so both averages equal edges / nodes.
avg_degree = n_edges / n_nodes if n_nodes else 0.0
number_of_triangles = sum(nx.triangles(gx_und).values()) / 3 # triangles can only be calculated in an undirected graph; each one is counted at its 3 vertices
print('\n screen_name:')
print('Number of nodes: %d' % n_nodes)
print('Number of edges: %d' % n_edges)
print('Average in degree:  %.4f' % avg_degree)
print('Average out degree:  %.4f' % avg_degree)
print('Total Triangles:  %.2f'  %number_of_triangles )

### Output 
IGraph-driven calculations of number of nodes, number of edges, number of triangles and average degree 

In [None]:
#IGraph Attributes 
print('\n screen_name:')
igraph.summary(ig_dir)  # `summary` lives in the igraph module; the bare name is undefined here

# average in-degree and out-degree of the directed graph (0.0 for a graph without vertices)
degin = ig_dir.indegree()
avg_degin = sum(degin)/len(degin) if degin else 0.0
degout = ig_dir.outdegree()  # was indegree(), which reported the in-degree a second time
avg_degout = sum(degout)/len(degout) if degout else 0.0

#ig_und.transitivity_avglocal_undirected() #average clustering coefficient undirected 

# triangles() counts each triangle once per vertex, hence the division by 3
num_tri = triangles(ig_und)
total_triangles = sum(num_tri)/3 
print('Total Triangles:  %.2f'  %total_triangles )
print('Average in degree::  %.2f'  %avg_degin )
print('Average out degree::  %.2f'  %avg_degout )