<div align="center"><h1><b>Step 3: Format and Generate Additional Features on Datasets</b></h1></div>

**Outline**

1. Format legitimate user data to prep for machine learning models by one hot encoding and creating numerical features
2. Calculate additional metrics for each user from their tweet dataset to quantify pattern of life
3. Perform the same steps for the Russian disinformation dataset
4. Merge datasets and save as inputs to Step 4

In [None]:
# Import necessary libraries

# For accessing Google Drive Files
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth, drive
from oauth2client.client import GoogleCredentials

# Connect and authenticate Google Drive with Google CoLab
# Authenticate the Colab user, then give PyDrive the application-default credentials
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
# At this point `drive` is still the google.colab module imported above
drive.mount('/drive')
# NOTE: this rebinds `drive` from the google.colab module to a PyDrive client.
# Every later `drive.CreateFile(...)` call goes through this client, and
# `drive.mount` is no longer reachable under this name.
drive = GoogleDrive(gauth)

import CS3315Project.tweetProcessing as tweetProcessing
import pandas as pd
import numpy as np 
import re
import json
from json import JSONDecodeError

Drive already mounted at /drive; to attempt to forcibly remount, call drive.mount("/drive", force_remount=True).
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


There are still some minor formatting differences between the two datasets, so we will format the legitimate and disinformation datasets separately to bring them into the same format.

In [None]:
# Import and format Legitimate datasets

# Fetch the legitimate user table from Google Drive and write it to a local CSV
# ('insert file id' is a placeholder that must be replaced with the real Drive file id)
leguserfile = drive.CreateFile({'id': 'insert file id'}) 
leguserfile.GetContentFile('all_leg_users_final.csv')

# Fetch the legitimate tweet table the same way
legtweetfile = drive.CreateFile({'id': 'insert file id'}) 
legtweetfile.GetContentFile('all_leg_tweets_final.csv')


# Load the two downloaded CSV files into DataFrames
leg_users = pd.read_csv('all_leg_users_final.csv')
leg_tweets = pd.read_csv('all_leg_tweets_final.csv')

  interactivity=interactivity, compiler=compiler, result=result)


Now we will preprocess each column for the users and tweets into formats that can be understood by a model, and drop the ones that are not useful.

In [None]:
# Evaluate each column in the user and tweet dataframes to determine how to change them
# (info() prints its summary and returns None, so the tuple itself is displayed as (None, None))

leg_users.info(), leg_tweets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4427 entries, 0 to 4426
Data columns (total 11 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   userid                    4427 non-null   int64  
 1   user_display_name         4427 non-null   object 
 2   user_screen_name          4427 non-null   object 
 3   user_reported_location    2953 non-null   object 
 4   user_profile_description  3841 non-null   object 
 5   user_profile_url          1284 non-null   object 
 6   follower_count            4427 non-null   float64
 7   following_count           4427 non-null   float64
 8   account_creation_date     4427 non-null   object 
 9   account_language          0 non-null      float64
 10  BoW                       4427 non-null   object 
dtypes: float64(3), int64(1), object(7)
memory usage: 380.6+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11231672 entries, 0 to 11231671
Data columns (total 31 columns):

(None, None)

In [None]:
# User columns to remove:
#   - display name and screen name: the userid already identifies each user
#   - profile url: not useful for the model
#   - account language: this column is recalculated later
#   - account creation date: not used in the model
_leg_user_drop_cols = [
  'user_display_name',
  'user_screen_name',
  'account_creation_date',
  'user_profile_url',
  'account_language',
]

# Tweet columns to remove: not required for the analysis, redundant with the
# user dataframe, or entirely null
_leg_tweet_drop_cols = [
  'user_display_name',
  'user_screen_name',
  'user_reported_location',
  'user_profile_description',
  'follower_count',
  'following_count',
  'user_profile_url',
  'account_language',
  'tweet_client_name',
  'account_creation_date',
  'in_reply_to_userid',
  'in_reply_to_tweetid',
  'quoted_tweet_tweetid',
  'retweet_userid',
  'retweet_tweetid',
  'latitude',
  'longitude',
  'tweet_text',
  'reply_count',
  'BoW',
]

leg_users = leg_users.drop(columns=_leg_user_drop_cols)
leg_tweets = leg_tweets.drop(columns=_leg_tweet_drop_cols)

leg_users.info(), leg_tweets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4427 entries, 0 to 4426
Data columns (total 6 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   userid                    4427 non-null   int64  
 1   user_reported_location    2953 non-null   object 
 2   user_profile_description  3841 non-null   object 
 3   follower_count            4427 non-null   float64
 4   following_count           4427 non-null   float64
 5   BoW                       4427 non-null   object 
dtypes: float64(2), int64(1), object(3)
memory usage: 207.6+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11231672 entries, 0 to 11231671
Data columns (total 12 columns):
 #   Column          Dtype  
---  ------          -----  
 0   tweetid         int64  
 1   userid          int64  
 2   tweet_language  object 
 3   tweet_time      object 
 4   is_retweet      object 
 5   quote_count     float64
 6   like_count      float64
 7   retweet_c

(None, None)

In [None]:
# Encode user_reported_location as a binary indicator: 1 if the user reports a location, 0 otherwise
# The whole column is reassigned (instead of writing through .loc[:, col]) so that it
# takes the new integer dtype; on pandas >= 2.0 a .loc[:, col] assignment writes into the
# existing object column and leaves its dtype as object

leg_users['user_reported_location'] = leg_users['user_reported_location'].notnull().astype('int')

leg_users.head()

Unnamed: 0,userid,user_reported_location,user_profile_description,follower_count,following_count,BoW
0,81914168,1,Cranky Antifa Yenta. #Bernie2020 She/Her Seita...,2654.0,2026.0,"['rose', 'scent', 'soap', 'antifascistf', 'luk..."
1,1209338232946614272,1,"Investigator who understands the Law (4,5,6,8,...",0.0,84.0,"['look', 'like', 'wrong', 'curtain', 'leave', ..."
2,1003903106730680320,0,I only play video games for that hot gamer score,0.0,15.0,"['championforaca', 'mcdoozie', 'sirelkmn', 'vi..."
3,1269876357472772097,1,✩Taylor Swift stan since birth ✩One Direction ...,85.0,465.0,"['better', 'lmao', 'friend', 'like', 'joke', '..."
4,183036128,1,@PHitchener9 and @alicialoxley present Melbour...,159854.0,2312.0,"['international', 'flight', 'arrive', 'melbour..."


In [None]:
# Encode user_profile_description as a binary indicator: 1 if the user has a profile description, 0 otherwise
# The whole column is reassigned (instead of writing through .loc[:, col]) so that it
# takes the new integer dtype; on pandas >= 2.0 a .loc[:, col] assignment writes into the
# existing object column and leaves its dtype as object

leg_users['user_profile_description'] = leg_users['user_profile_description'].notnull().astype('int')

leg_users.head()

Unnamed: 0,userid,user_reported_location,user_profile_description,follower_count,following_count,BoW
0,81914168,1,1,2654.0,2026.0,"['rose', 'scent', 'soap', 'antifascistf', 'luk..."
1,1209338232946614272,1,1,0.0,84.0,"['look', 'like', 'wrong', 'curtain', 'leave', ..."
2,1003903106730680320,0,1,0.0,15.0,"['championforaca', 'mcdoozie', 'sirelkmn', 'vi..."
3,1269876357472772097,1,1,85.0,465.0,"['better', 'lmao', 'friend', 'like', 'joke', '..."
4,183036128,1,1,159854.0,2312.0,"['international', 'flight', 'arrive', 'melbour..."


In [None]:
# Convert is_retweet from boolean to an int flag (1 = retweet, 0 = not a retweet);
# null values are treated as False
# The column is reassigned directly (not through .loc[:, col]) so the integer dtype is
# actually adopted; on pandas >= 2.0 a .loc[:, col] assignment keeps the old object dtype

leg_tweets['is_retweet'] = leg_tweets['is_retweet'].fillna(False).astype('int')

leg_tweets.head()

Unnamed: 0,tweetid,userid,tweet_language,tweet_time,is_retweet,quote_count,like_count,retweet_count,hashtags,urls,user_mentions,BoW
0,1334343130880073729,81914168,en,2020-12-03 03:46:07,0,0.0,0.0,0.0,[],[],"[{'screen_name': 'expectlettuce', 'name': 'Psy...","['rose', 'scent', 'soap']"
1,1334342111454515202,81914168,en,2020-12-03 03:42:04,1,0.0,1.0,0.0,[],[],"[{'screen_name': 'TimmyBofficial', 'name': 'Ti...","['antifascistf', 'lukeobrien', 'luke', 'sock',..."
2,1334327421684867074,81914168,en,2020-12-03 02:43:42,1,1.0,0.0,1.0,[],[],"[{'screen_name': 'NBC29', 'name': 'NBC29', 'id...","['staunton', 'woman', 'face', 'year', 'probati..."
3,1334327074627203072,81914168,en,2020-12-03 02:42:19,1,7.0,0.0,7.0,[],[],"[{'screen_name': 'PrisonReformMvt', 'name': 'P...","['cops', 'prevent', 'crime']"
4,1334326968305717250,81914168,und,2020-12-03 02:41:54,0,0.0,0.0,0.0,[],[],"[{'screen_name': 'Percussioner211', 'name': 'M...",[]


In [None]:
# Convert hashtags to number of hashtags used

# Missing values become an empty list literal so they count as zero hashtags
leg_tweets['hashtags'] = leg_tweets['hashtags'].fillna('[]')

leg_hashtags = leg_tweets['hashtags']
hashtag_count = []

# Iterate over the values directly; a positional .iloc lookup per row is very slow
# on a frame with millions of rows
for raw_hashtags in leg_hashtags:
  try:
    # The cell holds a Python-style list repr with single quotes; swap them for
    # double quotes so the json parser accepts it
    hashtags = re.sub('\'', '\"', raw_hashtags)
    hashtag_count.append(len(json.loads(hashtags)))
  except TypeError:
    # Non-string cell, or a parsed value that has no length: count it as zero
    hashtag_count.append(0)
  except JSONDecodeError:
    # Same fallback strategy as the url and user-mention cells: when the text is not
    # valid JSON after the quote swap, count key markers instead of crashing.
    # NOTE(review): assumes each hashtag entry is a dict carrying a "text" key - confirm
    # against the raw data.
    hashtag_count.append(hashtags.count(r'"text"'))

# Direct column assignment so the column takes the integer dtype of the counts
leg_tweets['hashtags'] = hashtag_count

leg_tweets.head()

Unnamed: 0,tweetid,userid,tweet_language,tweet_time,is_retweet,quote_count,like_count,retweet_count,hashtags,urls,user_mentions,BoW
0,1334343130880073729,81914168,en,2020-12-03 03:46:07,0,0.0,0.0,0.0,0,[],"[{'screen_name': 'expectlettuce', 'name': 'Psy...","['rose', 'scent', 'soap']"
1,1334342111454515202,81914168,en,2020-12-03 03:42:04,1,0.0,1.0,0.0,0,[],"[{'screen_name': 'TimmyBofficial', 'name': 'Ti...","['antifascistf', 'lukeobrien', 'luke', 'sock',..."
2,1334327421684867074,81914168,en,2020-12-03 02:43:42,1,1.0,0.0,1.0,0,[],"[{'screen_name': 'NBC29', 'name': 'NBC29', 'id...","['staunton', 'woman', 'face', 'year', 'probati..."
3,1334327074627203072,81914168,en,2020-12-03 02:42:19,1,7.0,0.0,7.0,0,[],"[{'screen_name': 'PrisonReformMvt', 'name': 'P...","['cops', 'prevent', 'crime']"
4,1334326968305717250,81914168,und,2020-12-03 02:41:54,0,0.0,0.0,0.0,0,[],"[{'screen_name': 'Percussioner211', 'name': 'M...",[]


In [None]:
# Convert urls to number of urls used

# Missing values become an empty list literal so they count as zero urls
leg_tweets['urls'] = leg_tweets['urls'].fillna('[]')

leg_urls = leg_tweets['urls']
url_count = []

# Iterate over the values directly; a positional .iloc lookup per row is very slow
# on a frame with millions of rows
for raw_urls in leg_urls:
  try:
    # Swap single quotes for double quotes so the json parser accepts the list repr
    urls = re.sub('\'', '\"', raw_urls)
    url_count.append(len(json.loads(urls)))
  except TypeError:
    # Non-string cell, or a parsed value that has no length: count it as zero
    url_count.append(0)
  except JSONDecodeError:
    # Not valid JSON after the quote swap: fall back to counting "url" key markers
    url_count.append(urls.count(r'"url"'))

# Direct column assignment so the column takes the integer dtype of the counts
# (a .loc[:, col] assignment keeps the object dtype on pandas >= 2.0)
leg_tweets['urls'] = url_count

leg_tweets.head()

Unnamed: 0,tweetid,userid,tweet_language,tweet_time,is_retweet,quote_count,like_count,retweet_count,hashtags,urls,user_mentions,BoW
0,1334343130880073729,81914168,en,2020-12-03 03:46:07,0,0.0,0.0,0.0,0,0,"[{'screen_name': 'expectlettuce', 'name': 'Psy...","['rose', 'scent', 'soap']"
1,1334342111454515202,81914168,en,2020-12-03 03:42:04,1,0.0,1.0,0.0,0,0,"[{'screen_name': 'TimmyBofficial', 'name': 'Ti...","['antifascistf', 'lukeobrien', 'luke', 'sock',..."
2,1334327421684867074,81914168,en,2020-12-03 02:43:42,1,1.0,0.0,1.0,0,0,"[{'screen_name': 'NBC29', 'name': 'NBC29', 'id...","['staunton', 'woman', 'face', 'year', 'probati..."
3,1334327074627203072,81914168,en,2020-12-03 02:42:19,1,7.0,0.0,7.0,0,0,"[{'screen_name': 'PrisonReformMvt', 'name': 'P...","['cops', 'prevent', 'crime']"
4,1334326968305717250,81914168,und,2020-12-03 02:41:54,0,0.0,0.0,0.0,0,0,"[{'screen_name': 'Percussioner211', 'name': 'M...",[]


In [None]:
# Convert user mentions to number of users mentioned

# Missing values become an empty list literal so they count as zero mentions
leg_tweets['user_mentions'] = leg_tweets['user_mentions'].fillna('[]')

leg_user_mentions = leg_tweets['user_mentions']
user_mentions_count = []

# Iterate over the values directly; a positional .iloc lookup per row is very slow
# on a frame with millions of rows
for raw_mentions in leg_user_mentions:
  try:
    # Swap single quotes for double quotes so the json parser accepts the list repr
    user_mention = re.sub('\'', '\"', raw_mentions)
    user_mentions_count.append(len(json.loads(user_mention)))
  except TypeError:
    # Non-string cell, or a parsed value that has no length: count it as zero
    user_mentions_count.append(0)
  except JSONDecodeError:
    # Not valid JSON after the quote swap: fall back to counting "screen_name" key markers
    user_mentions_count.append(user_mention.count(r'"screen_name"'))

# Direct column assignment so the column takes the integer dtype of the counts
# (a .loc[:, col] assignment keeps the object dtype on pandas >= 2.0)
leg_tweets['user_mentions'] = user_mentions_count

leg_tweets.head()

Unnamed: 0,tweetid,userid,tweet_language,tweet_time,is_retweet,quote_count,like_count,retweet_count,hashtags,urls,user_mentions,BoW
0,1334343130880073729,81914168,en,2020-12-03 03:46:07,0,0.0,0.0,0.0,0,0,1,"['rose', 'scent', 'soap']"
1,1334342111454515202,81914168,en,2020-12-03 03:42:04,1,0.0,1.0,0.0,0,0,4,"['antifascistf', 'lukeobrien', 'luke', 'sock',..."
2,1334327421684867074,81914168,en,2020-12-03 02:43:42,1,1.0,0.0,1.0,0,0,1,"['staunton', 'woman', 'face', 'year', 'probati..."
3,1334327074627203072,81914168,en,2020-12-03 02:42:19,1,7.0,0.0,7.0,0,0,1,"['cops', 'prevent', 'crime']"
4,1334326968305717250,81914168,und,2020-12-03 02:41:54,0,0.0,0.0,0.0,0,0,1,[]


In [None]:
# Fill NA and other anomalous values in tweet time with a dummy date to discard in function

# The dummy is written with seconds so that it matches the parse format used below;
# strict format parsing (pandas >= 2.0) rejects '1900-01-01 00:00' against '%H:%M:%S'.
# It still parses to the same timestamp, 1900-01-01 00:00:00.
_dummy_time = '1900-01-01 00:00:00'

_leg_times = leg_tweets['tweet_time'].fillna(_dummy_time)

# Values ending in '.0' are treated as anomalous (not timestamps)
_is_anomalous = _leg_times.str.endswith('.0')
anomalous_values = _leg_times[_is_anomalous]

# Overwrite every anomalous row in a single vectorized pass instead of running one
# full-column replace per anomalous value
_leg_times = _leg_times.mask(_is_anomalous, _dummy_time)

# Direct column assignment so the column actually becomes datetime64
leg_tweets['tweet_time'] = pd.to_datetime(_leg_times, format='%Y-%m-%d %H:%M:%S')

In [None]:
# Calculate additional features for the legitimate dataset

# Each entry pairs a progress message with the tweetProcessing routine to run.
# The routines' return values are not used here; per the recorded output, the new
# per-user columns show up on leg_users afterwards.
_leg_feature_steps = (
  # Ratio of retweets to original tweets
  ('calculating retweet ratio',
   lambda: tweetProcessing.retweetRatio(leg_users, leg_tweets)),
  # Ratio of english tweets
  ('calculating english tweet ratio',
   lambda: tweetProcessing.englishRatio(leg_users, leg_tweets)),
  # Time statistics (note the argument order: tweets first, then users)
  ('calculating time statistics',
   lambda: tweetProcessing.tweet_time_statistics(leg_tweets, leg_users, en=True, non=True)),
  # Tweet rate statistics
  ('calculating tweet rate statistics',
   lambda: tweetProcessing.averageTweetNum(leg_users, leg_tweets)),
  # Tweet engagement metrics
  ('calculating engagement metrics',
   lambda: tweetProcessing.avgTweetMetrics(leg_users, leg_tweets)),
)

for _message, _run_step in _leg_feature_steps:
  print(_message)
  _run_step()

# Binary classification label: 0 marks accounts and tweets that are not part of an
# information operation. This is the feature the classification model will predict.
for _frame in (leg_users, leg_tweets):
  _frame['info_op'] = 0

leg_users

calculating retweet ratio
calculating english tweet ratio
calculating time statistics
calculating tweet rate statistics
calculating engagement metrics


Unnamed: 0,userid,user_reported_location,user_profile_description,follower_count,following_count,BoW,retweet_ratio,english_tweet_proportion,earliest_tweet_time,latest_tweet_time,average_tweet_time,median_tweet_time,tweet_count,stddev_tweet_time,mode_0,mode_1,mode_2,mode_3,mode_4,mode_5,mode_6,mode_7,mode_8,mode_9,mode_10,mode_11,mode_12,mode_13,mode_14,mode_15,mode_16,mode_17,mode_18,mode_19,mode_20,mode_21,mode_22,mode_23,avg_tweets_per_week,avg_tweets_per_day,avg_tweets_per_hour,avg_tweets_per_min,avg_quote_count,avg_like_count,avg_retweet_count,avg_hashtags,avg_urls,avg_user_mentions,info_op
0,81914168,1,1,2654.0,2026.0,"['rose', 'scent', 'soap', 'antifascistf', 'luk...",0.603425,0.882136,0,2357,2049,1648,2978,727,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,595.600000,99.266667,4.284892,0.071487,571.801880,1.357623,571.801880,0.063130,0.140363,1.161518,0
1,1209338232946614272,1,1,0.0,84.0,"['look', 'like', 'wrong', 'curtain', 'leave', ...",0.363636,0.890909,6,2358,156,415,165,818,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4.024390,0.583039,0.024369,0.000406,6088.878788,0.430303,6088.878788,0.054545,0.169697,1.030303,0
2,1003903106730680320,0,1,0.0,15.0,"['championforaca', 'mcdoozie', 'sirelkmn', 'vi...",0.027778,0.986111,4,2346,244,738,72,722,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.562500,0.080717,0.003367,0.000056,315.652778,0.347222,315.652778,0.180556,0.055556,2.041667,0
3,1269876357472772097,1,1,85.0,465.0,"['better', 'lmao', 'friend', 'like', 'joke', '...",0.027624,0.937845,0,2359,2,1523,724,822,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,36.200000,5.443609,0.228608,0.003812,3779.531768,1.493094,3779.531768,0.078729,0.029006,0.569061,0
4,183036128,1,1,159854.0,2312.0,"['international', 'flight', 'arrive', 'melbour...",0.152000,0.999333,0,2359,427,78,3000,653,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,428.571429,73.170732,3.112033,0.051902,7.644667,13.691667,7.644667,1.171333,0.365000,0.746667,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4422,1298520433549914118,1,1,363.0,398.0,"['believe', 'baby', 'boy', 'goodnight', 'jicho...",0.255775,0.751590,306,2056,942,948,2987,50,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,995.666667,213.357143,9.482540,0.158638,1134.311349,0.614664,1134.311349,0.263810,0.226314,0.463676,0
4423,1303048017377767424,0,1,163.0,111.0,"['ohhhh', 'brother', 'caesar', 'blockaisback',...",0.069806,0.884770,0,2358,2056,177,2994,714,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,427.714286,69.627907,2.915287,0.048620,156.807949,1.156647,156.807949,0.063460,0.104876,1.643955,0
4424,1314617757049196544,1,1,339.0,526.0,"['judicial', 'panel', 'scam', 'endsars', 'peac...",0.539443,0.830480,2,2358,1052,1117,2978,518,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,496.333333,80.486486,3.415138,0.056959,1337.967092,1.509066,1337.967092,0.104733,0.119839,1.706613,0
4425,1316519818674143232,1,1,334.0,210.0,"['article', 'father', 'educator', 'black', 'ma...",0.708009,0.948969,0,2359,211,640,2959,88,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,369.875000,61.645833,2.611650,0.043567,572.167962,0.269348,572.167962,0.343021,0.221697,1.068266,0


In [None]:
# Remove the tweet_language column, which is no longer needed past this point

leg_tweets = leg_tweets.drop(columns=['tweet_language'])

leg_tweets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11231672 entries, 0 to 11231671
Data columns (total 11 columns):
 #   Column         Dtype         
---  ------         -----         
 0   tweetid        int64         
 1   userid         int64         
 2   tweet_time     datetime64[ns]
 3   is_retweet     int64         
 4   quote_count    float64       
 5   like_count     float64       
 6   retweet_count  float64       
 7   hashtags       int64         
 8   urls           int64         
 9   user_mentions  int64         
 10  info_op        int64         
dtypes: datetime64[ns](1), float64(3), int64(7)
memory usage: 942.6 MB


In [None]:
# Upon further inspection, each user's BoW still contained stray special characters.
# This cell strips them and turns the stringified list into a real list of tokens.

def _clean_bow(text):
  """Return the non-empty comma-separated tokens of one stringified BoW entry."""
  # Keep only commas, ASCII letters and digits; everything else is removed
  # (brackets, spaces, punctuation and any non-ASCII characters)
  text = re.sub('[^,a-zA-Z0-9]+', '', text)
  return [token for token in text.strip('][').split(',') if token]


# Remove the quote characters first, skip null entries, then tokenize each remaining row
BoW_leg_list = leg_users['BoW'].str.replace("'",'').dropna().apply(_clean_bow)
leg_users['BoW'] = BoW_leg_list

leg_users['BoW'].head()

0    [rose, scent, soap, antifascistf, lukeobrien, ...
1    [look, like, wrong, curtain, leave, gun, canno...
2    [championforaca, mcdoozie, sirelkmn, victoriaf...
3    [better, lmao, friend, like, joke, actually, v...
4    [international, flight, arrive, melbourne, mon...
Name: BoW, dtype: object

In [None]:
# Save legitimate dataset to csv then delete to make room for the disinformation tweets
# (tweets are written first, then users; the index is not written to either file)

for _frame, _csv_path in (
  (leg_tweets, '/mypath/Step 3 - Feature Generation/Processed_Data_Step3/leg_tweets_processed.csv'),
  (leg_users, '/mypath/Step 3 - Feature Generation/Processed_Data_Step3/leg_users_processed.csv'),
):
  _frame.to_csv(_csv_path, index=False)

In [None]:
# Drop the notebook's references to the legitimate frames (already saved to CSV above)
# to make room for the disinformation tweets
del leg_tweets, leg_users

Now we will repeat this process to format the disinformation tweets

In [None]:
# Repeat formatting process for disinformation Twitter users

# Download the user table and the five tweet tables from Google Drive
# ('insert file id' is a placeholder that must be replaced with each real Drive file id)
disuserfile = drive.CreateFile({'id': 'insert file id'})
disuserfile.GetContentFile('rus_users_bow.csv')

distweetfile1 = drive.CreateFile({'id': 'insert file id'})
distweetfile1.GetContentFile('rus_tweets_0119.csv')

distweetfile2 = drive.CreateFile({'id': 'insert file id'})
distweetfile2.GetContentFile('rus_tweets_0920.csv')

distweetfile3 = drive.CreateFile({'id': 'insert file id'})
distweetfile3.GetContentFile('rus_tweets_0520.csv')

distweetfile4 = drive.CreateFile({'id': 'insert file id'})
distweetfile4.GetContentFile('rus_tweets_1018.csv')

distweetfile5 = drive.CreateFile({'id': 'insert file id'})
distweetfile5.GetContentFile('rus_tweets_0619.csv')

dis_users = pd.read_csv('rus_users_bow.csv')

# Read the tweet files in this order and stack them into one frame with a fresh index;
# the per-file frames are not kept under any name once the concatenation is done
_dis_tweet_files = (
  'rus_tweets_0119.csv',
  'rus_tweets_0920.csv',
  'rus_tweets_0520.csv',
  'rus_tweets_1018.csv',
  'rus_tweets_0619.csv',
)
dis_tweets = pd.concat(
  [pd.read_csv(_csv_name) for _csv_name in _dis_tweet_files],
  ignore_index=True,
)

# Drop the unnamed column and the dataset column from both users and tweets
_dis_unused_cols = ['Unnamed: 0', 'dataset']
dis_users = dis_users.drop(columns=_dis_unused_cols)
dis_tweets = dis_tweets.drop(columns=_dis_unused_cols)

  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


In [None]:
# Drop the same kinds of columns as in the legitimate datasets.
# User columns to remove:
#   - display name and screen name: the userid already identifies each user
#   - profile url: not useful for the model
#   - account language: this column is recalculated later
#   - account creation date: not used in the model
_dis_user_drop_cols = [
  'user_display_name',
  'user_screen_name',
  'account_creation_date',
  'user_profile_url',
  'account_language',
]

# Tweet columns to remove: not required for the analysis, redundant with the
# user dataframe, or entirely null.
# NOTE(review): unlike the legitimate tweet frame, 'tweet_client_name' is not in this
# list, so it stays in dis_tweets - confirm whether that is intended.
_dis_tweet_drop_cols = [
  'user_display_name',
  'user_screen_name',
  'user_reported_location',
  'user_profile_description',
  'follower_count',
  'following_count',
  'user_profile_url',
  'account_language',
  'account_creation_date',
  'in_reply_to_userid',
  'in_reply_to_tweetid',
  'quoted_tweet_tweetid',
  'retweet_userid',
  'retweet_tweetid',
  'latitude',
  'longitude',
  'tweet_text',
  'reply_count',
  'BoW',
]

dis_users = dis_users.drop(columns=_dis_user_drop_cols)
dis_tweets = dis_tweets.drop(columns=_dis_tweet_drop_cols)

dis_users.info(), dis_tweets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4167 entries, 0 to 4166
Data columns (total 6 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   userid                    4167 non-null   object 
 1   user_reported_location    3293 non-null   object 
 2   user_profile_description  2946 non-null   object 
 3   follower_count            4164 non-null   float64
 4   following_count           4165 non-null   float64
 5   BoW                       4167 non-null   object 
dtypes: float64(2), object(4)
memory usage: 195.5+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12102869 entries, 0 to 12102868
Data columns (total 12 columns):
 #   Column             Dtype  
---  ------             -----  
 0   tweetid            object 
 1   userid             object 
 2   tweet_language     object 
 3   tweet_time         object 
 4   tweet_client_name  object 
 5   is_retweet         object 
 6   quote_count        float

(None, None)

In [None]:
# Encode user_reported_location as a binary indicator: 1 if the user reports a location, 0 otherwise
# The whole column is reassigned (instead of writing through .loc[:, col]) so that it
# takes the new integer dtype; on pandas >= 2.0 a .loc[:, col] assignment writes into the
# existing object column and leaves its dtype as object

dis_users['user_reported_location'] = dis_users['user_reported_location'].notnull().astype('int')

dis_users.head()

Unnamed: 0,userid,user_reported_location,user_profile_description,follower_count,following_count,BoW
0,CqW9bECdw2Jjk9DDU7UyE6P59TukYFISNE8J6sN66u4=,0,Associate editor & online editor at Peace Data...,691.0,1490.0,"['schumer', 'fight', 'hard', 'healthcare', 'br..."
1,uOrf1TDmM7vP4YEhOJDXORoqvpDlsJt03AyOfhrZo=,1,Peace Data is a global news organization. We c...,3152.0,4142.0,"['watersecurity', 'jordan', 'crucial', 'mainta..."
2,LXW4uuq2JWx4So6ycDFanp4qYQxNvj0ftiuyUe3tZo=,1,"An activist, an editor\nCommunications manager...",37.0,44.0,"['freedom', 'press', 'acknowledge', 'civilized..."
3,oqEFFiOrA+QVN8mEK0wweRTMmY2FQNB6XE5baB1Wik=,1,Associate editor @peacedata_,42.0,94.0,"['high', 'time', 'retire', 'white', 'racist', ..."
4,KZRG5Icfor+M92fyOxzu+dfY5whVco7VarRTmdSwf0=,1,,2.0,16.0,"['set', 'twitter', 'myfirsttweet', 'man', 'buy..."


In [None]:
# Encode user_profile_description as a binary indicator: 1 if the user has a profile description, 0 otherwise
# The whole column is reassigned (instead of writing through .loc[:, col]) so that it
# takes the new integer dtype; on pandas >= 2.0 a .loc[:, col] assignment writes into the
# existing object column and leaves its dtype as object

dis_users['user_profile_description'] = dis_users['user_profile_description'].notnull().astype('int')

dis_users.head()

Unnamed: 0,userid,user_reported_location,user_profile_description,follower_count,following_count,BoW
0,CqW9bECdw2Jjk9DDU7UyE6P59TukYFISNE8J6sN66u4=,0,1,691.0,1490.0,"['schumer', 'fight', 'hard', 'healthcare', 'br..."
1,uOrf1TDmM7vP4YEhOJDXORoqvpDlsJt03AyOfhrZo=,1,1,3152.0,4142.0,"['watersecurity', 'jordan', 'crucial', 'mainta..."
2,LXW4uuq2JWx4So6ycDFanp4qYQxNvj0ftiuyUe3tZo=,1,1,37.0,44.0,"['freedom', 'press', 'acknowledge', 'civilized..."
3,oqEFFiOrA+QVN8mEK0wweRTMmY2FQNB6XE5baB1Wik=,1,1,42.0,94.0,"['high', 'time', 'retire', 'white', 'racist', ..."
4,KZRG5Icfor+M92fyOxzu+dfY5whVco7VarRTmdSwf0=,1,0,2.0,16.0,"['set', 'twitter', 'myfirsttweet', 'man', 'buy..."


In [None]:
# Convert is_retweet to an int flag (1 = retweet, 0 = not a retweet); null values are set to 0

# The mapping accepts both real booleans and the strings 'True' / 'False'
data = {'False': int(0), 'True': int(1), False: int(0), True: int(1)}

# Any value without an entry in the mapping (nulls included) comes out of map() as NaN
# and is then set to 0, so a separate fillna before the mapping is not needed.
# The column is reassigned directly (not through .loc[:, col]) so the int64 dtype is
# actually adopted; on pandas >= 2.0 a .loc[:, col] assignment keeps the old object dtype.
dis_tweets['is_retweet'] = dis_tweets['is_retweet'].map(data).fillna(0).astype('int64')

dis_tweets.head()

Unnamed: 0,tweetid,userid,tweet_language,tweet_time,tweet_client_name,is_retweet,quote_count,like_count,retweet_count,hashtags,urls,user_mentions
0,894109669869985793,4125840934,en,2017-08-06 08:15,WordPress.com,0,0.0,0.0,0.0,"['SyrianArmy', 'Hama', 'InsideSyriaMC']",['http://en.insidesyriamc.com/2017/08/06/new-c...,[]
1,839851611656761344,4125840934,ar,2017-03-09 14:53,WordPress.com,0,0.0,0.0,0.0,['سورية_النظر_من_الداخل'],['http://insidesyriamc.com/2017/03/09/%d8%a7%d...,[]
2,841662351510249472,4125840934,ar,2017-03-14 14:48,WordPress.com,0,0.0,0.0,0.0,"['الجعفري', 'أستانا', 'سورية_النظر_من_الداخل']",['http://insidesyriamc.com/2017/03/14/%d8%a7%d...,[]
3,820599201495187456,4125840934,en,2017-01-15 11:51,Twitter Web Client,0,0.0,1.0,0.0,"['Iraq', 'airstrike']",[],[2876420003]
4,549829105483808768,EjLn9BPgSPWKPhEnebvOIWJ2quALQVMNPVfFgGyzG+8=,en,2014-12-30 07:27,Twitter Web Client,0,0.0,0.0,0.0,,,


In [None]:
# Convert hashtags to number of hashtags used

# Missing values become the number 0, which is handled as a non-string below
dis_tweets['hashtags'] = dis_tweets['hashtags'].fillna(0)

dis_hashtags = dis_tweets['hashtags']
hashtag_count = []

# Iterate over the values directly; a positional .iloc lookup per row is very slow
# on a frame with millions of rows
for raw_hashtags in dis_hashtags:
  if not isinstance(raw_hashtags, str):
    # Filled-in 0 or any other non-string value: no hashtags
    hashtag_count.append(0)
    continue
  # Strip the quotes and the surrounding brackets of the list repr
  inner = raw_hashtags.replace("'", '').strip('][')
  # An empty list ('[]') leaves an empty string; ''.split(', ') would return ['']
  # and be miscounted as one hashtag, so it is counted as zero explicitly
  hashtag_count.append(len(inner.split(', ')) if inner else 0)

# Direct column assignment so the column takes the integer dtype of the counts
dis_tweets['hashtags'] = hashtag_count

dis_tweets.head()

Unnamed: 0,tweetid,userid,tweet_language,tweet_time,tweet_client_name,is_retweet,quote_count,like_count,retweet_count,hashtags,urls,user_mentions
0,894109669869985793,4125840934,en,2017-08-06 08:15,WordPress.com,0,0.0,0.0,0.0,3,['http://en.insidesyriamc.com/2017/08/06/new-c...,[]
1,839851611656761344,4125840934,ar,2017-03-09 14:53,WordPress.com,0,0.0,0.0,0.0,1,['http://insidesyriamc.com/2017/03/09/%d8%a7%d...,[]
2,841662351510249472,4125840934,ar,2017-03-14 14:48,WordPress.com,0,0.0,0.0,0.0,3,['http://insidesyriamc.com/2017/03/14/%d8%a7%d...,[]
3,820599201495187456,4125840934,en,2017-01-15 11:51,Twitter Web Client,0,0.0,1.0,0.0,2,[],[2876420003]
4,549829105483808768,EjLn9BPgSPWKPhEnebvOIWJ2quALQVMNPVfFgGyzG+8=,en,2014-12-30 07:27,Twitter Web Client,0,0.0,0.0,0.0,0,,


In [None]:
# Convert urls to number of urls used

# Missing values become the number 0, which is handled as a non-string below
dis_tweets['urls'] = dis_tweets['urls'].fillna(0)

dis_urls = dis_tweets['urls']
url_count = []

# Iterate over the values directly; a positional .iloc lookup per row is very slow
# on a frame with millions of rows
for raw_urls in dis_urls:
  if not isinstance(raw_urls, str):
    # Filled-in 0 or any other non-string value: no urls
    url_count.append(0)
    continue
  # Strip the quotes and the surrounding brackets of the list repr
  inner = raw_urls.replace("'", '').strip('][')
  # An empty list ('[]') leaves an empty string; ''.split(', ') would return ['']
  # and be miscounted as one url, so it is counted as zero explicitly
  url_count.append(len(inner.split(', ')) if inner else 0)

# Direct column assignment so the column takes the integer dtype of the counts
dis_tweets['urls'] = url_count

dis_tweets.head()

Unnamed: 0,tweetid,userid,tweet_language,tweet_time,tweet_client_name,is_retweet,quote_count,like_count,retweet_count,hashtags,urls,user_mentions
0,894109669869985793,4125840934,en,2017-08-06 08:15,WordPress.com,0,0.0,0.0,0.0,3,1,[]
1,839851611656761344,4125840934,ar,2017-03-09 14:53,WordPress.com,0,0.0,0.0,0.0,1,1,[]
2,841662351510249472,4125840934,ar,2017-03-14 14:48,WordPress.com,0,0.0,0.0,0.0,3,1,[]
3,820599201495187456,4125840934,en,2017-01-15 11:51,Twitter Web Client,0,0.0,1.0,0.0,2,1,[2876420003]
4,549829105483808768,EjLn9BPgSPWKPhEnebvOIWJ2quALQVMNPVfFgGyzG+8=,en,2014-12-30 07:27,Twitter Web Client,0,0.0,0.0,0.0,0,0,


In [None]:
# Convert user mentions to number of users mentioned

# Missing values become an empty list literal so they count as zero mentions
dis_tweets['user_mentions'] = dis_tweets['user_mentions'].fillna('[]')

dis_user_mentions = dis_tweets['user_mentions']
user_mentions_count = []

# Iterate over the values directly; a positional .iloc lookup per row is very slow
# on a frame with millions of rows
for raw_mentions in dis_user_mentions:
  if not isinstance(raw_mentions, str):
    # Non-string value: no mentions
    user_mentions_count.append(0)
    continue
  # Strip the surrounding brackets of the list repr
  inner = raw_mentions.strip('][')
  # An empty list ('[]') leaves an empty string; ''.split(', ') would return ['']
  # and be miscounted as one mention, so it is counted as zero explicitly
  user_mentions_count.append(len(inner.split(', ')) if inner else 0)

# Direct column assignment so the column takes the integer dtype of the counts
dis_tweets['user_mentions'] = user_mentions_count

dis_tweets.head()

Unnamed: 0,tweetid,userid,tweet_language,tweet_time,tweet_client_name,is_retweet,quote_count,like_count,retweet_count,hashtags,urls,user_mentions
0,894109669869985793,4125840934,en,2017-08-06 08:15,WordPress.com,0,0.0,0.0,0.0,3,1,0
1,839851611656761344,4125840934,ar,2017-03-09 14:53,WordPress.com,0,0.0,0.0,0.0,1,1,0
2,841662351510249472,4125840934,ar,2017-03-14 14:48,WordPress.com,0,0.0,0.0,0.0,3,1,0
3,820599201495187456,4125840934,en,2017-01-15 11:51,Twitter Web Client,0,0.0,1.0,0.0,2,1,0
4,549829105483808768,EjLn9BPgSPWKPhEnebvOIWJ2quALQVMNPVfFgGyzG+8=,en,2014-12-30 07:27,Twitter Web Client,0,0.0,0.0,0.0,0,0,0


In [None]:
# Fill NA and other anomalous values in tweet time with a dummy date to discard in function

# The dummy has the same minute-resolution layout as the parse format used below
_dummy_time = '1900-01-01 00:00'

_dis_times = dis_tweets['tweet_time'].fillna(_dummy_time)

# Values ending in '.0' are treated as anomalous (not timestamps)
_is_anomalous = _dis_times.str.endswith('.0')
anomalous_values = _dis_times[_is_anomalous]

# Overwrite every anomalous row in a single vectorized pass instead of running one
# full-column replace per anomalous value
_dis_times = _dis_times.mask(_is_anomalous, _dummy_time)

# Direct column assignment so the column actually becomes datetime64
# (a .loc[:, col] assignment keeps the object dtype on pandas >= 2.0)
dis_tweets['tweet_time'] = pd.to_datetime(_dis_times, format='%Y-%m-%d %H:%M')

Now that the tweet and user datasets are formatted, we will calculate additional features for each user that may be relevant in differentiating legitimate activity from information operation activity.

In [None]:
# Derive the additional per-user features for the information operation dataset

# Each entry pairs the progress message with the tweetProcessing call it announces.
# The steps run in the listed order: retweet ratio, English-tweet ratio,
# time statistics, tweet rate statistics, engagement metrics.
feature_steps = [
    ('calculating retweet ratio',
     lambda: tweetProcessing.retweetRatio(dis_users, dis_tweets)),
    ('calculating english tweet ratio',
     lambda: tweetProcessing.englishRatio(dis_users, dis_tweets)),
    ('calculating time statistics',
     lambda: tweetProcessing.tweet_time_statistics(dis_tweets, dis_users, en=True, non=True)),
    ('calculating tweet rate statistics',
     lambda: tweetProcessing.averageTweetNum(dis_users, dis_tweets)),
    ('calculating engagement metrics',
     lambda: tweetProcessing.avgTweetMetrics(dis_users, dis_tweets)),
]

for progress_message, run_step in feature_steps:
    print(progress_message)
    run_step()

# Flag every row of the disinformation frames with info_op = 1 (part of an
# information operation). This is the feature to predict in the classification model.
dis_users['info_op'] = 1
dis_tweets['info_op'] = 1

dis_users

calculating retweet ratio
calculating english tweet ratio
calculating time statistics
calculating tweet rate statistics
calculating engagement metrics


Unnamed: 0,userid,user_reported_location,user_profile_description,follower_count,following_count,BoW,retweet_ratio,english_tweet_proportion,earliest_tweet_time,latest_tweet_time,average_tweet_time,median_tweet_time,tweet_count,stddev_tweet_time,mode_0,mode_1,mode_2,mode_3,mode_4,mode_5,mode_6,mode_7,mode_8,mode_9,mode_10,mode_11,mode_12,mode_13,mode_14,mode_15,mode_16,mode_17,mode_18,mode_19,mode_20,mode_21,mode_22,mode_23,avg_tweets_per_week,avg_tweets_per_day,avg_tweets_per_hour,avg_tweets_per_min,avg_quote_count,avg_like_count,avg_retweet_count,avg_hashtags,avg_urls,avg_user_mentions,info_op
0,CqW9bECdw2Jjk9DDU7UyE6P59TukYFISNE8J6sN66u4=,0,1,691.0,1490.0,"['schumer', 'fight', 'hard', 'healthcare', 'br...",0.841410,0.986784,954,2322,1622,1625,227,214,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,17.461538,2.609195,0.109927,0.001833,0.013216,0.312775,0.273128,1.052863,1.000000,0.0,1
1,uOrf1TDmM7vP4YEhOJDXORoqvpDlsJt03AyOfhrZo=,1,1,3152.0,4142.0,"['watersecurity', 'jordan', 'crucial', 'mainta...",0.243511,0.981459,823,2337,1531,1547,809,244,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,23.114286,3.384937,0.141681,0.002362,0.074166,1.090235,0.724351,1.034611,1.000000,0.0,1
2,LXW4uuq2JWx4So6ycDFanp4qYQxNvj0ftiuyUe3tZo=,1,1,37.0,44.0,"['freedom', 'press', 'acknowledge', 'civilized...",0.750000,0.750000,837,1433,1112,1056,4,27,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1.333333,0.307692,0.013559,0.000227,0.000000,0.000000,0.000000,1.250000,1.000000,0.0,1
3,oqEFFiOrA+QVN8mEK0wweRTMmY2FQNB6XE5baB1Wik=,1,1,42.0,94.0,"['high', 'time', 'retire', 'white', 'racist', ...",0.971698,0.981132,831,2002,1514,1520,106,316,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,8.153846,1.191011,0.050166,0.000837,0.000000,0.000000,0.000000,1.028302,1.000000,0.0,1
4,KZRG5Icfor+M92fyOxzu+dfY5whVco7VarRTmdSwf0=,1,0,2.0,16.0,"['set', 'twitter', 'myfirsttweet', 'man', 'buy...",0.934426,0.032787,123,1924,1342,1354,305,236,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,4.765625,0.690045,0.028820,0.000480,0.003279,0.009836,0.000000,1.006557,1.000000,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4162,3102283ec467ed58ced76e906687a85483d5eeb18e2de4...,1,1,140.0,352.0,"['syria', 'terrorists', 'prepare', 'shell', 'c...",0.640954,0.002805,1,2357,1342,1346,713,334,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,4.917241,0.710159,0.029618,0.000494,0.000000,0.001403,0.032258,0.907433,0.715288,0.0,1
4163,9f7867b0b3f46848b44c3048cc85d7bd425a9b8d63f53d...,0,0,461.0,462.0,"['tour', 'nyc', 'giant', 'circle', 'skateboard...",0.296842,0.008293,0,2359,102,1156,6997,654,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,127.218182,18.413158,0.770086,0.012836,0.000000,0.000286,0.118782,0.457911,0.480777,0.0,1
4164,53af1887d91472f63c8639ce894a58d09a86691f114b62...,1,1,42.0,259.0,"['explosion', 'occur', 'chemical', 'plant', 'l...",0.666667,1.000000,1412,1704,1512,1424,3,118,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0.150000,0.022901,0.000960,0.000016,0.000000,0.000000,0.000000,1.000000,0.333333,0.0,1
4165,iZ328VglWrG25qPym1bifLoiwXD9v1+A3G4WU5AThso=,1,1,2718.0,264.0,"['defense', 'secretary', 'james', 'mattis', 'k...",0.801866,0.831993,0,2359,137,750,154508,739,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1980.871795,284.022059,11.871533,0.197881,0.037569,0.103639,0.128765,1.800656,1.001967,0.0,1


In [None]:
# The tweet language column is not needed past this point, so remove it

dis_tweets = dis_tweets.drop(columns=['tweet_language'])

In [None]:
# Each user's BoW still carries stray special characters; strip them and
# turn the stringified list into a real list of tokens

_bow_junk_pattern = re.compile('[^,a-zA-Z0-9]+')


def _tokenize_bow(raw_bow):
    """Drop every character other than commas and alphanumerics, then split into non-empty tokens."""
    cleaned = _bow_junk_pattern.sub('', raw_bow)
    return [token for token in cleaned.strip('][').split(',') if token]


unquoted_bow = dis_users['BoW'].str.replace("'", '')
# Only non-null entries are tokenized; the remaining rows come back as NaN
# when the result is aligned on the index during assignment.
dis_users['BoW'] = unquoted_bow[unquoted_bow.notnull()].apply(_tokenize_bow)

dis_users['BoW'].head()

0    [schumer, fight, hard, healthcare, breaking, r...
1    [watersecurity, jordan, crucial, maintaining, ...
2    [freedom, press, acknowledge, civilized, world...
3    [high, time, retire, white, racist, congress, ...
4    [set, twitter, myfirsttweet, man, buys, new, h...
Name: BoW, dtype: object

In [None]:
# Write the processed disinformation tweet and user frames to csv (index omitted)

for processed_frame, csv_path in (
    (dis_tweets, '/mypath/Step 3 - Feature Generation/Processed_Data_Step3/dis_tweets_processed.csv'),
    (dis_users, '/mypath/Step 3 - Feature Generation/Processed_Data_Step3/dis_users_processed.csv'),
):
    processed_frame.to_csv(csv_path, index=False)

In [None]:
# Pull the four processed datasets from Google Drive so they can be combined and compared


def _load_drive_csv(file_id, local_name):
    """Download the Drive file with the given id to local_name and read it into a DataFrame."""
    drive.CreateFile({'id': file_id}).GetContentFile(local_name)
    return pd.read_csv(local_name)


# Replace each 'insert file id' placeholder with the Drive id of the matching file.
leg_users = _load_drive_csv('insert file id', 'leg_users_processed.csv')
leg_tweets = _load_drive_csv('insert file id', 'leg_tweets_processed.csv')
dis_users = _load_drive_csv('insert file id', 'dis_users_processed.csv')
dis_tweets = _load_drive_csv('insert file id', 'dis_tweets_processed.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [None]:
# Check that both user datasets ended up with the same number of features
# NOTE(review): the recorded output shows the user frames matching (49 columns
# each) but the tweet frames differing by one column (12 vs 11) - confirm
# which column is extra before relying on the combined tweet data.

tuple(frame.shape for frame in (dis_users, leg_users, dis_tweets, leg_tweets))

((4167, 49), (4427, 49), (12102869, 12), (11231672, 11))

In [None]:
# Stack the legitimate (non-IO) and disinformation (IO) frames, tweets first and then users

all_tweets = pd.concat((leg_tweets, dis_tweets), ignore_index=True)

# Drop the per-source tweet frames as soon as they are merged
del leg_tweets
del dis_tweets

all_users = pd.concat((leg_users, dis_users), ignore_index=True)

del leg_users
del dis_users

# Write the combined frames to csv; these files are the inputs to Step 4

for combined_frame, csv_path in (
    (all_tweets, '/mypath/Step 4 - Exploratory Data Analysis and Classification Model Development/Input_Data_Step4/all_tweets_processed.csv'),
    (all_users, '/mypath/Step 4 - Exploratory Data Analysis and Classification Model Development/Input_Data_Step4/all_users_processed.csv'),
):
    combined_frame.to_csv(csv_path, index=False)
