In [1]:
import numpy as np
import pandas as pd
import re
import warnings
import sys
import os
from collections import defaultdict
import itertools
import json

#Visualisation
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
from wordcloud import WordCloud, STOPWORDS

#nltk
from nltk.stem import WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.sentiment.util import *
from nltk import tokenize
import nltk
from nltk.corpus import stopwords
import operator
import string

#twitter api requests
import tweepy

%matplotlib inline



In [2]:
f1 = "ira_tweets_csv_hashed.csv"
f2 = "iranian_tweets_csv_hashed.csv"

# About the data

* Justice Department charged 13 Russian nationals with interfering in American electoral and political processes. The defendants worked for a well-funded “troll factory” called the Internet Research Agency, which had 400 employees.
* They ran a campaign to sow disinformation and discord into American politics via social media (mostly twitter).
* Dataset includes information from 3,841 accounts believed to be connected to the Russian Internet Research Agency, and 770 accounts believed to originate in Iran. Includes all public, nondeleted tweets and media (e.g., images and videos) from accounts believed to be connected to state-backed information operations.
* ~1.4 million people have now received a notification from Twitter for directly engaging during the election period with the 3,814 IRA-linked accounts identified (either by retweeting, quoting, replying to, mentioning, or liking those accounts or content created by those accounts) and actively following one of the identified IRA-linked accounts at the time those accounts were suspended.


In [3]:
ira = pd.read_csv(f1, error_bad_lines=False)
# iran = pd.read_csv(f2, error_bad_lines=False)

  interactivity=interactivity, compiler=compiler, result=result)


# What info does the dataset contain?

In [4]:
ira.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9041308 entries, 0 to 9041307
Data columns (total 31 columns):
tweetid                     int64
userid                      object
user_display_name           object
user_screen_name            object
user_reported_location      object
user_profile_description    object
user_profile_url            object
follower_count              int64
following_count             int64
account_creation_date       object
account_language            object
tweet_language              object
tweet_text                  object
tweet_time                  object
tweet_client_name           object
in_reply_to_tweetid         float64
in_reply_to_userid          object
quoted_tweet_tweetid        float64
is_retweet                  bool
retweet_userid              object
retweet_tweetid             float64
latitude                    float64
longitude                   float64
quote_count                 float64
reply_count                 float64
like_count

# Analysis of accounts' popularity and activity

In [5]:
cols = ['follower_count', 'following_count', 'quote_count', 'reply_count', 'like_count', 'retweet_count']
ira[cols].describe()

Unnamed: 0,follower_count,following_count,quote_count,reply_count,like_count,retweet_count
count,9041308.0,9041308.0,9038635.0,9038635.0,9038635.0,9038635.0
mean,8670.202,2522.468,0.1981891,0.2810441,4.002723,3.45744
std,22146.39,5028.831,13.07364,7.408997,290.3125,140.327
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,346.0,284.0,0.0,0.0,0.0,0.0
50%,842.0,618.0,0.0,0.0,0.0,0.0
75%,4486.0,2014.0,0.0,0.0,0.0,0.0
max,257638.0,74664.0,11633.0,3249.0,325826.0,123617.0


From the above table we can draw the following conclusions:
- The average number of followers the fake accounts had was around 8600.
- The average number of people the fake accounts were following were around 2522.
- Although interestingly, 75% of the accounts had no significant account activity like replying to tweets, liking tweets, retweeting, etc.
- But 75% of the accounts had nearly 4500 followers and hence there outreach was large.

# Analysis on tweet locations

Below are the top 50 tweet locations. It's no surprise that USA is at the top, closely followed by Russian states.

In [6]:
ira['user_reported_location'].value_counts().nlargest(50)

USA                         774819
Москва                      737454
Санкт-Петербург             316650
United States               302032
Estados Unidos              285012
Питер                       220464
Россия                      184465
Moscow                      146707
Los Angeles, CA             127069
Новосибирск                 124469
МSK                         122025
Санкт-Петербург, Россия     113974
Чебоксары                   111773
Новгород                    106124
Омск                        100859
Россия                       95841
Chicago, IL                  90612
New York, NY                 69659
Atlanta                      65969
СПБ                          64731
Russia                       59693
New York, USA                58856
Москва, Россия               58601
Kansas City, MO              52854
Киев                         46806
New Orleans, LA              46592
Saint Petersburg, Russia     45760
San Francisco, CA            45350
СПб                 

# Most frequently occuring words in the tweets

In [4]:
ira_english = ira.loc[ira['tweet_language'] == 'en']
# ira_spanish = ira.loc[ira['tweet_language'] == 'es']

In [32]:
keyword_count = defaultdict(int)
stopwords = set(stopwords.words('english'))
stopwords.insert('rt')
stopwords.insert('rt')
stopwords.insert('rt')
whitelist = set('abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ')
all_tweets = ira_english[['tweet_text']].values
for tweet in all_tweets:
    for word in tweet[0].split():
        word = word.strip().lower()
        word = ''.join(filter(whitelist.__contains__, word))
        if word not in stopwords and word != '':
            keyword_count[word] += 1

Below is the list of the 100 most frequently occuring words in the tweets (barring stopwords in English like a, the, etc), sorted in the decreasing order of counts.

In [40]:
sorted(keyword_count.items(), key=operator.itemgetter(1), reverse=True)[:100]

[('rt', 1188742),
 ('', 878395),
 ('news', 273323),
 ('trump', 157029),
 ('sports', 104621),
 ('new', 102660),
 ('us', 92666),
 ('dont', 88298),
 ('people', 85228),
 ('like', 84064),
 ('politics', 83948),
 ('man', 83442),
 ('amp', 80070),
 ('love', 78591),
 ('im', 76677),
 ('one', 72957),
 ('police', 71340),
 ('get', 70330),
 ('world', 64247),
 ('local', 60820),
 ('obama', 59935),
 ('time', 53993),
 ('breaking', 53911),
 ('make', 52786),
 ('life', 52701),
 ('know', 50646),
 ('day', 48114),
 ('video', 47259),
 ('never', 47129),
 ('says', 46439),
 ('good', 46437),
 ('black', 45813),
 ('want', 45147),
 ('go', 43943),
 ('workout', 43845),
 ('need', 40676),
 ('president', 39866),
 ('say', 39697),
 ('cant', 39613),
 ('back', 38142),
 ('white', 38029),
 ('https', 37972),
 ('would', 37153),
 ('business', 36458),
 ('see', 36378),
 ('chicago', 35732),
 ('hillary', 35464),
 ('first', 35378),
 ('think', 35339),
 ('right', 34789),
 ('america', 34071),
 ('health', 33881),
 ('woman', 33349),
 ('via',

# Most frequently occuring hashtags

In [52]:
hashtags = ira[['hashtags']].values
hashtag_count = defaultdict(int)
for entry in hashtags:
    if isinstance(entry[0], str):
        all_hashtags = entry[0].strip('[,]').split()
        for hashtag in all_hashtags:
            hashtag_count[hashtag.lower()] += 1

Below is a list of the 100 most frequently hashtags in all tweets, sorted in the decreasing order of counts.

In [55]:
sorted(hashtag_count.items(), key=operator.itemgetter(1), reverse=True)[:100]

[('news', 230977),
 ('sports', 100987),
 ('politics', 76107),
 ('спб', 65508),
 ('новости', 51670),
 ('local', 31802),
 ('world,', 28173),
 ('русскийдух', 26936),
 ('россия,', 26704),
 ('local,', 24278),
 ('провокациякиева', 22197),
 ('киевсбилбоинг', 22099),
 ('health', 21942),
 ('киевскажиправду', 21807),
 ('news,', 19881),
 ('business', 19220),
 ('maga', 19107),
 ('breaking', 16675),
 ('topnews', 16390),
 ('chicago', 16104),
 ('образроссии', 14765),
 ('россия', 13918),
 ('невскиеновости,', 13580),
 ('newyork', 13378),
 ('украина', 13326),
 ('usa', 13128),
 ('maga,', 12606),
 ('новости,', 12372),
 ('кино', 11541),
 ('rap', 11406),
 ('невскиеновости', 11297),
 ('foke', 11173),
 ('showbiz', 11132),
 ('вернитекалифорнию', 11019),
 ('авто', 10964),
 ('спорт', 10806),
 ('tech', 10779),
 ('entertainment', 10575),
 ('love', 10560),
 ('fukushima2015', 10556),
 ('blacklivesmatter', 10488),
 ('quote,', 10120),
 ('битваолигархов', 10023),
 ('tcot,', 10023),
 ('музыка', 9695),
 ('спб,', 9514),
 

In [8]:
sorted_fc = ira_english.sort_values(['follower_count', 'following_count', 'quote_count', 'reply_count', 'like_count', 'retweet_count'], ascending=False)
sorted_fc = sorted_fc.groupby(['userid', 'user_display_name'])[['follower_count', 'following_count', 'quote_count', 'reply_count', 'like_count', 'retweet_count']].mean()
sorted_fc = sorted_fc.sort_values(['follower_count', 'following_count', 'quote_count', 'reply_count', 'like_count', 'retweet_count'], ascending=False)

- Below is a truncated list of the top 100 accounts sorted by account activity (which includes number of followers, number of replies, retweets and quotes and number of people followed by the account). The table includes the account ids which we will use to retrieve a list of followers using the twitter apis, the account username, the of followers, the number of people followed by the account, the average quote count, the average reply count, the average like count and the average tweet count.
- It is evident that most of the top accounts are of Russian origin.

In [9]:
sorted_fc[:100]

Unnamed: 0_level_0,Unnamed: 1_level_0,follower_count,following_count,quote_count,reply_count,like_count,retweet_count
userid,user_display_name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2527472164,Вестник Москвы,257638,544,0.166667,0.166667,9.666667,4.500000
508761973,Вестник Петербурга,149672,1024,0.000000,0.055556,1.083333,2.000000
4224729994,Tennessee,147767,74664,49.569508,60.092487,666.755435,570.817792
2808833544,Максим Дементьев,134805,2796,0.006667,0.306667,2.146667,1.060000
449689677,Рамзан Кадыров,123989,10,0.000000,2.847826,5.413043,13.695652
2648734430,Вестник Крыма,106462,386,0.142857,0.714286,1.000000,0.285714
3676820373,Вестник Красноярска,85293,316,0.000000,0.000000,1.000000,1.000000
2665564544,Мюсли Лаврова,84642,2575,0.000000,0.108696,0.239130,0.565217
2882331822,Jenna Abrams,79152,22607,2.254825,4.691032,30.377963,23.453089
4272870988,Pamela Moore,72121,42080,34.008049,40.015112,379.541721,364.332129


After checking all user ids in the dataset with the twitter ids it was evident that all these accounts have been suspended. So, it is impossible to get a list of followers for each account.

# Get list of all mentions and their counts

Doing this just for English right now as most of the mentions in the Russian tweets (for example) were mostly Russian newspapers and microblogging sites.

In [8]:
mentions = ira_english[['user_mentions']].values
mention_count = defaultdict(int)
for entry in mentions:
    if isinstance(entry[0], str):
        all_mentions = entry[0].strip('[,]').split()
        for mention in all_mentions:
            mention_count[mention] += 1
mention_count = sorted(mention_count.items(), key=operator.itemgetter(1), reverse=True)

There are 705382 unique mentions is all the English tweets in the dataset.

# Get a list of all users who were retweeted by IRA (English only)

In [5]:
retweeters = ira_english[['retweet_userid']].values
fake_acc = ira_english[['userid']].values
retweeter_count = defaultdict(int)
#retweeter_orig = defaultdict(set)
for i, entry in enumerate(retweeters):
    if isinstance(entry[0], str):
        all_retweeters = entry[0].strip('[,]').split()
        for retweeter in all_retweeters:
            retweeter_count[retweeter] += 1
            #retweeter_orig[retweeter].add(fake_acc[i][0])
retweeter_count = sorted(retweeter_count.items(), key=operator.itemgetter(1), reverse=True)

# Get all users who tweeted keywords (only English tweets)

In [6]:
keywords = ['criminals', 'felons', 'abortion', 'immigration', 'immigrants', 'illegal', 'aliens', 'guns', 'gun', 'violence', 'race', 'black', 'cops', 'pride', 'kings', 'queens', 'blackgirlmagic', 'blacklivesmatter']

In [7]:
whitelist = set('abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ')
keyword_retweeters = ira_english[['userid', 'tweet_text', 'retweet_userid', 'user_screen_name']].values
keyword_retweeter_count = defaultdict(int)
keyword_retweeter_tweets = defaultdict(list)
#keyword_retweeter_orig = defaultdict(list)
for entry in keyword_retweeters:
    tweet = entry[1].split()
    flag = False
    for word in tweet:
        word = ''.join(filter(whitelist.__contains__, word))
        if word in keywords:
            flag = True
    if isinstance(entry[2], str) and flag is True: # check for nan in retweet_useid column
        all_retweeters = entry[2].strip('[,]').split()
        for retweeter in all_retweeters:
            keyword_retweeter_count[retweeter] += 1
            keyword_retweeter_tweets[retweeter].append(entry[1])
            #keyword_retweeter_orig[retweeter].append([entry[0], entry[3]])
keyword_retweeter_count = sorted(keyword_retweeter_count.items(), key=operator.itemgetter(1), reverse=True)

There are 7331 unique tweeters of English tweets containing the above mentioned keywords in the dataset.

# Login to twitter API

In [5]:
twitter_creds = None
with open("twitter_credentials.json") as file:  
    twitter_creds = json.load(file)
auth = tweepy.auth.OAuthHandler(twitter_creds['consumer_key'], twitter_creds['consumer_secret'])
auth.set_access_token(twitter_creds['access_key'], twitter_creds['access_secret'])
api = tweepy.API(auth, wait_on_rate_limit=True)

if(api.verify_credentials):
    print ('Successfully logged in')

Successfully logged in


# Get list of users who mentioned fake IRA accounts

In [28]:
all_names = ira_english[['userid', 'user_screen_name', 'retweet_count']]
all_names = all_names.sort_values(by='retweet_count', ascending=False).values
filtered_names = defaultdict()
for entry in all_names:
    if entry[0].isdigit():
        filtered_names[entry[1]] = entry[2]
filtered_names = list(filtered_names.keys())

In [30]:
len(filtered_names)

135

In [31]:
print(filtered_names)

['Crystal1Johnson', 'KaniJJackson', 'JemiSHaaaZzz', 'wokeluisa', 'TrayneshaCole', 'BleepThePolice', 'BlackToLive', 'gloed_up', 'TEN_GOP', 'BlackNewsOutlet', 'LaChristie', 'Pamela_Moore13', 'JohnCopper16', 'Jenn_Abrams', 'USA_Gunslinger', 'SouthLoneStar', '10_gop', 'realTEN_GOP', 'ELEVEN_GOP', 'PamelaKealer13', 'Jeblary2016', 'Blk_Voice', 'TheFoundingSon', 'blackmattersus', 'WarfareWW', 'Baltimore0nline', 'patriototus', 'tpartynews', 'PigeonToday', 'MissouriNewsUS', 'todayinsyria', 'redlanews', 'DanaGeezus', 'MatEvidence', 'southlonestar2', 'MargoSavazh', 'rightnpr', 'Politweecs', 'DailyLosAngeles', 'Jihadist2ndWife', 'March_for_Trump', 'DetroitDailyNew', 'Patriot_Archive', 'DallasTopNews', 'Seattle_Post', 'GiselleEvns', 'AndyHashtagger', 'DickyIrwin', 'TodayNYCity', 'ItsTimeToSecede', 'NewOrleansON', 'DailySanFran', 'TodayMiami', 'byDrBre', 'WashingtOnline', 'LoraGreeen', 'KadirovRussia', 'HoustonTopNews', 'PhoenixDailyNew', 'ChrixMorgan', 'WhiteHouseCards', 'DominicValent', 'ComradZam

In [42]:
whitelist = set('abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ')
keywords = ['criminals', 'felons', 'abortion', 'immigration', 'immigrants', 'illegal', 'aliens', 'guns', 'gun', 'violence', 'race', 'black', 'cops', 'pride', 'kings', 'queens', 'blackgirlmagic', 'blacklivesmatter']
locations = ['wisconsin', 'iowa', 'pennsylvania', 'florida', 'ohio', 'virginia', 'carolina', 'georgia']
filtered_user_dict = defaultdict(list)
#all_user_dict = defaultdict(list)
done = []
for name in filtered_names:
    for item in tweepy.Cursor(api.search, q='@'+name, count=100000).items():
        tweet = item.text.split()
        text_flag = False
        for word in tweet:
            word = ''.join(filter(whitelist.__contains__, word))
            if word in keywords:
                text_flag = True
        loc = item.author.location.split()
        loc_flag = False
        for entry in loc:
            entry = entry.lower().strip(',. ')
            if entry in locations:
                loc_flag = True
        if loc_flag is True:
            filtered_user_dict['user_id'].append(item.author.id_str)
            filtered_user_dict['user_screen_name'].append(item.author.name)
            filtered_user_dict['user_location'].append(item.author.location)
            filtered_user_dict['user_description'].append(item.author.description)
            filtered_user_dict['user_follower_count'].append(item.author.followers_count)
            filtered_user_dict['user_friend_count'].append(item.author.friends_count)
            filtered_user_dict['user_account_creation_date'].append(item.author.created_at)
            filtered_user_dict['user_tweet'].append(item.text)
            filtered_user_dict['fakebot_screen_name'].append(name)
    done.append(name)


In [43]:
filtered_user_dict_df = pd.DataFrame(filtered_user_dict)

In [44]:
filtered_user_dict_df

Unnamed: 0,user_id,user_screen_name,user_location,user_description,user_follower_count,user_friend_count,user_account_creation_date,user_tweet,fakebot_screen_name
0,4180736069,Dmitch EX-GOP Consrv,"Virginia, USA",Seek God 1st/Prolife Conserv #NeverTrump @Evan...,2196,4996,2015-11-13 18:52:58,RT @LouiseBagshawe: On the 12h day of Muellerm...,TEN_GOP
1,4837060487,individual-1sFedcase,"Wisconsin, USA",Daughter of a career Naval Flight Surgeon Sist...,1185,538,2016-01-23 01:49:55,RT @SlickRockWeb: Did a quick search in our ar...,USA_Gunslinger
2,805636147649138689,Joe,"Ohio, USA","Radical moderate, Dad,Buckeye,Vonnegut, Gonzo,...",3330,2024,2016-12-05 04:53:20,@LFredenhall @thespybrief @visionsurreal @GWil...,March_for_Trump
3,805636147649138689,Joe,"Ohio, USA","Radical moderate, Dad,Buckeye,Vonnegut, Gonzo,...",3330,2024,2016-12-05 04:53:20,@GWillicker8 @thespybrief @visionsurreal @LFre...,March_for_Trump
4,805636147649138689,Joe,"Ohio, USA","Radical moderate, Dad,Buckeye,Vonnegut, Gonzo,...",3330,2024,2016-12-05 04:53:20,@thespybrief @visionsurreal @LFredenhall @GWil...,March_for_Trump
5,805636147649138689,Joe,"Ohio, USA","Radical moderate, Dad,Buckeye,Vonnegut, Gonzo,...",3330,2024,2016-12-05 04:53:20,@thespybrief @visionsurreal @LFredenhall @GWil...,March_for_Trump
6,911242297,Sara Chirico,"Virginia, USA","wife, proud mom with 5 precious grandkids .......",582,1454,2012-10-28 22:30:21,RT @lovetogive2: @BenefitVBurden @mikefarb1 @R...,DetroitDailyNew


In [38]:
len(filtered_user_dict_df['fakebot_screen_name'].unique())

15

In [21]:
filtered_user_dict_df.to_csv('mentions_all.csv')

# Use API to get user info

In [168]:
all_retweeters_df = pd.read_csv('./filtered_users.csv', error_bad_lines=False)
all_retweeters_ids = all_retweeters_df['user_id'].unique()
all_retweeters_ids = [str(entry) for entry in all_retweeters_ids]

In [169]:
len(all_retweeters_ids)

211

In [170]:
filtered = []
for retweeter, cnt in keyword_retweeter_count:
    if retweeter not in done and retweeter not in all_retweeters_ids:
        try:
            user = api.get_user(retweeter)
            print(retweeter, user.screen_name, user.location)
            filtered.append(user)
            done.append(retweeter)
        except tweepy.TweepError as e:
            message = e.args[0][0]['message']
            if message == 'User has been suspended.' or message == 'User not found.':
                done.append(retweeter)

In [171]:
print(len(filtered), len(done)) #7246

0 7246


In [162]:
# Filter users according to locations
locations = ['wisconsin', 'iowa', 'pennsylvania', 'florida', 'ohio', 'virginia', 'carolina', 'georgia']
retweeter_dict = defaultdict(list)
for r in filtered:
    loc = r.location.split()
    for entry in loc:
        entry = entry.lower().strip(',. ')
        if entry in locations:
            retweeter_dict[entry].append(tuple([r.id, r.screen_name, r.location]))

In [163]:
res = defaultdict(list)
for i in range(len(filtered)):
    for location in retweeter_dict.keys():
        flag = False
        for user in retweeter_dict[location]:
            if filtered[i].id == user[0]:
                flag = True
                break
        if flag == True:
            res['user_id'].append(filtered[i].id)
            res['user_screen_name'].append(filtered[i].screen_name)
            res['user_location'].append(location)
            res['user_description'].append(filtered[i].description)
            res['user_follower_count'].append(filtered[i].followers_count)
            res['user_friend_count'].append(filtered[i].friends_count)
            res['user_account_creation_date'].append(filtered[i].created_at)
            res['user_tweets'].append(keyword_retweeter_tweets[filtered[i].id_str])
            break

In [164]:
res_df = pd.DataFrame(res)

In [165]:
df_final = all_retweeters_df.append(res_df)
df_final = df_final.drop(df_final.columns[[0]], axis=1) 

In [166]:
df_final

Unnamed: 0,user_follower_count,user_friend_count,user_id,user_location,user_screen_name,user_tweets
0,43748,48175,905019768,florida,Conservatexian,"['RT @Conservatexian: News post: ""In Texas, un..."
1,44019,10578,237888723,florida,BIZPACReview,['RT @BIZPACReview: ‘Best Senate race ever?’ S...
2,53034,5972,2164876369,georgia,RepStevenSmith,['RT @RepStevenSmith: So Trump ran on immigrat...
3,34785,22914,2928451870,virginia,SenRichardBlack,"['RT @SenRichardBlack: SB1455, my bill to make..."
4,2986,469,2880084329,pennsylvania,50nsexy2014,"[""RT @50nsexy2014: Neither democrats or republ..."
5,86170,1038,27995424,florida,johncardillo,"['RT @johncardillo: .@FoxNews, nice anti gun p..."
6,2233,2370,1380313111,ohio,joey_toledo,"[""RT @joey_toledo: I'm from a small city in my..."
7,164321,148658,3297022953,carolina,kwilli1046,"['RT @kwilli1046: Marine Le Pen: ""Mass immigra..."
8,726,736,796102909318598660,virginia,not_2dayplease,['RT @wild14_u: #TheFirst100DaysToDoList Supp...
9,13089,14168,16007872,iowa,cmdorsey,['RT @cmdorsey: If we did everything LEFTISTS ...


In [167]:
print(len(df_final['user_id'].unique()), len(df_final['user_screen_name'].unique()))

211 211


In [134]:
df_final.to_csv('original_tweeters.csv', index=False)

In [29]:
all_mentions = []
for mention in mention_count:
    try:
        user = api.get_user(mention)
        print(user.screen_name)
        all_mentions.append(user)
    except:
        pass

In [None]:
with open("mentions.pkl", 'w') as f:
    pickle.dump(all_mentions, f)

In [34]:
def wordcloud(text):
    wordcloud = WordCloud(background_color="white", stopwords=stopwords, random_state = 42
                         ).generate(text)
    plt.figure( figsize=(20,10), facecolor='k')
    plt.imshow(wordcloud)
    plt.axis("off") 