In [1]:
import numpy as np
import pandas as pd
import re
import warnings
import sys
import os
from collections import defaultdict
import itertools

#Visualisation
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
from wordcloud import WordCloud, STOPWORDS

#nltk
from nltk.stem import WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.sentiment.util import *
from nltk import tokenize
import nltk
from nltk.corpus import stopwords
import operator
import string

#api requests
import requests

%matplotlib inline



In [2]:
f1 = "ira_tweets_csv_hashed.csv";
f2 = "iranian_tweets_csv_hashed.csv"

# About the data

* Justice Department charged 13 Russian nationals with interfering in American electoral and political processes. The defendants worked for a well-funded “troll factory” called the Internet Research Agency, which had 400 employees.
* They ran a campaign to sow disinformation and discord into American politics via social media (mostly twitter).
* Dataset includes information from 3,841 accounts believed to be connected to the Russian Internet Research Agency, and 770 accounts believed to originate in Iran. Includes all public, nondeleted tweets and media (e.g., images and videos) from accounts believed to be connected to state-backed information operations.
* ~1.4 million people have now received a notification from Twitter for directly engaging during the election period with the 3,814 IRA-linked accounts identified (either by retweeting, quoting, replying to, mentioning, or liking those accounts or content created by those accounts) and actively following one of the identified IRA-linked accounts at the time those accounts were suspended.


In [3]:
ira = pd.read_csv(f1, error_bad_lines=False)
iran = pd.read_csv(f2, error_bad_lines=False)

  interactivity=interactivity, compiler=compiler, result=result)


# What info does the dataset contain?

In [4]:
ira.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9041308 entries, 0 to 9041307
Data columns (total 31 columns):
tweetid                     int64
userid                      object
user_display_name           object
user_screen_name            object
user_reported_location      object
user_profile_description    object
user_profile_url            object
follower_count              int64
following_count             int64
account_creation_date       object
account_language            object
tweet_language              object
tweet_text                  object
tweet_time                  object
tweet_client_name           object
in_reply_to_tweetid         float64
in_reply_to_userid          object
quoted_tweet_tweetid        float64
is_retweet                  bool
retweet_userid              object
retweet_tweetid             float64
latitude                    float64
longitude                   float64
quote_count                 float64
reply_count                 float64
like_count

# Analysis of accounts' popularity and activity

In [5]:
cols = ['follower_count', 'following_count', 'quote_count', 'reply_count', 'like_count', 'retweet_count']
ira[cols].describe()

Unnamed: 0,follower_count,following_count,quote_count,reply_count,like_count,retweet_count
count,9041308.0,9041308.0,9038635.0,9038635.0,9038635.0,9038635.0
mean,8670.202,2522.468,0.1981891,0.2810441,4.002723,3.45744
std,22146.39,5028.831,13.07364,7.408997,290.3125,140.327
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,346.0,284.0,0.0,0.0,0.0,0.0
50%,842.0,618.0,0.0,0.0,0.0,0.0
75%,4486.0,2014.0,0.0,0.0,0.0,0.0
max,257638.0,74664.0,11633.0,3249.0,325826.0,123617.0


From the above table we can draw the following conclusions:
- The average number of followers the fake accounts had was around 8600.
- The average number of people the fake accounts were following were around 2522.
- Although interestingly, 75% of the accounts had no significant account activity like replying to tweets, liking tweets, retweeting, etc.
- But 75% of the accounts had nearly 4500 followers and hence there outreach was large.

# Analysis on tweet locations

Below are the top 50 tweet locations. It's no surprise that USA is at the top, closely followed by Russian states.

In [6]:
ira['user_reported_location'].value_counts().nlargest(50)

USA                         774819
Москва                      737454
Санкт-Петербург             316650
United States               302032
Estados Unidos              285012
Питер                       220464
Россия                      184465
Moscow                      146707
Los Angeles, CA             127069
Новосибирск                 124469
МSK                         122025
Санкт-Петербург, Россия     113974
Чебоксары                   111773
Новгород                    106124
Омск                        100859
Россия                       95841
Chicago, IL                  90612
New York, NY                 69659
Atlanta                      65969
СПБ                          64731
Russia                       59693
New York, USA                58856
Москва, Россия               58601
Kansas City, MO              52854
Киев                         46806
New Orleans, LA              46592
Saint Petersburg, Russia     45760
San Francisco, CA            45350
СПб                 

# Most frequently occuring words in the tweets

In [7]:
ira_english = ira.loc[ira['tweet_language'] == 'en']

In [32]:
keyword_count = defaultdict(int)
stopwords = set(stopwords.words('english'))
stopwords.insert('rt')
stopwords.insert('rt')
stopwords.insert('rt')
whitelist = set('abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ')
all_tweets = ira_english[['tweet_text']].values
for tweet in all_tweets:
    for word in tweet[0].split():
        word = word.strip().lower()
        word = ''.join(filter(whitelist.__contains__, word))
        if word not in stopwords and word != '':
            keyword_count[word] += 1

Below is the list of the 100 most frequently occuring words in the tweets (barring stopwords in English like a, the, etc), sorted in the decreasing order of counts.

In [40]:
sorted(keyword_count.items(), key=operator.itemgetter(1), reverse=True)[:100]

[('rt', 1188742),
 ('', 878395),
 ('news', 273323),
 ('trump', 157029),
 ('sports', 104621),
 ('new', 102660),
 ('us', 92666),
 ('dont', 88298),
 ('people', 85228),
 ('like', 84064),
 ('politics', 83948),
 ('man', 83442),
 ('amp', 80070),
 ('love', 78591),
 ('im', 76677),
 ('one', 72957),
 ('police', 71340),
 ('get', 70330),
 ('world', 64247),
 ('local', 60820),
 ('obama', 59935),
 ('time', 53993),
 ('breaking', 53911),
 ('make', 52786),
 ('life', 52701),
 ('know', 50646),
 ('day', 48114),
 ('video', 47259),
 ('never', 47129),
 ('says', 46439),
 ('good', 46437),
 ('black', 45813),
 ('want', 45147),
 ('go', 43943),
 ('workout', 43845),
 ('need', 40676),
 ('president', 39866),
 ('say', 39697),
 ('cant', 39613),
 ('back', 38142),
 ('white', 38029),
 ('https', 37972),
 ('would', 37153),
 ('business', 36458),
 ('see', 36378),
 ('chicago', 35732),
 ('hillary', 35464),
 ('first', 35378),
 ('think', 35339),
 ('right', 34789),
 ('america', 34071),
 ('health', 33881),
 ('woman', 33349),
 ('via',

# Most frequently occuring hashtags

In [69]:
hashtags = ira_english[['hashtags']].values
hashtag_count = defaultdict(int)
for entry in hashtags:
    if isinstance(entry[0], str):
        all_hashtags = entry[0].strip('[,]').split()
        for hashtag in all_hashtags:
            hashtag_count[hashtag] += 1

Below is a list of the 100 most frequently hashtags in tweets, sorted in the decreasing order of counts.

In [70]:
sorted(hashtag_count.items(), key=operator.itemgetter(1), reverse=True)[:100]

[('news', 225945),
 ('sports', 96302),
 ('politics', 74754),
 ('local', 31425),
 ('world,', 27522),
 ('local,', 22466),
 ('health', 21560),
 ('business', 18935),
 ('TopNews', 16245),
 ('Chicago', 15827),
 ('news,', 15796),
 ('breaking', 13402),
 ('NewYork', 11760),
 ('MAGA', 11200),
 ('Foke', 11090),
 ('showbiz', 10869),
 ('tech', 10539),
 ('love', 10287),
 ('quote,', 10031),
 ('entertainment', 9220),
 ('love,', 8943),
 ('Fukushima2015', 8468),
 ('SanJose', 8060),
 ('MAGA,', 7963),
 ('rap', 7864),
 ('business,', 7820),
 ('BlackLivesMatter', 7690),
 ('Chicago,', 7680),
 ('crime', 7641),
 ('true', 7313),
 ('Texas,', 7191),
 ('tcot,', 6841),
 ('FukushimaAgain', 6657),
 ('PJNET', 6633),
 ('true,', 5890),
 ('Texas', 5829),
 ('money', 5818),
 ('USA', 5786),
 ('environment', 5748),
 ('ColumbianChemicals', 5666),
 ('Cleveland,', 5469),
 ('USA,', 5463),
 ('BlackLivesMatter,', 5024),
 ('baseball', 4916),
 ('SanDiego', 4874),
 ('America', 4853),
 ('StLouis', 4756),
 ('top', 4737),
 ('tcot', 4595)

# Getting list of followers of the fake accounts using account IDs

In [78]:
all_ids_temp = ira_english[['userid']].values
all_ids = set()
for id_ in all_ids_temp:
    if id_[0].isdigit():
        all_ids.add(id_[0])

In [8]:
sorted_fc = ira_english.sort_values(['follower_count', 'following_count', 'quote_count', 'reply_count', 'like_count', 'retweet_count'], ascending=False)
sorted_fc = sorted_fc.groupby(['userid', 'user_display_name'])[['follower_count', 'following_count', 'quote_count', 'reply_count', 'like_count', 'retweet_count']].mean()
sorted_fc = sorted_fc.sort_values(['follower_count', 'following_count', 'quote_count', 'reply_count', 'like_count', 'retweet_count'], ascending=False)

- Below is a truncated list of the top 100 accounts sorted by account activity (which includes number of followers, number of replies, retweets and quotes and number of people followed by the account). The table includes the account ids which we will use to retrieve a list of followers using the twitter apis, the account username, the of followers, the number of people followed by the account, the average quote count, the average reply count, the average like count and the average tweet count.
- It is evident that most of the top accounts are of Russian origin.

In [9]:
sorted_fc[:100]

Unnamed: 0_level_0,Unnamed: 1_level_0,follower_count,following_count,quote_count,reply_count,like_count,retweet_count
userid,user_display_name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2527472164,Вестник Москвы,257638,544,0.166667,0.166667,9.666667,4.500000
508761973,Вестник Петербурга,149672,1024,0.000000,0.055556,1.083333,2.000000
4224729994,Tennessee,147767,74664,49.569508,60.092487,666.755435,570.817792
2808833544,Максим Дементьев,134805,2796,0.006667,0.306667,2.146667,1.060000
449689677,Рамзан Кадыров,123989,10,0.000000,2.847826,5.413043,13.695652
2648734430,Вестник Крыма,106462,386,0.142857,0.714286,1.000000,0.285714
3676820373,Вестник Красноярска,85293,316,0.000000,0.000000,1.000000,1.000000
2665564544,Мюсли Лаврова,84642,2575,0.000000,0.108696,0.239130,0.565217
2882331822,Jenna Abrams,79152,22607,2.254825,4.691032,30.377963,23.453089
4272870988,Pamela Moore,72121,42080,34.008049,40.015112,379.541721,364.332129


In [None]:
URL = 'https://api.twitter.com/1.1/followers/list.json'
for id_ in all_ids:
    PARAMS = {"userid" : id_}
    r = requests.get(url = URL, params = PARAMS)
    data = r.json()

In [34]:
def wordcloud(text):
    wordcloud = WordCloud(background_color="white", stopwords=stopwords, random_state = 42
                         ).generate(text)
    plt.figure( figsize=(20,10), facecolor='k')
    plt.imshow(wordcloud)
    plt.axis("off") 