# NaziAnalyzer
This notebook will analyze a dataset of Nazi profiles and tweets.

In [128]:
import json, re
import nltk
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
stop_words.update(set(stopwords.words('spanish')))

In [136]:
more_stop_words = {'via', 'like', 'one', 'us', 'it\'s', 'get', 'don\'t', 
                   'would', 'want', '=', 'i\'m', 'back', 'many', 'know', 
                  'new', 'time', 'see', 'go', 'think', 'today', 'say', 'day', 
                  'you\'re', 'going', 'well', 'must', 'really', 'u', 'really', 
                  'lol', 'much', 'still', 'look', 'that\'s', 'even', 'last', 'way'}
stop_words.update(more_stop_words)
len(stop_words)

495

In [125]:
def remove_ats(string):
    # Remove any @s (e.g., replies) from a string.
    return re.sub(r'(@\S+\s)', "", string)

def remove_urls(string):
    # Remove URLs from a string.
    return re.sub(r'(https?://\S*)', "", string)

def remove_punctuation(string):
    # Remove punctuation from a string.
    string = string.replace('+', ' ').replace('.', ' ').replace(',', ' ').replace(':', ' ')
    string = string.replace('(', ' ').replace(')', ' ').replace('\[', ' ').replace('\]', ' ')
    string = string.replace('\{', ' ').replace('\}', ' ').replace('\\', ' ').replace('/', ' ')
    string = string.replace(' -', ' ').replace('…', ' ').replace('\"', ' ').replace('&amp;', ' ')
    string = string.replace('?', ' ').replace('!', ' ').replace('“', ' ').replace('”', ' ')
    string = string.replace('   ', ' ').replace('  ', ' ')
    string = string.replace('\n', ' ')
    return string

def test_1488(string):
    # Detect 1488 and append the text "FourteenEightyEight"
    #
    # Match 14 and 88 only if no digit precedes nor follows.
    # 14 and 88 may be separated by a single non-digit.
    # e.g., matches: 1488, 14/88, 14.88, asdf1488jkl, 13-14-88a, a14 88b.
    # e.g., non-matches: 14288, 14--88, 5551488555, 714/88.
    if re.match(r".*(?<!\d)14[\D]?88(?!\d)", string):
        string += " FourteenEightyEight"
        return string
    else:
        return string

def remove_digits(string):
    # Remove extraneous digits
    string = re.sub(r'\s(\d+)\W', ' ', string)
    return string

def clean_tweet(string):
    # Detect 1488, remove digits, remove @s and URLs, and remove punctuation from a string.
    string = test_1488(string)
    string = remove_digits(string)
    string = remove_ats(string)
    string = remove_urls(string)
    string = remove_punctuation(string)
    return string

def get_full_text(status):
    # Get the full text from a status object.
    try:
        if 'retweeted_status' in status:
            return status['retweeted_status']['full_text']
        elif 'full_text' in status:
            return status['full_text']
        elif 'text' in status:
            return status['text']
        else:
            print("Error in get_full_text(" + str(status['id']) + ")")
            return ""
    except BaseException as e:
        print("Error in get_full_text(" + str(status['id']) + "): ", e)
        return ""

In [140]:
json_data=open("data/nazi_tweets-2017-11-14-13_41_56.json").read()
#json_data=open("data/nazi_tweets-2017-11-14-12_27_33.json").read()
#json_data=open("data/nazi_tweets-2017-11-13-17_22_52.json").read()
#json_data=open("tweets-test-PaleoconWoman.json").read()
#json_data = open("tweet.json").read()
data = json.loads(json_data)

tweet_data = [get_full_text(status) for status in data]
clean_data = [clean_tweet(tweet) for tweet in tweet_data]
print(len(clean_data))

114240


In [None]:
for i in range(40,80):
    print(clean_data[i])

In [141]:
word_tokens = []

for status in clean_data:
    # Tokenize words, remove empty words, and lowercase all words.
    new_tokens = [word.lower() for word in status.split(' ') if word != '' and word != ' ']
    word_tokens.extend(new_tokens)

filtered_words = []
for w in word_tokens:
    if w not in stop_words:
        filtered_words.append(w)    

fdist = nltk.FreqDist(filtered_words)

for word, frequency in fdist.most_common(100):
    print(u'{};{}'.format(word, frequency))

white;7263
people;5937
trump;2393
right;2188
good;2117
world;1820
never;1753
man;1706
need;1693
jews;1666
black;1663
race;1542
country;1521
great;1501
make;1488
whites;1464
america;1376
video;1361
hate;1333
take;1313
#maga;1267
stop;1246
love;1230
women;1224
war;1224
#whitegenocide;1179
media;1174
can't;1134
left;1127
please;1080
president;1074
real;1046
years;1038
every;1031
nothing;1011
racist;1002
followers;1001
men;988
first;985
fuck;982
national;953
always;931
thing;928
live;928
jewish;922
got;918
shit;918
also;911
true;903
children;899
yes;898
europe;886
said;883
keep;880
ever;877
fourteeneightyeight;876
free;867
news;867
muslim;859
could;850
let;843
another;839
twitter;838
truth;838
hitler;830
@youtube;827
come;820
antifa;812
history;811
year;809
die;808
support;807
fight;807
work;798
thank;795
american;784
believe;784
life;783
god;773
better;771
made;767
old;754
jew;753
oh;748
muslims;748
help;746
call;745
police;744
says;730
wrong;728
give;716
bad;715
doesn't;713
fucking;713
h

In [143]:
print(str(len(clean_data)))
print(str(len(filtered_words)))

114240
829145


In [149]:
fdist['words']

373