In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import html
import re
import string
import spacy
import math
#....................................

#import gensim
import texthero as hero
#from texthero import preprocessing
import wordcloud
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
#.....................................

import nltk
from nltk import word_tokenize
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer


# 1. Dataset

In [None]:
# Load the four pickled tweet frames (full set, then the left / right / non-extremist files).
# NOTE: unpickling can execute arbitrary code -- only load files from a trusted source.
(
    df_extremists,
    df_left_extremists,
    df_right_extremists,
    df_nonex,
) = (
    pd.read_pickle(pickle_path)
    for pickle_path in (
        r'all_extremists.pkl',
        r'all_left_extremists.pkl',
        r'all_right_extremists.pkl',
        r'non_extremists.pkl',
    )
)

In [None]:
# Print (rows, columns) for each frame; the recorded row counts are noted per frame.
for frame in (
    df_extremists,        # 851860
    df_left_extremists,   # 276585
    df_right_extremists,  # 279831
    df_nonex,             # 295444
):
    print(frame.shape)

In [None]:
# Print the number of distinct screen names in each frame; recorded values are noted per frame.
for frame in (
    df_right_extremists,  # 100
    df_left_extremists,   # 100
    df_nonex,             # 100
    df_extremists,        # 300
):
    print(frame['user.screen_name'].nunique())

In [None]:
# Display the full frame as the cell output.
df_extremists # recorded size: 851860 rows × 13 columns

## 1.1 Create smaller dataset

User features:
- screenname, description, tweets, friends, followers, verified, listed



Tweet features:
- text, created_at, retweet count, favorite count

In [None]:
# Keep only the identifier, label, author, text and language columns.
selected_columns = ['id', 'label', 'user.screen_name', 'text', 'lang']
df_extreme = pd.DataFrame(df_extremists[selected_columns])  # 851860 rows
df_extreme

# ex.to_csv("extremist_tweets.csv")

## 1.2 Check for empty rows and null values

In [None]:
# Rows whose tweet text is the empty string. A bare expression in the middle
# of a cell is never rendered, so report the count explicitly.
empty_text_rows = df_extreme.loc[df_extreme['text'] == '']
print(f"Rows with empty text: {len(empty_text_rows)}")

# Null count per column (last expression, so the notebook renders it).
df_extreme.isnull().sum()

In [None]:
# Tweets per label and each label's share of the total (851860 tweets overall).
label_counts = df_extreme.groupby('label')['text'].count()
tweets = label_counts.to_frame(name='count')
total_tweets = tweets['count'].sum()
tweets['percentage'] = ((tweets['count'] / total_tweets) * 100).round(1)
print(tweets)

# Bar chart of the counts, annotated with the percentage of each group.
fig = px.bar(
    tweets,
    y="count",
    text="percentage",
    labels=dict(label="Extremist groups", count=""),
)
fig.update_layout(title="Percentage of tweets per group", title_x=0.5, showlegend=False)
fig.show()

# Recorded shares: 32.5% - 34.7% - 32.8%

## 1.4 Numeric coding labels

In [None]:
# Integer code assigned to each group label.
label_code = dict(LWE=0, NE=1, RWE=2)

# Swap the string labels in the 'label' column for their integer codes.
df_extreme = df_extreme.replace(to_replace={'label': label_code})
df_extreme

In [None]:
# df_extreme = df_extreme.reset_index(drop=True)

In [None]:
# extreme = ex[((ex['label'] == 'LWE') | (ex['label'] == 'RWE'))]
# nonextreme = ex[ex['label'] == 'NE']
# # 649701

In [None]:
# print(extreme.shape) # 0.625%
# print(nonextreme.shape) # 0.375%

# 2. Preprocessing

Steps to clean text:
1. Remove noise:
    - remove URLS + HTML tags
    - remove mentions
    - remove hashtags
    - remove numbers
    - remove emojis
    - remove punctuation
    - remove retweets
    - remove empty values


2. Deep cleaning
    - all text to lowercase
    - remove stopwords + add own stopwords
    - expand contractions (let's → let us, etc.)
    - stemming
    - lemmatization


In [None]:
# Clean an independent deep copy so df_extreme itself stays untouched.
df_clean = df_extreme.copy(deep=True)  # 851860 rows
df_clean.shape

## Step 1. Remove noise

In [None]:
def removeEncodingCharacters(tweetColumn):
    """Blank out every stand-alone single ASCII letter in ``tweetColumn``, in place.

    A match is one letter (a-z or A-Z) with a word boundary on both sides.
    The Series is modified in place; nothing is returned.
    """
    single_letter = r'\b[a-zA-Z]\b'
    tweetColumn.replace(to_replace=single_letter, value='', regex=True, inplace=True)

# removeEncodingCharacters mutates the Series it receives. Run it on an owned
# copy and assign the result back, so the frame is updated even when pandas
# Copy-on-Write makes `df_clean['text']` behave as a copy.
text_column = df_clean['text'].copy()
removeEncodingCharacters(text_column)
df_clean['text'] = text_column

In [None]:
# Strip single quotes, double quotes and commas from the raw text.
# `regex=True` is required: since pandas 2.0 `str.replace` treats the pattern
# as a literal string by default, which would make this line a no-op.
# NOTE(review): this also removes apostrophes, while the later contraction
# expansion matches on apostrophes -- confirm the intended order of the steps.
df_clean['text'] = df_clean['text'].str.replace(r'[\'\",]+', '', regex=True)

In [None]:
### Remove duplicates

In [None]:
# Drop rows whose 'text' duplicates an earlier row.
df_clean = df_clean.drop_duplicates(subset='text')
df_clean.shape

In [None]:
def removeRetweets(df):
    """Return a copy of ``df`` without the rows whose 'text' starts with 'RT'.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'text' column.

    Returns
    -------
    pandas.DataFrame
        An independent copy, so later column assignments on the result do not
        raise SettingWithCopyWarning or touch ``df``.
    """
    # na=False: rows with missing text are not retweets; without it the mask
    # contains NaN and the `~` inversion fails.
    is_retweet = df['text'].str.startswith('RT', na=False)
    return df[~is_retweet].copy()

# Filter out retweets; 627429 rows were recorded after this step.
df_clean = removeRetweets(df=df_clean)
df_clean.shape

In [None]:
def removeURLs(tweetColumn):
    """Delete, in place, every run of non-space characters that starts with
    '@', 'http://', 'https://' or 'www' (the pattern also accepts 'htt://')."""
    link_or_handle = r"(?:\@|http?\://|https?\://|www)\S+"
    tweetColumn.replace(to_replace=link_or_handle, value="", regex=True, inplace=True)

def removeEmojis(tweetColumn):
    """Delete emoji and related symbol code points from ``tweetColumn``, in place.

    The character class lists the code-point ranges to remove. The previous
    pattern also had '(', '|' and ')' inside the brackets; within a character
    class those are literal characters, so ordinary parentheses and pipes were
    stripped as well. They are no longer part of the class.
    """
    emoji_pattern = (
        "["
        "\U0001F600-\U0001F92F"
        "\U0001F300-\U0001F5FF"
        "\U0001F680-\U0001F6FF"
        "\U0001F190-\U0001F1FF"
        "\U00002702-\U000027B0"
        "\U0001F926-\U0001FA9F"
        "\u200d"
        "\u2640-\u2642"
        "\u2600-\u2B55"
        "\u23cf"
        "\u23e9"
        "\u231a"
        "\ufe0f"
        "]"
    )
    tweetColumn.replace(emoji_pattern, "", regex=True, inplace=True)
    
def unescapeHTML(tweetColumn):
    """Return a new Series in which each value has been passed through ``html.unescape``.

    Unlike the other cleaners in this cell, this one does not modify its
    input; the caller must use the returned Series.
    """
    decoded = tweetColumn.apply(html.unescape)
    return decoded

def removeMentions(tweetColumn):
    """Delete '@handle' mentions from ``tweetColumn``, in place.

    A handle is '@' followed by letters, digits or underscores. The underscore
    is included so that a handle such as '@john_doe' is removed in full instead
    of leaving '_doe' behind.
    """
    tweetColumn.replace("@[A-Za-z0-9_]+", "", regex=True, inplace=True)
    
def removeNewLines(tweetColumn):
    """Replace every line break (CRLF, CR or LF) in ``tweetColumn`` with a space, in place.

    A space is used rather than the empty string so that words on adjacent
    lines are not glued together ('hello\\nworld' -> 'hello world').
    """
    tweetColumn.replace(r"(\r\n|\r|\n)", " ", regex=True, inplace=True)

def removeHashtags(tweetColumn):
    """Delete every '#' character from ``tweetColumn`` in place; the tag text itself stays."""
    hash_sign = '([#])'
    tweetColumn.replace(to_replace=hash_sign, value='', regex=True, inplace=True)
    
def removeNumbers(tweetColumn):
    """Delete every ASCII digit (0-9) from ``tweetColumn``, in place."""
    any_digit = "[0-9]"
    tweetColumn.replace(to_replace=any_digit, value="", regex=True, inplace=True)
    
def splitWordsInHashtag(tweetColumn):
    """Insert a space before each capital letter that is followed by lowercase letters, in place.

    This splits CamelCase runs such as 'MakeAmericaGreat'. The pattern is
    applied to the whole text, not only to former hashtags.
    """
    capitalised_word = '([A-Z][a-z]+)'
    tweetColumn.replace(to_replace=capitalised_word, value=r' \1', regex=True, inplace=True)
    
def removeEmptyValues(df, column="cleaned_text_punc"):
    """Drop, in place, the rows of ``df`` whose ``column`` is empty or missing.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify in place.
    column : str, default "cleaned_text_punc"
        Column that decides whether a row is dropped. The default keeps the
        previous hard-coded behaviour.

    Notes
    -----
    Empty strings are first turned into NaN in *every* column of ``df``, as
    before; only ``column`` is used to decide which rows are removed.
    """
    df.replace("", np.nan, inplace=True)
    df.dropna(subset=[column], inplace=True)

# combine all cleaning functions together
#.................................................................................
def noiseCleaner(df, columnToBeCleaned):
    """Run every noise-removal step on ``df[columnToBeCleaned]`` and return ``df``.

    Steps, in order: URLs/@-tokens, emojis, HTML entities, mentions, line
    breaks, '#' signs, digits, CamelCase splitting, then removal of rows left
    empty.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean; it is modified in place and also returned.
    columnToBeCleaned : str
        Name of the text column to clean.

    Notes
    -----
    The helpers mutate the Series they are given. They are run on an owned
    copy that is written back afterwards, instead of on ``df[col]`` directly:
    with pandas Copy-on-Write a column selected from a frame behaves as a
    copy, and in-place edits on it never reach the frame.
    ``removeEmptyValues(df)`` drops rows based on the 'cleaned_text_punc' column.
    """
    column = df[columnToBeCleaned].copy()

    removeURLs(column)
    removeEmojis(column)
    column = unescapeHTML(column)  # returns a new Series rather than mutating
    removeMentions(column)
    removeNewLines(column)
    removeHashtags(column)
    removeNumbers(column)
    splitWordsInHashtag(column)

    df[columnToBeCleaned] = column
    removeEmptyValues(df)

    return df

# Seed the working column with the current text, clean it, then count missing values
#.................................................................................
df_clean['cleaned_text_punc'] = df_clean['text']

df_clean = noiseCleaner(df=df_clean, columnToBeCleaned="cleaned_text_punc")

# Missing values per column after noise removal.
df_clean.isna().sum()

In [None]:
# Frame size after noise removal.
df_clean.shape # 610862 rows recorded

## Step 2. Deep cleaning

In [None]:
# Show full cell contents instead of truncating long tweets. `None` means
# "no limit"; passing -1 for this option has been deprecated since pandas 1.0.
pd.set_option('display.max_colwidth', None)
df_clean['cleaned_text_punc'].sample(10)

### Convert to lowercase

In [None]:
def heavyCleaner(series):
    """Return ``series`` passed through a four-step texthero pipeline.

    The steps are ``fillna``, ``lowercase``, ``remove_whitespace`` and
    ``remove_diacritics``, in that order. The result of ``hero.clean`` is
    returned; the caller has to assign it.
    """
    steps = [
        hero.fillna,
        hero.lowercase,
        hero.remove_whitespace,
        hero.remove_diacritics,
    ]
    cleaned = hero.clean(series, steps)
    return cleaned

# Overwrite the working column with its heavyCleaner output.
df_clean['cleaned_text_punc'] = heavyCleaner(series=df_clean['cleaned_text_punc'])

### Split Contractions

In [None]:
# https://www.analyticsvidhya.com/blog/2020/04/beginners-guide-exploratory-data-analysis-text-data/

# Dictionary of English contractions and their expansions.
# NOTE: "'s" is expanded to " is", which also rewrites possessives ("john's" -> "john is").
contractions_dict = { "ain't": "are not","'s":" is","aren't": "are not", "can't": "cannot","can't've": "cannot have",
                     "'cause": "because","could've": "could have","couldn't": "could not","couldn't've": "could not have", 
                     "didn't": "did not","doesn't": "does not", "don't": "do not","hadn't": "had not","hadn't've": "had not have",
                     "hasn't": "has not","haven't": "have not","he'd": "he would", "he'd've": "he would have","he'll": "he will", 
                     "he'll've": "he will have","how'd": "how did","how'd'y": "how do you","how'll": "how will","I'd": "I would", 
                     "I'd've": "I would have","I'll": "I will","I'll've": "I will have","I'm": "I am","I've": "I have", "isn't": "is not",
                     "it's": "it is", "it'd": "it would","it'd've": "it would have","it'll": "it will", "it'll've": "it will have", 
                     "let's": "let us","ma'am": "madam", "mayn't": "may not","might've": "might have","mightn't": "might not", 
                     "mightn't've": "might not have","must've": "must have","mustn't": "must not","mustn't've": "must not have", 
                     "needn't": "need not","needn't've": "need not have","o'clock": "of the clock","oughtn't": "ought not",
                     "oughtn't've": "ought not have","shan't": "shall not","sha'n't": "shall not","shan't've": "shall not have",
                     "she'd": "she would","she'd've": "she would have", "she'll": "she will", 
                     "she'll've": "she will have","should've": "should have","shouldn't": "should not", "shouldn't've": "should not have",
                     "so've": "so have","that'd": "that would","that'd've": "that would have", "there'd": "there would",
                     "there'd've": "there would have", "they'd": "they would","they'd've": "they would have","they'll": "they will",
                     "they'll've": "they will have", "they're": "they are","they've": "they have","to've": "to have",
                     "today's": "today is", "wasn't": "was not","we'd": "we would","we'd've": "we would have","we'll": "we will",
                     "we'll've": "we will have", "we're": "we are","we've": "we have", "weren't": "were not","what'll": "what will",
                     "what'll've": "what will have","what're": "what are", "what've": "what have","when've": "when have","where'd": "where did", 
                     "where've": "where have","who'll": "who will","who'll've": "who will have","who've": "who have",
                     "why've": "why have","will've": "will have","won't": "will not", "won't've": "will not have", 
                     "would've": "would have","wouldn't": "would not","wouldn't've": "would not have", "w/": "with",
                     "y'all": "you all", "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are",
                     "y'all've": "you all have", "you'd": "you would","you'd've": "you would have","you'll": "you will",
                     "you'll've": "you will have", "you're": "you are", "you've": "you have"}

# Kept so expand_contractions can tell the default mapping from a custom one
# (inside the function the parameter shadows the module-level name).
_DEFAULT_CONTRACTIONS = contractions_dict


def _compile_contractions(mapping):
    """Build a (compiled regex, lowercase lookup) pair for a contraction mapping."""
    lookup = {key.lower(): value for key, value in mapping.items()}
    # Longest keys first: regex alternation takes the first alternative that
    # matches, so "can't" must not be tried before "can't've".
    ordered_keys = sorted(lookup, key=len, reverse=True)
    # re.escape protects keys containing regex metacharacters. re.ASCII limits
    # case-insensitive matching to ASCII letters.
    pattern = re.compile('(%s)' % '|'.join(re.escape(key) for key in ordered_keys),
                         re.IGNORECASE | re.ASCII)
    return pattern, lookup


# Regular expression for finding contractions (case-insensitive, longest match first)
contractions_re, _contractions_lookup = _compile_contractions(contractions_dict)


# Function for expanding contractions
def expand_contractions(text, contractions_dict=contractions_dict):
    """Return ``text`` with every known contraction replaced by its expansion.

    Parameters
    ----------
    text : str
        Input text. Matching ignores case, so already-lowercased text
        ("i'm") matches capitalised keys ("I'm").
    contractions_dict : dict, optional
        Mapping of contraction -> expansion. Defaults to the module-level
        dictionary; a custom mapping gets its own regex compiled per call.

    Returns
    -------
    str
        The expanded text. Text without contractions is returned unchanged.
    """
    if contractions_dict is _DEFAULT_CONTRACTIONS:
        pattern, lookup = contractions_re, _contractions_lookup
    else:
        pattern, lookup = _compile_contractions(contractions_dict)

    if not lookup:
        # An empty mapping would compile to a pattern matching the empty string.
        return text

    def replace(match):
        found = match.group(0)
        # Fall back to the matched text if it is somehow missing from the lookup.
        return lookup.get(found.lower(), found)

    return pattern.sub(replace, text)

# Expand the contractions in every tweet of the working column
df_clean['cleaned_text_punc'] = df_clean['cleaned_text_punc'].apply(expand_contractions)

In [None]:
# Derive 'cleaned_text' by running texthero's clean() with its default pipeline
# (no pipeline argument is passed).
punctuated_text = df_clean['cleaned_text_punc']
df_clean['cleaned_text'] = hero.clean(punctuated_text)

### Remove Punctuation

In [None]:
# Recompute 'cleaned_text' from 'cleaned_text_punc' with texthero's default
# pipeline. The statement is identical to the one in the preceding code cell.
source_text = df_clean['cleaned_text_punc']
df_clean['cleaned_text'] = hero.clean(source_text)

In [None]:
# remove punctuation
def removePunctuation(tweetColumn):
    """Replace every character that is neither a word character nor whitespace with a space, in place.

    The pattern is a raw string: in a normal string literal '\\w' and '\\s' are
    invalid escape sequences, which newer Python versions warn about.
    Underscores count as word characters and are therefore kept.
    """
    tweetColumn.replace(r'[^\w\s]', " ", regex=True, inplace=True)

# removePunctuation mutates the Series it receives. Run it on an owned copy
# and assign the result back, so the frame is updated even when pandas
# Copy-on-Write makes `df_clean['cleaned_text']` behave as a copy.
cleaned_column = df_clean['cleaned_text'].copy()
removePunctuation(cleaned_column)
df_clean['cleaned_text'] = cleaned_column

### Remove stopwords

In [None]:
# NLTK English stop words. `stop` stays a list (as before); membership is
# tested against a set so each token lookup is O(1) instead of a list scan.
stop = stopwords.words("english")
stop_lookup = set(stop)
df_clean['cleaned_text'] = df_clean['cleaned_text'].apply(
    lambda text: " ".join(token for token in text.split() if token not in stop_lookup)
)


In [None]:
# Additional, corpus-specific stop words (abbreviations, slang, encoding residue).
# Fixes relative to the earlier list:
#   - a missing comma after "den" fused "den" and "yo" into the single entry
#     "denyo", so neither word was actually filtered;
#   - "Wan" is now "wan": the text is lowercased before stop words are removed,
#     so a capitalised entry could never match.
add_words = ["pm", "th", "im", "u", "w", "via", "sc", "est", "st", "dr", "cf", "iii", "ill", "en", "oh", "co", "rd",
            "nd", "et", "rt", "ca", "al", "ag", "ye", "hs", "ky", "awd", "de", "ya", "ed", "ha", "m", "hi", "con", 
             "ex", "va", "que", "mi", "pr", "yr", "fo", "ten", "se", "nh", "dhs", "wa", "wi", "as", "fl", "wv", "den",
             "yo", "wh", "e", "op", "mt", "il", "tx", "ppp", "da", "aka", "u", "ppl", "jr", "afa", "fr", "az", "mag", 
             "ou", "ar", "su", "per", "doj", "pt", "aka", "nc", "sec", "lee", "em", "ne", "pst", "lo", "ho", "po", 
             "tha", "dec", "tru", "ac", "mar", "mc", "lea", "dem", "ml", "utr", "huh", "pdx", "unm", "jax", "yup", 
             "yep", "wtf", "lol", "hey", "omg", "nah", "gqp", "yay", "img", "tbt", "fyi", "ugh", "duh", "fam", "yea", 
             "tho", "ily", "pic", "sum", "bff", "buh", "bye", "fav", "hbd", "nsr", "min", "ms", "hrs", "umm", "yum", 
             "cgi", "ai", "nt", "yas", "atl", "bhm", "irl", "fbf", "ter", "fs", "ish", "lib", "ley", "wut", "mlk", 
             "nkc", "nt", "oof", "pls", "rep", "rva", "sry", "tbh", "te", "ura", "wan", "ah", "awe", "gon", "woo",
             "hoo", "aww", "aye", "ayo", "bbm", "dox", "ftw", "doin", "ok", "ijs", "jan", "jeb", "jen", "ff", "ep", 
             "jus", "kel", "tec", "kno", "koo", "lbs", "luh", "luv", "mtl", "cdt", "tri", "tw", "th", "fai", "fro", 
             "cst", "fm", "cou", "br", "mnt", "imm", "tnm", "int", "el", "paso", "thetnm", "cu", "wo", "si", "ope", 
             "wot", "yes", "xe", "xf", "xa", "xs", "xc", "xd", "xef", "xb", "xm", "xre", "xt", "etc", "xaa", "xba",
            "xcp", "iuic", "pttw", "tix", "uno", "xbn", "por", "los", "la", "del", "xbos", "kjv", "tldr", "xadoes", 
             "mr", "acab", "dj", "go", "live", "life", "like", "see", "want", "let", "may", "way", "say", "xve", 
             "xbe", "xll", "ni", "xbc", "xbf"]

# NLTK English stop words as a set, then the same set extended with add_words.
stop_words = set(stopwords.words("english"))
stop_added = stop_words | set(add_words)

In [None]:
# Rebuild each tweet from the whitespace-separated tokens that are not in stop_added.
df_clean['cleaned_text'] = df_clean['cleaned_text'].apply(
    lambda text: " ".join(token for token in text.split() if token not in stop_added)
)

### Remove single letters

In [None]:
# Remove stand-alone single letters. The result is assigned back instead of
# using inplace=True on the selected column: with pandas Copy-on-Write an
# in-place edit on `df_clean['cleaned_text']` does not update the frame.
df_clean['cleaned_text'] = df_clean['cleaned_text'].replace(r"\b[a-zA-Z]\b", "", regex=True)

In [None]:
# Frequency of every whitespace-separated token across all cleaned tweets.
all_tokens = ' '.join(df_clean['cleaned_text']).split()
freq = pd.Series(all_tokens).value_counts()
freq

In [None]:
## hashtag splitter
##.................................................................................................
# import nltk
# from nltk.corpus import words, brown

# word_dictionary = list(set(words.words()))

# for alphabet in "bcdefghjklmnopqrstuvwxyz":
#     word_dictionary.remove(alphabet)

# def split_hashtag_to_words_all_possibilities(hashtag):
#     all_possibilities = []
    
#     split_posibility = [hashtag[:i] in word_dictionary for i in reversed(range(len(hashtag)+1))]
#     possible_split_positions = [i for i, x in enumerate(split_posibility) if x == True]
    
#     for split_pos in possible_split_positions:
#         split_words = []
#         word_1, word_2 = hashtag[:len(hashtag)-split_pos], hashtag[len(hashtag)-split_pos:]
        
#         if word_2 in word_dictionary:
#             split_words.append(word_1)
#             split_words.append(word_2)
#             all_possibilities.append(split_words)
            
#             another_round = split_hashtag_to_words_all_possibilities(word_2)
            
#             if len(another_round) > 0:
#                 all_possibilities = all_possibilities + [[a1] + a2 for a1, a2, in zip([word_1]*len(another_round), another_round)]
#         else:
#             another_round = split_hashtag_to_words_all_possibilities(word_2)
            
#             if len(another_round) > 0:
#                 all_possibilities = all_possibilities + [[a1] + a2 for a1, a2, in zip([word_1]*len(another_round), another_round)]
                
#     return all_possibilities

# split_hashtag_to_words_all_possibilities("sheiscoming")

### Remove empty rows

In [None]:
# Keep only rows whose cleaned text is non-empty once surrounding whitespace is stripped.
has_text = df_clean['cleaned_text'].str.strip().astype(bool)
df_clean = df_clean[has_text]
df_clean.shape

### Remove non-English tweets

In [None]:
# Language codes to keep; rows with any other 'lang' value are dropped.
kept_langs = ['ca', 'cy', 'da', 'de', 'en', 'es', 'et', 'eu', 'in', 'it',
              'lt', 'nl', 'no', 'pl', 'pt', 'ro', 'tl', 'und']
df_clean = df_clean[df_clean['lang'].isin(kept_langs)]

### Reset index of dataframe

In [None]:
# Renumber the rows from 0 and discard the old index.
df_clean = (
    df_clean
    .reset_index(drop=True)
)
df_clean.shape  # 590216 rows recorded

In [None]:
# Token frequencies over the cleaned tweets; show the 60 most frequent.
all_tokens = ' '.join(df_clean['cleaned_text']).split()
freq = pd.Series(all_tokens).value_counts()
freq[:60]

### Lemmatization

In [None]:
import spacy

#tokenizer = nltk.tokenize.WhitespaceTokenizer()
#lemmatizer = nltk.stem.WordNetLemmatizer()

# Load the 'en_core_web_sm' pipeline with the parser and NER components disabled
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# def lemmatization(text):
#     return [lemmatizer.lemmatize(x) for x in tokenizer.tokenize(text)]

def _lemmatize_without_stopwords(text):
    """Return the space-joined lemmas of the tokens in ``text`` that spaCy does not flag as stop words."""
    return ' '.join(token.lemma_ for token in nlp(text) if not token.is_stop)

df_clean['lemmatized_text'] = df_clean['cleaned_text'].apply(_lemmatize_without_stopwords)

### Remove single letters after lemmatization

In [None]:
# Replace 'cleaned_text' with the lemmatized text, minus stand-alone single
# letters. The cleaned Series is assigned directly instead of mutating the
# selected column with inplace=True, which does not update the frame under
# pandas Copy-on-Write.
df_clean['cleaned_text'] = df_clean['lemmatized_text'].replace(r"\b[a-zA-Z]\b", "", regex=True)

In [None]:
# Token frequencies over the lemmatized text; show ranks 61 to 120.
all_tokens = ' '.join(df_clean['cleaned_text']).split()
freq = pd.Series(all_tokens).value_counts()
freq[60:120]

### Remove empty rows

Remove all rows where avg_word_len < 3

In [None]:
def avg_word(sentence):
    """Return the mean length of the whitespace-separated words in ``sentence``.

    Returns 0.0 when the sentence has no words. The empty case is handled
    explicitly instead of adding a small epsilon to the divisor, which made
    every average slightly too small (e.g. 2.49999875 instead of 2.5).
    """
    words = sentence.split()
    if not words:
        return 0.0
    return sum(len(word) for word in words) / len(words)

# Average word length per tweet, rounded to two decimals.
word_length_means = df_clean['cleaned_text'].apply(avg_word)
df_clean['avg_word_len'] = word_length_means.round(2)
df_clean.shape

In [None]:
# Drop rows whose cleaned text is empty after stripping whitespace (589268 rows recorded).
has_text = df_clean['cleaned_text'].str.strip().astype(bool)
df_clean = df_clean[has_text]
df_clean.shape

### Remove rows with an average word length below 3

In [None]:
# Keep rows whose average word length is at least 3.
long_enough = df_clean['avg_word_len'] >= 3
df_clean = df_clean[long_enough]
df_clean.shape

In [None]:
# heavyCleaner returns a new Series and leaves its input untouched, so the
# result has to be stored; previously it was only displayed and then discarded.
# `assign` builds a new frame, which avoids setting a column on a filtered slice.
df_clean = df_clean.assign(cleaned_text=heavyCleaner(df_clean['cleaned_text']))
df_clean['cleaned_text']

### Drop columns

In [None]:
# Drop the helper columns that are not needed in the saved dataset.
helper_columns = ['lemmatized_text', 'lang', 'avg_word_len']
df_final = df_clean.drop(columns=helper_columns)

# 3. Save cleaned dataframe

## Reset index of cleaned dataframe

In [None]:
# Renumber the rows from 0 and discard the old index (581470 rows recorded).
df_final = (
    df_final
    .reset_index(drop=True)
)
df_final

## Save cleaned dataframe

In [None]:
# Persist the cleaned frame as a pickle file in the working directory.
df_final.to_pickle(path="cleaned_tweets.pkl")

In [None]:
# Reload the cleaned frame from disk.
# NOTE: unpickling can execute arbitrary code -- only load files from a trusted source.
df_final = pd.read_pickle(filepath_or_buffer="cleaned_tweets.pkl")

In [None]:
# Tweets per label after cleaning and each label's share (581470 tweets overall).
label_counts = df_final.groupby('label')['cleaned_text'].count()
tweets = label_counts.to_frame(name='count')
total_tweets = tweets['count'].sum()
tweets['percentage'] = ((tweets['count'] / total_tweets) * 100).round(1)
print(tweets)

# Bar chart of the counts, annotated with the percentage of each group.
fig = px.bar(
    tweets,
    y="count",
    text="percentage",
    labels=dict(label="Extremist groups", count=""),
)
fig.update_layout(title="Percentage of tweets per group", title_x=0.5, showlegend=False)
fig.show()

# Recorded shares: 24%, 40.3%, 35.7%

In [None]:
# Number of distinct screen names remaining in each label group.
screen_names_by_label = df_final.groupby('label')['user.screen_name']
screen_names_by_label.nunique()