In [None]:
#.......... for data .................
import pickle
import string
from collections import Counter

import numpy as np
import pandas as pd

#.......... for plotting ..............
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import plotly.tools as tls
import seaborn as sns
import wordcloud
from plotly.subplots import make_subplots
from wordcloud import WordCloud, STOPWORDS

#.......... for text processing .......
import nltk
from nltk import word_tokenize

# Dataset

In [None]:
# Load the tweet dataset from a pickle file in the working directory.
# NOTE(review): unpickling can execute arbitrary code; only load files from a trusted source.
df = pd.read_pickle('balanced_tweets.pkl')
df # 418938 (presumably the row count; verify)

# 1. Text Features

In [None]:
# Work on a copy so that columns added to df_features do not appear on the loaded frame.
df_features = df.copy()

In [None]:
def feature(df):
    """Add surface-level text statistics to ``df`` in place and return it.

    Reads the string columns ``text``, ``cleaned_text`` and
    ``cleaned_text_punc`` and writes these columns:

    - ``word_count``: whitespace-separated tokens in ``cleaned_text``
    - ``char_count``: characters in ``cleaned_text`` with spaces removed
    - ``word_density``: ``word_count / (char_count + 1)``
    - ``punc_count``: ``string.punctuation`` characters in ``cleaned_text_punc``
    - ``tweet_length``: length of ``cleaned_text`` including spaces
    - ``upper_count``: tokens of ``text`` for which ``str.isupper()`` is true
    - ``ratio_upper_lower``: ``upper_count / tweet_length``, rounded to 2 places
    - ``hashtag_count``: tokens of ``text`` starting with ``#``
    - ``num_unique_words``: distinct tokens in ``cleaned_text``
    - ``words_vs_unique``: ``num_unique_words / word_count``
    - ``word_unique_percent``: the same ratio as a percentage, rounded to 2 places

    Ratios whose denominator is zero (empty ``cleaned_text``) are NaN.

    Returns:
        The same DataFrame object that was passed in.
    """
    df['word_count'] = df['cleaned_text'].apply(lambda text: len(text.split()))
    df['char_count'] = df['cleaned_text'].apply(lambda text: len(text.replace(" ", "")))
    # +1 keeps the denominator non-zero for empty tweets.
    df['word_density'] = df['word_count'] / (df['char_count'] + 1)
    df['punc_count'] = df['cleaned_text_punc'].apply(
        lambda text: sum(1 for ch in str(text) if ch in string.punctuation))
    df['tweet_length'] = df['cleaned_text'].apply(len)
    df['upper_count'] = df['text'].apply(
        lambda text: sum(1 for token in text.split() if token.isupper()))

    # Mask zero denominators so that empty tweets give NaN instead of +/-inf.
    nonzero_length = df['tweet_length'].where(df['tweet_length'] != 0)
    df['ratio_upper_lower'] = (df['upper_count'] / nonzero_length).round(2)

    df['hashtag_count'] = df['text'].apply(
        lambda text: sum(1 for token in text.split() if token.startswith('#')))
    df['num_unique_words'] = df['cleaned_text'].apply(lambda text: len(set(text.split())))

    nonzero_words = df['word_count'].where(df['word_count'] != 0)
    df['words_vs_unique'] = df['num_unique_words'] / nonzero_words
    # Round the percentage itself; previously .round(2) was applied to word_count only.
    df["word_unique_percent"] = (df["num_unique_words"] * 100 / nonzero_words).round(2)
    return df

# feature() writes its columns onto the frame it receives and returns that same frame;
# rebind explicitly and preview a few rows instead of rendering the whole table.
df_features = feature(df_features)
df_features.head()

# 2. Part-of-Speech

In [None]:
# Code used from: https://towardsdatascience.com/how-i-improved-my-text-classification-model-with-feature-engineering-98fbe6c13ef3

In [None]:
# Per-class subsets of the feature frame: label 0 -> lwe, 1 -> ne, 2 -> rwe.
lwe = df_features.loc[df_features['label'].eq(0)]
ne = df_features.loc[df_features['label'].eq(1)]
rwe = df_features.loc[df_features['label'].eq(2)]

In [None]:
# Concatenate every cleaned tweet of a class into one space-separated document.
text_lwe = " ".join(lwe['cleaned_text'])
text_ne = " ".join(ne['cleaned_text'])
text_rwe = " ".join(rwe['cleaned_text'])

In [None]:
# Number of whitespace-separated tokens in the concatenated LWE text.
len(text_lwe.split())

In [None]:
# Number of whitespace-separated tokens in the concatenated RWE text.
len(text_rwe.split())

In [None]:
# Difference in token count between the LWE and RWE texts (positive when LWE is larger).
len(text_lwe.split()) - len(text_rwe.split())

In [None]:
class Splitter(object):
    """Break a document into sentences, then word-tokenize every sentence."""

    def __init__(self):
        # Sentence splitter loaded from the NLTK resource 'tokenizers/punkt/english.pickle';
        # words are tokenized with NLTK's TreebankWordTokenizer.
        self.splitter = nltk.data.load('tokenizers/punkt/english.pickle')
        self.tokenizer = nltk.tokenize.TreebankWordTokenizer()

    def split(self, text):
        """Return a list with one token list per sentence found in ``text``."""
        tokenized_sentences = []
        for sentence in self.splitter.tokenize(text):
            tokenized_sentences.append(self.tokenizer.tokenize(sentence))
        return tokenized_sentences

In [None]:
from nltk import WordNetLemmatizer
from nltk.corpus import wordnet
class LemmatizationWithPOSTagger(object):
    """POS-tag tokenized sentences and lemmatize every word using its tag."""

    def __init__(self, lemmatizer=None):
        """Create the tagger.

        Args:
            lemmatizer: optional object exposing ``lemmatize(word, pos)``.
                Defaults to a new ``WordNetLemmatizer``.
        """
        # Held on the instance so pos_tag does not depend on a module-level variable.
        self.lemmatizer = lemmatizer if lemmatizer is not None else WordNetLemmatizer()

    def get_wordnet_pos(self, treebank_tag):
        """Map a Treebank tag to a WordNet POS constant by its first letter.

        Tags starting with J, V, N or R map to ADJ, VERB, NOUN or ADV;
        every other tag falls back to NOUN.
        """
        prefix_to_pos = {
            'J': wordnet.ADJ,
            'V': wordnet.VERB,
            'N': wordnet.NOUN,
            'R': wordnet.ADV,
        }
        return prefix_to_pos.get(treebank_tag[:1], wordnet.NOUN)

    def pos_tag(self, tokens):
        """Tag and lemmatize tokenized sentences.

        Args:
            tokens: list of sentences, each a list of word strings.

        Returns:
            One list per sentence of ``(word, lemma, pos_tag)`` tuples.
        """
        # e.g. [('What', 'WP'), ('can', 'MD'), ('I', 'PRP'), ...] for each sentence
        tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokens]

        # WORD / LEMMA / POSTAG triples, e.g. ('What', 'What', 'WP')
        return [
            [(word, self.lemmatizer.lemmatize(word, self.get_wordnet_pos(tag)), tag)
             for (word, tag) in sentence]
            for sentence in tagged_sentences
        ]

In [None]:
# Instantiate the lemmatizer, the sentence/word splitter and the POS-tagging lemmatizer
# once so the tagging cells can reuse them.
lemmatizer = WordNetLemmatizer()
splitter = Splitter()
lemmatization_using_pos_tagger = LemmatizationWithPOSTagger()

In [None]:
# Split the concatenated LWE text into tokenized sentences, then tag and lemmatize them.
tokens = splitter.split(text_lwe)
lwe_list = lemmatization_using_pos_tagger.pos_tag(tokens)
# Preview the first sentences only; printing the whole tagged corpus floods the output.
print(lwe_list[:30])

In [None]:
# Split the concatenated NE text into tokenized sentences, then tag and lemmatize them.
tokens = splitter.split(text_ne)
ne_list = lemmatization_using_pos_tagger.pos_tag(tokens)
# Uncomment to preview the first 30 tagged sentences:
#print(ne_list[:30])

In [None]:
# Split the concatenated RWE text into tokenized sentences, then tag and lemmatize them.
tokens = splitter.split(text_rwe)
rwe_list = lemmatization_using_pos_tagger.pos_tag(tokens)
# Uncomment to preview the first 30 tagged sentences:
#print(rwe_list[:30])

In [None]:
def pos_tag_features(list_):
    """Summarise POS-tag frequencies of a tagged corpus as a one-row DataFrame.

    Args:
        list_: list of sentences, each a list of ``(word, lemma, pos_tag)``
            tuples; only the word (index 0) and the tag (index 2) are read.

    Returns:
        A DataFrame with a single row (index 0) and one column per feature.
        All columns are raw tag counts except ``"singular noun"``, which is
        the ``NN`` count divided by ``list_weight`` (see below).
        NOTE(review): only that one column is normalised; confirm this is intended.
    """
    # Count every tag once, then sum the groups of interest.
    tag_counts = Counter(token[2] for sentence in list_ for token in sentence)

    def total(*tags):
        return sum(tag_counts[tag] for tag in tags)

    # Character length of the first word of every sentence joined by single spaces.
    # Empty sentences are skipped so they cannot raise IndexError.
    list_weight = len(' '.join(str(sentence[0][0]) for sentence in list_ if sentence))

    POS_DICT = {}
    # NNPS (plural proper noun) was previously missing because 'NNP' was tested twice.
    POS_DICT['Nouns'] = total('NN', 'NNS', 'NNP', 'NNPS')
    POS_DICT['Past'] = total('VBD')
    POS_DICT['Superlative Adverb'] = total('RBS')  # fastest
    POS_DICT['Adjectives'] = total('JJ', 'JJR', 'JJS')
    POS_DICT['Possesive pronoun'] = total('WP$')
    POS_DICT['Personal Pronoun'] = total('PRP')
    POS_DICT['possesive pronoun'] = total('PRP$')
    POS_DICT['superlative adj'] = total('JJS')  # biggest
    POS_DICT['Verb'] = total('VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ')
    # Guard the division so an empty corpus yields 0.0 instead of ZeroDivisionError.
    POS_DICT["singular noun"] = total('NN') / list_weight if list_weight else 0.0  # one person
    POS_DICT["plural names"] = total('NNS')

    return pd.DataFrame(POS_DICT, index=range(1))

## List of POS

In [None]:
# One-row POS-tag summary of the tagged LWE sentences.
lwe_df = pos_tag_features(lwe_list)
lwe_df

In [None]:
# One-row POS-tag summary of the tagged RWE sentences.
rwe_df = pos_tag_features(rwe_list)
rwe_df

In [None]:
# Build a {string label: column} lookup over the POS feature columns of lwe_df.
tagged_form = list(lwe_df.columns)
tagged_label = list(map(str, tagged_form))
tagged_dict = dict(zip(tagged_label, tagged_form))


In [None]:
# Print every POS feature column of the LWE summary; v is not used in this loop.
for k, v in tagged_dict.items():
    print(lwe_df[k])

In [None]:
# Two side-by-side panels: LWE on the left, RWE on the right.
# plotly.tools.make_subplots is deprecated; plotly.subplots.make_subplots is the supported entry point.
fig = make_subplots(
    rows=1,
    cols=2,
    specs=[[{}, {}]],
    subplot_titles=("Verbal distribution of tweets from LWE", "Verbal distribution of tweets from RWE"),
)

In [None]:
# One bar trace per POS feature: LWE values in the left panel, RWE values in the right panel.
# Each entry is (summary frame, bar fill colour, subplot column).
panels = [
    (lwe_df, 'rgb(221,160,221)', 1),
    (rwe_df, 'rgb(239, 243, 198)', 2),
]

for pos_df, fill_color, panel_col in panels:
    for k, v in tagged_dict.items():
        tag_trace = go.Bar(x=pos_df[v], name=str(k), text=k, textposition='auto',
                           marker=dict(color=fill_color, line=dict(color='rgb(8,48,107)', width=1.5)),
                           opacity=0.6, showlegend=False)
        # add_trace(row=, col=) replaces the deprecated append_trace.
        fig.add_trace(tag_trace, row=1, col=panel_col)

# Transparent backgrounds, centred title and a fixed canvas size.
fig.update_layout(
    plot_bgcolor='rgba(0, 0, 0, 0)',
    paper_bgcolor='rgba(0, 0, 0, 0)',
    title_x=0.5,
    height=800,
    width=1000,
)

fig.update_xaxes(title_text="Part of Speech Count", row=1, col=1)
fig.update_xaxes(title_text="Part of Speech Count", row=1, col=2)

fig.show()

In [None]:
def pos(text):
    """Return the POS tags of ``text``, one per token, in token order.

    The text is tokenized with ``word_tokenize`` and tagged with
    ``nltk.pos_tag``. Text that yields no tokens gives an empty list.
    """
    # Reading the tag from each (word, tag) pair avoids the zip transpose,
    # which raised IndexError when there were no tokens.
    return [tag for _, tag in nltk.pos_tag(word_tokenize(text))]

# Tag every LWE tweet, pool the tags and plot the seven most frequent ones.
tags = lwe['cleaned_text'].apply(pos)
tags = [tag for tweet_tags in tags for tag in tweet_tags]
counter = Counter(tags)

# x holds the tag names, y their counts.
x, y = (list(column) for column in zip(*counter.most_common(7)))

sns.barplot(x=y, y=x)

# EDA

In [None]:
# Swap the numeric class codes for readable names; from here on `label` holds
# the strings 'LWE' / 'NE' / 'RWE' instead of 0 / 1 / 2.
df_features['label'] = df_features['label'].replace({0: 'LWE', 1: 'NE', 2: 'RWE'})
df_features['label']

## Statistics

In [None]:
## Average length of a tweet per group

In [None]:
# Display the label column of the feature frame.
df_features['label']

In [None]:
# Mean char_count per label, rounded to two decimals, shown as a bar chart.
tweets = df_features.groupby('label')['char_count'].mean().round(2).to_frame(name="mean")

fig = px.bar(tweets, y="mean", text="mean", labels=dict(label="Extremist groups", count=""))
fig.update_layout(
    title="Average number of characters per extremist group",
    title_x=0.5,
    showlegend=False,
    plot_bgcolor='rgba(0, 0, 0, 0)',
    paper_bgcolor='rgba(0, 0, 0, 0)',
    yaxis_title="Number of Characters",
)
fig.update_traces(marker_color='#1f77b4')

fig.show()


In [None]:
## Total word count per group (stopwords included)

In [None]:
# Word count of cleaned_text_punc (presumably the text before stopword removal; verify).
# str.split() with no argument ignores repeated spaces and returns [] for an empty string,
# matching how word_count is tokenized; split(" ") counted empty strings as words.
df_features['word_count_stop'] = df_features['cleaned_text_punc'].apply(lambda x: len(str(x).split()))

tweets = df_features.groupby(['label'])['word_count_stop'].sum().to_frame(name="sum")

fig = px.bar(tweets, y="sum", text="sum", labels=dict(label="Extremist groups", count=""))
fig.update_layout(
    title="Number of words per extremist group",
    title_x=0.5,
    showlegend=False,
    plot_bgcolor='rgba(0, 0, 0, 0)',
    paper_bgcolor='rgba(0, 0, 0, 0)',
    yaxis_title="Number of Words",
)
fig.update_traces(marker_color='#1f77b4')

fig.show()


In [None]:
## Total word count per group

In [None]:
# Sum of word_count over all tweets of each label.
df_features.groupby('label')['word_count'].sum()

In [None]:
## Number of uppercase words compared to total words

In [None]:
# Sum of upper_count per label, shown as a bar chart.
tweets = df_features.groupby('label')['upper_count'].sum().to_frame(name="sum")

fig = px.bar(tweets, y="sum", text="sum", labels=dict(label="Extremist groups", count=""))
fig.update_layout(
    title="Number of uppercase words per extremist group",
    title_x=0.5,
    showlegend=False,
    plot_bgcolor='rgba(0, 0, 0, 0)',
    paper_bgcolor='rgba(0, 0, 0, 0)',
    yaxis_title="Number of Uppercase Words",
)
fig.update_traces(marker_color='#1f77b4')

fig.show()


In [None]:
# Share of upper-case words among all words, per label.
label_totals = df_features.groupby('label')[['upper_count', 'word_count']].sum()
ratio_upper = (label_totals['upper_count'] / label_totals['word_count']).reset_index(name='count')
ratio_upper

In [None]:
## Total number of stopwords per group

In [None]:
# stopword_count is the difference between word_count_stop and word_count for each tweet.
df_features['stopword_count'] = df_features['word_count_stop'].sub(df_features['word_count'])
tweets = df_features.groupby('label')['stopword_count'].sum().to_frame(name="sum")

fig = px.bar(tweets, y="sum", text="sum", labels=dict(label="Extremist groups", count=""))
fig.update_layout(
    title="Number of stopwords per extremist group",
    title_x=0.5,
    showlegend=False,
    plot_bgcolor='rgba(0, 0, 0, 0)',
    paper_bgcolor='rgba(0, 0, 0, 0)',
    yaxis_title="Number of Stopwords",
)
fig.update_traces(marker_color='#1f77b4')

fig.show()


In [None]:
## Top 20 Words in data

In [None]:
# Flatten the per-tweet token lists into one list holding every word of cleaned_text.
new = df_features['cleaned_text'].str.split().values.tolist()
corpus = []
for tweet_words in new:
    corpus.extend(tweet_words)

In [None]:
stop = nltk.corpus.stopwords.words("english")
stop_set = set(stop)  # set membership is O(1) per lookup

counter = Counter(corpus)
most = counter.most_common()

# Drop stopwords BEFORE taking the top 20; filtering after slicing left fewer than 20 bars.
top_words = [(word, count) for word, count in most if word not in stop_set][:20]

x, y = [], []
for word, count in top_words:
    x.append(word)
    y.append(count)

sns.barplot(x=y, y=x)

In [None]:
## Wordcloud

In [None]:
stopwords = set(STOPWORDS)

# Lower-case every token and build the text with a single join; repeated `+=`
# on a growing string is quadratic in the size of the corpus.
comment_words = "".join(
    " ".join(token.lower() for token in str(val).split()) + " "
    for val in df_features['cleaned_text']
)

# Named `cloud` so the imported `wordcloud` module is not shadowed.
# max_words=100 makes the image match the "Top 100" title (the library default is 200).
cloud = WordCloud(width=800, height=800,
                  background_color='white',
                  stopwords=stopwords,
                  max_words=100,
                  min_font_size=10).generate(comment_words)

# plot the WordCloud image
plt.figure(figsize=(8, 8), facecolor=None)
plt.imshow(cloud)
plt.axis("off")
# Set the title before tight_layout so the layout accounts for it.
plt.title('Top 100 Most Common Words', fontsize=50)
plt.tight_layout(pad=0)
plt.show()

In [None]:
## Distribution of words per group

In [None]:
# `label` was mapped from 0/1/2 to 'LWE'/'NE'/'RWE' in the EDA section, so select by name;
# comparing against the old numeric code yields an empty frame.
df_lwe = df_features[df_features['label'] == 'LWE'].copy()

In [None]:
# `label` was mapped from 0/1/2 to 'LWE'/'NE'/'RWE' in the EDA section, so select by name;
# comparing against the old numeric code yields an empty frame.
df_ne = df_features[df_features['label'] == 'NE'].copy()

In [None]:
# `label` was mapped from 0/1/2 to 'LWE'/'NE'/'RWE' in the EDA section, so select by name;
# comparing against the old numeric code yields an empty frame.
df_rwe = df_features[df_features['label'] == 'RWE'].copy()

## Unigram Distribution

In [None]:
# unigrams distribution
def get_top_n_words(corpus, n=None):
    """Return the ``n`` most frequent unigrams of ``corpus``.

    Args:
        corpus: iterable of documents handed to the vectorizer.
        n: number of entries to keep; ``None`` keeps all of them.

    Returns:
        List of ``(term, total_count)`` pairs sorted by count, highest first.
        The vectorizer is built with ``stop_words='english'``.
    """
    # NOTE(review): CountVectorizer is not imported in the visible cells; presumably
    # sklearn.feature_extraction.text.CountVectorizer. Confirm it is in scope.
    vectorizer = CountVectorizer(stop_words='english').fit(corpus)
    # Column sums of the document-term matrix give the corpus-wide count per term.
    term_totals = vectorizer.transform(corpus).sum(axis=0)
    ranked = sorted(
        ((term, term_totals[0, column]) for term, column in vectorizer.vocabulary_.items()),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return ranked[:n]


#### 1. LWE

In [None]:
# Fresh copy of the full feature frame, bound to df_lwe.
df_lwe = df_features.copy()


In [None]:
# `label` was mapped from 0/1/2 to 'LWE'/'NE'/'RWE' in the EDA section, so select by name;
# comparing against the old numeric code yields an empty frame.
df_lwe = df_lwe[df_lwe['label'] == 'LWE']

In [None]:
# Horizontal bar chart of the 25 most frequent unigrams in the LWE tweets.
# NOTE(review): .iplot presumably comes from cufflinks, which is not imported in the visible cells; confirm.
common_words = get_top_n_words(df_lwe['cleaned_text'], 25)

df2 = pd.DataFrame(common_words, columns=['cleaned_text', 'count'])
(
    df2.groupby('cleaned_text')['count']
    .sum()
    .sort_values()
    .iplot(kind='bar', xTitle='Count', yTitle='Words', linecolor='black',
           title='Top 25 unigrams for LWE', orientation='h')
)

#### 2. NE

In [None]:
# `label` was mapped from 0/1/2 to 'LWE'/'NE'/'RWE' in the EDA section, so select by name;
# comparing against the old numeric code yields an empty frame.
df_ne = df_features[df_features['label'] == 'NE'].copy()

In [None]:
# Horizontal bar chart of the 25 most frequent unigrams in the NE tweets.
# NOTE(review): .iplot presumably comes from cufflinks, which is not imported in the visible cells; confirm.
common_words = get_top_n_words(df_ne['cleaned_text'], 25)

df3 = pd.DataFrame(common_words, columns=['cleaned_text', 'count'])
(
    df3.groupby('cleaned_text')['count']
    .sum()
    .sort_values()
    .iplot(kind='bar', xTitle='Count', yTitle='Words', linecolor='black',
           title='Top 25 unigrams for NE', orientation='h')
)

#### 3. RWE

In [None]:
# `label` was mapped from 0/1/2 to 'LWE'/'NE'/'RWE' in the EDA section, so select by name;
# comparing against the old numeric code yields an empty frame.
df_rwe = df_features[df_features['label'] == 'RWE'].copy()

In [None]:
# Horizontal bar chart of the 25 most frequent unigrams in the RWE tweets.
# NOTE(review): .iplot presumably comes from cufflinks, which is not imported in the visible cells; confirm.
common_words = get_top_n_words(df_rwe['cleaned_text'], 25)

df4 = pd.DataFrame(common_words, columns=['cleaned_text', 'count'])
(
    df4.groupby('cleaned_text')['count']
    .sum()
    .sort_values()
    .iplot(kind='bar', xTitle='Count', yTitle='Words', linecolor='black',
           title='Top 25 unigrams for RWE', orientation='h')
)


### Bigrams distribution

In [None]:
# bigrams distribution

def get_top_n_bigram(corpus, n=None):
    """Return the ``n`` most frequent bigrams of ``corpus``.

    Args:
        corpus: iterable of documents handed to the vectorizer.
        n: number of entries to keep; ``None`` keeps all of them.

    Returns:
        List of ``(bigram, total_count)`` pairs sorted by count, highest first.
        The vectorizer is built with ``ngram_range=(2, 2)`` and
        ``stop_words='english'``.
    """
    # NOTE(review): CountVectorizer is not imported in the visible cells; presumably
    # sklearn.feature_extraction.text.CountVectorizer. Confirm it is in scope.
    vectorizer = CountVectorizer(ngram_range=(2, 2), stop_words='english').fit(corpus)
    # Column sums of the document-term matrix give the corpus-wide count per bigram.
    term_totals = vectorizer.transform(corpus).sum(axis=0)
    ranked = sorted(
        ((term, term_totals[0, column]) for term, column in vectorizer.vocabulary_.items()),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return ranked[:n]


#### 1. LWE

In [None]:
# Horizontal bar chart of the 25 most frequent bigrams in the LWE tweets.
# NOTE(review): .iplot presumably comes from cufflinks, which is not imported in the visible cells; confirm.
common_words = get_top_n_bigram(df_lwe['cleaned_text'], 25)

df5 = pd.DataFrame(common_words, columns=['cleaned_text', 'count'])
(
    df5.groupby(['cleaned_text'])['count']
    .sum()
    .sort_values()
    .iplot(kind='bar', xTitle='Count', yTitle='Words', linecolor='black',
           title='Top 25 bigrams for LWE', orientation='h')
)


#### 2. NE

In [None]:
# Horizontal bar chart of the 25 most frequent bigrams in the NE tweets.
# NOTE(review): .iplot presumably comes from cufflinks, which is not imported in the visible cells; confirm.
common_words = get_top_n_bigram(df_ne['cleaned_text'], 25)

df6 = pd.DataFrame(common_words, columns=['cleaned_text', 'count'])
(
    df6.groupby(['cleaned_text'])['count']
    .sum()
    .sort_values()
    .iplot(kind='bar', xTitle='Count', yTitle='Words', linecolor='black',
           title='Top 25 bigrams for NE', orientation='h')
)


#### 3. RWE

In [None]:
# RWE
# Horizontal bar chart of the 25 most frequent bigrams in the RWE tweets.
# NOTE(review): .iplot presumably comes from cufflinks, which is not imported in the visible cells; confirm.
common_words = get_top_n_bigram(df_rwe['cleaned_text'], 25)

df7 = pd.DataFrame(common_words, columns=['cleaned_text', 'count'])
(
    df7.groupby(['cleaned_text'])['count']
    .sum()
    .sort_values()
    .iplot(kind='bar', xTitle='Count', yTitle='Words', linecolor='black',
           title='Top 25 bigrams for RWE', orientation='h')
)


### Trigrams distribution

In [None]:
def get_top_n_trigram(corpus, n=None):
    """Return the ``n`` most frequent trigrams of ``corpus``.

    Args:
        corpus: iterable of documents handed to the vectorizer.
        n: number of entries to keep; ``None`` keeps all of them.

    Returns:
        List of ``(trigram, total_count)`` pairs sorted by count, highest first.
        The vectorizer is built with ``ngram_range=(3, 3)``; no ``stop_words``
        argument is passed here.
    """
    # NOTE(review): CountVectorizer is not imported in the visible cells; presumably
    # sklearn.feature_extraction.text.CountVectorizer. Confirm it is in scope.
    vectorizer = CountVectorizer(ngram_range=(3, 3)).fit(corpus)
    # Column sums of the document-term matrix give the corpus-wide count per trigram.
    term_totals = vectorizer.transform(corpus).sum(axis=0)
    ranked = sorted(
        ((term, term_totals[0, column]) for term, column in vectorizer.vocabulary_.items()),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return ranked[:n]


#### 1. LWE

In [None]:
# Horizontal bar chart of the 25 most frequent trigrams in the LWE tweets.
# NOTE(review): .iplot presumably comes from cufflinks, which is not imported in the visible cells; confirm.
common_words = get_top_n_trigram(df_lwe['cleaned_text'], 25)

df8 = pd.DataFrame(common_words, columns=['cleaned_text', 'count'])
(
    df8.groupby('cleaned_text')['count']
    .sum()
    .sort_values()
    .iplot(kind='bar', xTitle='Count', yTitle='Words', linecolor='black',
           title='Top 25 trigrams for LWE', orientation='h')
)


#### 2. NE

In [None]:
# Horizontal bar chart of the 25 most frequent trigrams in the NE tweets.
# NOTE(review): .iplot presumably comes from cufflinks, which is not imported in the visible cells; confirm.
common_words = get_top_n_trigram(df_ne['cleaned_text'], 25)

df8 = pd.DataFrame(common_words, columns=['cleaned_text', 'count'])
(
    df8.groupby('cleaned_text')['count']
    .sum()
    .sort_values()
    .iplot(kind='bar', xTitle='Count', yTitle='Words', linecolor='black',
           title='Top 25 trigrams for NE', orientation='h')
)


#### 3. RWE

In [None]:
# Horizontal bar chart of the 25 most frequent trigrams in the RWE tweets.
# NOTE(review): .iplot presumably comes from cufflinks, which is not imported in the visible cells; confirm.
common_words = get_top_n_trigram(df_rwe['cleaned_text'], 25)

df8 = pd.DataFrame(common_words, columns=['cleaned_text', 'count'])
(
    df8.groupby('cleaned_text')['count']
    .sum()
    .sort_values()
    .iplot(kind='bar', xTitle='Count', yTitle='Words', linecolor='black',
           title='Top 25 trigrams for RWE', orientation='h')
)
