In [1]:
import numpy as np
import nltk
import matplotlib.pyplot as plt
import scipy
import gensim
import pandas as pd
import re
import pyLDAvis
import pyLDAvis.gensim
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

### Topic Modelling for Positive Tweets

In [2]:
positive_tweets = pd.read_csv("ISIS-Positive tweets.csv")

In [43]:
def preprocess(tweet):
    # A number of the tweets start with ENGLISH TRANSLATIONS: so we will remove it 
    tweet = re.sub(r'ENGLISH TRANSLATION:', '', tweet)
    # Also strip the tweets of non-alphabetic characters except #
    tweet = re.sub(r'[^A-Za-z# ]', '', tweet)
    # Remove retweet indicators
    tweet = re.sub(r'^[Rr][Tt]\s@\w+: ', '', tweet)
    tweet = re.sub(r'^[Rr][Tt]\s+', '', tweet)
    # Remove X (Twitter) handles
    tweet = re.sub(r'@\w+', '', tweet) 
    # Remove URLs
    tweet = re.sub(r'http\S+|www\S+|https\S+', '', tweet)
    # Remove 'amp'
    tweet = re.sub(r'amp', '', tweet)
    
    words = tweet.strip().split()
  
    hashtags = [word for word in words if re.match(r'#', word) != None]
    words = [word.lower() for word in words if word not in hashtags]

    for hashtag in hashtags:
        hashtag = re.sub(r'#', '', hashtag)
        words_tag = []
        current_word = ''
        for a in hashtag:
            if a.isupper() and current_word != '':
                words_tag.append(current_word)
                current_word = '' + a.lower()
            else:
                current_word = current_word + a.lower()
        words_tag.append(current_word)
        words.extend(words_tag)

    # Option 1: Remove stopwords and stem words using porter stemmer
    # p_stem = PorterStemmer()
    # words = [p_stem.stem(word.lower()) for word in words if word not in stopwords.words('english')]

    # Option 2: Remove stopwords and lemmatize words
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word.lower()) for word in words if word.lower() not in stopwords.words('english')]
    
    # Remove duplicates
    words = list(set(words))

    #Removing high frequency custom stop words
    words_to_remove = ['al', 'u', 'im', 'pt', 'g', 'k', 'n', 'e', 'b', 'f', 'p', 'abu', 'de', 'la', 'un', 'je', 'il', 'et', '', 'pa', 'c', 'cest', 'le', 'du', 'que', 'sa', 'di', 'tu', 'dans', 'une', 
                      'avec', 'qui', 'en', 'ce', 'va', 'est']
    words = [word for word in words if word not in words_to_remove]

    #Correcting words
    corrections = {
    'isi': 'isis',
    'allh': 'allah',
    'jihd': 'jihad'
}

    # Using list comprehension to replace misspelled words
    words = [corrections.get(word, word) for word in words]

    return words

In [4]:
positive_tweet_wordlist = [preprocess(tweet) for tweet in positive_tweets['tweets']]

In [5]:
dictionary = gensim.corpora.Dictionary(positive_tweet_wordlist)
dictionary
bow = [dictionary.doc2bow(line) for line in positive_tweet_wordlist]

In [6]:
def test_eta(eta, dictionary, corp, txt, ntopics, print_topics=True, print_dist=True):
    np.random.seed(42) # set the random seed for repeatability
    bow = [dictionary.doc2bow(line) for line in corp] # get the bow-format lines with the set dictionary
    with (np.errstate(divide='ignore')):  # ignore divide-by-zero warnings
        model = gensim.models.ldamodel.LdaModel(
            corpus=bow, id2word=dictionary, num_topics=ntopics,
            random_state=42, chunksize=100, eta=eta,
            eval_every=-1, update_every=1,
            passes=150, alpha='auto', per_word_topics=True)
    # visuzlize the model term topics
    print('Perplexity: {:.2f}'.format(model.log_perplexity(bow)))
    if print_topics:
        # display the top terms for each topic
        for topic in range(ntopics):
            print('Topic {}: {}'.format(topic, [dictionary[w] for w,p in model.get_topic_terms(topic, topn=10)]))
    # if print_dist:
        # display the topic probabilities for each document
        # for line,bag in zip(txt,bow):
        #     doc_topics = ['({}, {:.1%})'.format(topic, prob) for topic,prob in model.get_document_topics(bag)]
        #     print('{} {}'.format(line, doc_topics))
    return model

In [7]:
# Function to visualize the LDA model using pyLDAvis
def visualize_lda(model, bow, dictionary, output_file):
    pyLDAvis.enable_notebook()
    vis_data = pyLDAvis.gensim.prepare(model, bow, dictionary)
    pyLDAvis.save_html(vis_data, output_file)
    pyLDAvis.display(vis_data)

#### Unsupervised Topic Modelling

In [8]:
eta = test_eta('auto', dictionary, positive_tweet_wordlist, positive_tweets['tweets'], ntopics=6)
bow = [dictionary.doc2bow(line) for line in positive_tweet_wordlist]
visualize_lda(eta, bow, dictionary, 'positive_tweets_unsupervisedlda.html')

Perplexity: -8.02
Topic 0: ['alhamdulillah', 'isis', 'people', 'allah', 'time', 'blessing', 'back', 'jazak', 'celebrate', 'muhammad']
Topic 1: ['allah', 'love', 'thanks', 'one', 'welcome', 'best', 'interesting', 'thank', 'syria', 'quran']
Topic 2: ['allah', 'may', 'beautiful', 'islamic', 'day', 'state', 'make', 'pleased', 'funny', 'reward']
Topic 3: ['yes', 'get', 'true', 'please', 'happy', 'make', 'map', 'upon', 'nidalgazaui', 'word']
Topic 4: ['amazing', 'back', 'better', 'muslim', 'support', 'sparksofirhabi', 'love', 'also', 'even', 'nice']
Topic 5: ['good', 'great', 'see', 'allah', 'video', 'like', 'aleppo', 'unclesamcoco', 'sheikh', 'victory']


#### Semi-Supervised Topic Modelling

In [9]:
def create_eta(priors, etadict, ntopics):
    eta = np.full(shape=(ntopics, len(etadict)), fill_value=1) # create a (ntopics, nterms) matrix and fill with 1
    for word, topic in priors.items(): # for each word in the list of priors
        keyindex = [index for index,term in etadict.items() if term==word] # look up the word in the dictionary
        if (len(keyindex)>0): # if it's in the dictionary
            eta[topic,keyindex[0]] = 1e7  # put a large number in there
    eta = np.divide(eta, eta.sum(axis=0)) # normalize so that the probabilities sum to 1 over all topics
    return eta

In [15]:
apriori_original = {
    'love': 0, 'good': 0, 'great': 0, 'best': 0, 'amazing': 0,
    'victory': 1, 'alhamdulillah': 1, 'interesting': 1, 'support': 1, 'one': 1,
    'help': 2, 'beautiful': 2, 'better': 2, 'thanks': 2, 'day': 2,
    'back': 3, 'new': 3, 'video': 3, 'iraq': 3, 'state': 3
}
eta = create_eta(apriori_original, dictionary, 5)
eta = test_eta(eta, dictionary, positive_tweet_wordlist, positive_tweets['tweets'], ntopics=5)
visualize_lda(eta, bow, dictionary, 'positive_tweets_supervisedlda.html')

Perplexity: 6.71
Topic 0: ['allah', 'people', 'great', 'yes', 'time', 'love', 'alhamdulillah', 'please', 'may', 'sparksofirhabi']
Topic 1: ['good', 'one', 'thanks', 'welcome', 'syria', 'better', 'think', 'indeed', 'cool', 'keep']
Topic 2: ['allah', 'may', 'beautiful', 'day', 'see', 'islamic', 'state', 'make', 'victory', 'pleased']
Topic 3: ['good', 'best', 'isis', 'video', 'thank', 'week', 'iraq', 'came', 'great', 'rts']
Topic 4: ['love', 'back', 'amazing', 'muslim', 'allah', 'support', 'akhi', 'new', 'interesting', 'even']


In [16]:
negative_tweets = pd.read_csv("ISIS-Negative tweets.csv")

### Topic Modelling for Negative Tweets

In [17]:
negative_tweet_wordlist = [preprocess(tweet) for tweet in negative_tweets['tweets']]

In [14]:
dictionary = gensim.corpora.Dictionary(negative_tweet_wordlist)
dictionary
bow = [dictionary.doc2bow(line) for line in negative_tweet_wordlist]

#### Unsupervised LDA

In [16]:
eta = test_eta('auto', dictionary, negative_tweet_wordlist, negative_tweets['tweets'], ntopics=4)
visualize_lda(eta, bow, dictionary, 'negative_tweets_unsupervisedlda.html')

Perplexity: -8.41
Topic 0: ['killed', 'syria', 'army', 'iraqi', 'soldier', 'amaq', 'agency', 'force', 'near', 'breaking']
Topic 1: ['rebel', 'village', 'killing', 'warreporter', 'police', 'terrorist', 'apostate', 'ypg', 'coalition', 'body']
Topic 2: ['people', 'scotsmaninfidel', 'elevn', 'texanna', 'bombing', 'sassysassyred', 'spicylatte', 'death', 'like', 'allah']
Topic 3: ['islamic', 'state', 'today', 'attack', 'aleppo', 'isis', 'airstrikes', 'area', 'militant', 'shiite']


#### Supervised LDA

In [47]:
apriori_original = {
    'killed': 0, 'syria': 0, 'army': 0, 'bombing': 0, 'fighters': 0,
    'allah': 1, 'jihad': 1, 'faith': 1, 'martyr': 1, 'sharia': 1,
    'us': 2, 'west': 2, 'aleppo': 2, 'iraqi': 2, 'policy': 2
}
eta = create_eta(apriori_original, dictionary, 3)
eta = test_eta(eta, dictionary, negative_tweet_wordlist, negative_tweets['tweets'], ntopics=3)
visualize_lda(eta, bow, dictionary, 'negative_tweets_supervisedlda.html')

Perplexity: -2.09
Topic 0: ['people', 'muslim', 'warreporter', 'bombing', 'allah', 'terrorist', 'dont', 'layer', 'take', 'know']
Topic 1: ['scotsmaninfidel', 'texanna', 'elevn', 'spicylatte', 'sassysassyred', 'like', 'pig', 'kafirkaty', 'saudi', 'year']
Topic 2: ['killed', 'syria', 'army', 'iraqi', 'soldier', 'amaq', 'agency', 'islamic', 'state', 'today']


### Topic Modelling for Neutral Tweets

In [24]:
neutral_tweets = pd.read_csv("ISIS-Neutral tweets.csv")

In [44]:
neutral_tweet_wordlist = [preprocess(tweet) for tweet in neutral_tweets['tweets']]

In [45]:
dictionary = gensim.corpora.Dictionary(neutral_tweet_wordlist)
dictionary
bow = [dictionary.doc2bow(line) for line in neutral_tweet_wordlist]

#### Unsupervised LDA

In [41]:
eta = test_eta('auto', dictionary, neutral_tweet_wordlist, neutral_tweets['tweets'], ntopics=4)
visualize_lda(eta, bow, dictionary, 'neutral_tweets_unsupervisedlda.html')

Perplexity: -9.35
Topic 0: ['allah', 'homs', 'muslim', 'mosul', 'sparksofirhabi', 'scotsmaninfidel', 'time', 'support', 'spicylatte', 'advance']
Topic 1: ['wilayat', 'isis', 'northern', 'new', 'say', 'warreporter', 'libya', 'regime', 'palmyra', 'people']
Topic 2: ['syria', 'agency', 'amaq', 'breaking', 'army', 'aleppo', 'force', 'fighter', 'near', 'iraq']
Topic 3: ['islamic', 'state', 'city', 'control', 'area', 'village', 'day', 'fight', 'group', 'one']


#### Supervised LDA

In [46]:
apriori_original = {
    'islamic': 0, 'city': 0, 'video': 0, 'near': 0, 'attack': 0,
    'city': 1, 'new': 1, 'ramiallolah': 1, 'one': 1,
    'fighters': 2, 'rebels': 2, 'forces': 2, 'attack': 2 
}
eta = create_eta(apriori_original, dictionary, 3)
eta = test_eta(eta, dictionary, neutral_tweet_wordlist, neutral_tweets['tweets'], ntopics=3)
visualize_lda(eta, bow, dictionary, 'neutral_tweets_supervisedlda.html')

Perplexity: -5.26
Topic 0: ['islamic', 'state', 'syria', 'wilayat', 'agency', 'amaq', 'breaking', 'army', 'aleppo', 'force']
Topic 1: ['isis', 'ramiallolah', 'new', 'say', 'muslim', 'warreporter', 'fight', 'one', 'sparksofirhabi', 'people']
Topic 2: ['allah', 'scotsmaninfidel', 'spicylatte', 'elevn', 'sassysassyred', 'texanna', 'may', 'kafirkaty', 'back', 'peigneacheveux']
