# Capstone Project: Social media sentiment analysis

## Part 3c: Topic Modeling using Biterm

In [1]:
# Download biterm model from following github
#!pip install git+git://github.com/markoarnauto/biterm.git

In [8]:
import pandas as pd
import numpy as np
import re

from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction import stop_words

from biterm.btm import oBTM
from biterm.utility import vec_to_biterms, topic_summuary

In [22]:
# Call data from Part 2
tweet_combined_clean=pd.read_csv('./dataset/tweet_combined_clean_v1.csv')

In [5]:
# Unlike VADER, topic modeling requires extensive data cleaning - Edit stopwords to exclude emotional words like 'like','love' and brand names 'Samsung', 'Apple', 'Huawei'

lemm = WordNetLemmatizer()

def text_processer(text):
    
    # 1. Remove HTML.
    review_text = BeautifulSoup(text).get_text()
    
    # 2. Remove non-letters.
    letters_only = re.sub("[^a-zA-Z]", " ", review_text)
    
    # 3. Convert to lower case, split into individual words.
    words = letters_only.lower().split()
        
    # 4. Remove stop words
    stop_words = set(stopwords.words("english"))
    stop_words.update(['samsung','huawei','apple','http','like','really','want','good','say','love','better','much',
                       'day','lol','well','need','could','take','twt','add','maxwinebach','nhlblackhawk','smtshepossa',
                       'niantichelp','recognised','view','great','doh','samsungmobile','okay','check','feel','always',
                       'yes','stop','even','every','already','u','something','go','see','sure','shit','said','https','com','www','hi','please','co','thanks','one',
                       'think','got','also', 'make','know','use', 'would','get','look','never','still','mtshepossa','akinjoshua',
                       'pay','using','time','b','c','d','e','f','g','h','i','ksvaljek', 'right', 'used','godissfroot','nhlblackhawks','http',
                       'j','k','l','n','m','o','p','q','r','s','t','u','v','w','x','y','z', 'sorry', 'part', 'u', 'let','as', 'saying', 'bit', 
                       'update', 'techquotesdaily','oh', 'yeah','frecowang', 'bts', 'pak','ok','fuck','come','thing','south','settle','level', 
                       'took','actually','stand','im','watch', 'jezdez','offby','dirtytesla','universeice','as', 'thank', 'thanks', 'seem', 'seems',
                        'way','put','made','thought', 'jack'])
    filtered_words=[w for w in words if not w in stop_words]

    # 5. Lemmatize words.
    lemmed_words = [lemm.lemmatize(i) for i in filtered_words]
    
    return (" ".join(lemmed_words))

In [6]:
# Initialise both types of vectorizer for comparison
cv = CountVectorizer(analyzer='word',       
                             min_df=10,
                            token_pattern='[a-zA-Z0-9]{3,}') #set number of characters to be more than 3 per word

## Samsung

In [32]:
tweet_samsung_clean=tweet_combined_clean[tweet_combined_clean['brand']==0]
tweet_samsung_words=tweet_samsung_clean['text'].apply(text_processer)
samsung_cv=cv.fit_transform(tweet_samsung_words).toarray()

In [33]:
vocab = np.array(cv.get_feature_names())
biterms = vec_to_biterms(samsung_cv)

btm = oBTM(num_topics=2, V=vocab)
for i in range(0, len(biterms), 100): # prozess chunk of 200 texts
    biterms_chunk = biterms[i:i + 100]
    btm.fit(biterms_chunk, iterations=50)
    topics = btm.transform(biterms)
    
print("\n\n Topic coherence ..")
topic_summuary(btm.phi_wz.T, samsung_cv, vocab, 15)

100%|██████████| 50/50 [00:00<00:00, 50.04it/s]
100%|██████████| 50/50 [00:01<00:00, 35.78it/s]
100%|██████████| 50/50 [00:00<00:00, 53.45it/s]
100%|██████████| 50/50 [00:01<00:00, 42.31it/s]
100%|██████████| 50/50 [00:01<00:00, 43.53it/s]
100%|██████████| 50/50 [00:01<00:00, 42.74it/s]
100%|██████████| 50/50 [00:01<00:00, 30.82it/s]
100%|██████████| 50/50 [00:01<00:00, 40.01it/s]
100%|██████████| 50/50 [00:00<00:00, 55.67it/s]
100%|██████████| 50/50 [00:01<00:00, 46.78it/s]
100%|██████████| 50/50 [00:01<00:00, 46.10it/s]
100%|██████████| 50/50 [00:00<00:00, 50.95it/s]
100%|██████████| 50/50 [00:01<00:00, 42.69it/s]
100%|██████████| 50/50 [00:00<00:00, 55.09it/s]
100%|██████████| 50/50 [00:00<00:00, 52.48it/s]
100%|██████████| 50/50 [00:01<00:00, 39.70it/s]
100%|██████████| 50/50 [00:01<00:00, 29.46it/s]
100%|██████████| 50/50 [00:01<00:00, 41.13it/s]
100%|██████████| 50/50 [00:01<00:00, 33.42it/s]
100%|██████████| 50/50 [00:01<00:00, 40.04it/s]
100%|██████████| 50/50 [00:01<00:00, 42.



 Topic coherence ..
Topic 0 | Coherence=-260.11 | Top words= samsungmobileus samsunguk huaweiuk huaweimobileuk zoneoftech prob zareldo pro phone galaxy brand samsungus shot channel iphone
Topic 1 | Coherence=-365.49 | Top words= phone galaxy iphone note android app new oneplus plus last camera smart mobile screen year
Average topic coherence for the top words is -312.80393944350476


{'coherence': [-260.11309567426633, -365.49478321274324],
 'top_words': [array(['samsungmobileus', 'samsunguk', 'huaweiuk', 'huaweimobileuk',
         'zoneoftech', 'prob', 'zareldo', 'pro', 'phone', 'galaxy', 'brand',
         'samsungus', 'shot', 'channel', 'iphone'], dtype='<U15'),
  array(['phone', 'galaxy', 'iphone', 'note', 'android', 'app', 'new',
         'oneplus', 'plus', 'last', 'camera', 'smart', 'mobile', 'screen',
         'year'], dtype='<U15')]}

In [34]:
count_0=0
for i in range(len(tweet_samsung_words)):
    if topics[i].argmax()==0:
        count_0+=1
print('Number of tweets under Topic 0: ', count_0)
count_1=0
for i in range(len(tweet_samsung_words)):
    if topics[i].argmax()==1:
        count_1+=1
print('Number of tweets under Topic 1: ', count_1)

Number of tweets under Topic 0:  1365
Number of tweets under Topic 1:  1073


## Apple

In [35]:
tweet_apple_clean=tweet_combined_clean[tweet_combined_clean['brand']==1]
tweet_apple_words=tweet_apple_clean['text'].apply(text_processer)
apple_vectorized=cv.fit_transform(tweet_apple_words).toarray()

In [36]:
vocab = np.array(cv.get_feature_names())
biterms = vec_to_biterms(apple_vectorized)

btm = oBTM(num_topics=2, V=vocab)
for i in range(0, len(biterms), 100): # prozess chunk of 200 texts
    biterms_chunk = biterms[i:i + 100]
    btm.fit(biterms_chunk, iterations=50)
    topics = btm.transform(biterms)
    
print("\n\n Topic coherence ..")
topic_summuary(btm.phi_wz.T, apple_vectorized, vocab, 15)

100%|██████████| 50/50 [00:00<00:00, 85.12it/s]
100%|██████████| 50/50 [00:00<00:00, 57.71it/s]
100%|██████████| 50/50 [00:00<00:00, 75.70it/s]
100%|██████████| 50/50 [00:00<00:00, 83.56it/s]
100%|██████████| 50/50 [00:00<00:00, 100.07it/s]
100%|██████████| 50/50 [00:00<00:00, 86.73it/s]
100%|██████████| 50/50 [00:00<00:00, 111.65it/s]
100%|██████████| 50/50 [00:00<00:00, 90.66it/s]
100%|██████████| 50/50 [00:00<00:00, 115.51it/s]
100%|██████████| 50/50 [00:00<00:00, 109.94it/s]
100%|██████████| 50/50 [00:00<00:00, 89.69it/s]
100%|██████████| 50/50 [00:00<00:00, 59.90it/s]
100%|██████████| 50/50 [00:00<00:00, 66.67it/s]
100%|██████████| 50/50 [00:00<00:00, 86.89it/s]
100%|██████████| 50/50 [00:00<00:00, 61.59it/s]
100%|██████████| 50/50 [00:00<00:00, 97.34it/s] 
100%|██████████| 50/50 [00:00<00:00, 116.32it/s]
100%|██████████| 50/50 [00:00<00:00, 121.98it/s]
100%|██████████| 50/50 [00:00<00:00, 113.68it/s]
100%|██████████| 50/50 [00:00<00:00, 156.67it/s]
100%|██████████| 50/50 [00:00<0



 Topic coherence ..
Topic 0 | Coherence=-330.54 | Top words= music microsoft help app download store iamtbotouch give iphone tim google podcast cook pro itunes
Topic 1 | Coherence=-340.13 | Top words= music spotify song app year phone back playlist tidal new video first store last lot
Average topic coherence for the top words is -335.3335452035808


{'coherence': [-330.53707906334904, -340.1300113438126],
 'top_words': [array(['music', 'microsoft', 'help', 'app', 'download', 'store',
         'iamtbotouch', 'give', 'iphone', 'tim', 'google', 'podcast',
         'cook', 'pro', 'itunes'], dtype='<U15'),
  array(['music', 'spotify', 'song', 'app', 'year', 'phone', 'back',
         'playlist', 'tidal', 'new', 'video', 'first', 'store', 'last',
         'lot'], dtype='<U15')]}

In [37]:
count_0=0
for i in range(len(tweet_apple_words)):
    if topics[i].argmax()==0:
        count_0+=1
print('Number of tweets under Topic 0: ', count_0)
count_1=0
for i in range(len(tweet_apple_words)):
    if topics[i].argmax()==1:
        count_1+=1
print('Number of tweets under Topic 1: ', count_1)

Number of tweets under Topic 0:  1726
Number of tweets under Topic 1:  485


## Huawei

In [23]:
tweet_huawei_clean=tweet_combined_clean[tweet_combined_clean['brand']==2]
tweet_huawei_words=tweet_huawei_clean['text'].apply(text_processer)
huawei_vectorized=cv.fit_transform(tweet_huawei_words).toarray()

In [28]:
vocab = np.array(cv.get_feature_names())
biterms = vec_to_biterms(huawei_vectorized)

btm = oBTM(num_topics=2, V=vocab)
for i in range(0, len(biterms), 100): # prozess chunk of 200 texts
    biterms_chunk = biterms[i:i + 100]
    btm.fit(biterms_chunk, iterations=50)
    topics = btm.transform(biterms)
    
print("\n\n Topic coherence ..")
topic_summuary(btm.phi_wz.T, huawei_vectorized, vocab, 15)

100%|██████████| 50/50 [00:01<00:00, 29.95it/s]
100%|██████████| 50/50 [00:01<00:00, 30.49it/s]
100%|██████████| 50/50 [00:02<00:00, 24.44it/s]
100%|██████████| 50/50 [00:01<00:00, 33.53it/s]
100%|██████████| 50/50 [00:02<00:00, 17.89it/s]
100%|██████████| 50/50 [00:05<00:00,  8.60it/s]
100%|██████████| 50/50 [00:01<00:00, 29.68it/s]
100%|██████████| 50/50 [00:01<00:00, 26.63it/s]
100%|██████████| 50/50 [00:01<00:00, 35.38it/s]
100%|██████████| 50/50 [00:01<00:00, 43.87it/s]
100%|██████████| 50/50 [00:02<00:00, 21.71it/s]
100%|██████████| 50/50 [00:02<00:00, 22.33it/s]
100%|██████████| 50/50 [00:01<00:00, 26.52it/s]
100%|██████████| 50/50 [00:01<00:00, 26.68it/s]
100%|██████████| 50/50 [00:01<00:00, 29.69it/s]
100%|██████████| 50/50 [00:02<00:00, 24.16it/s]
100%|██████████| 50/50 [00:01<00:00, 28.55it/s]
100%|██████████| 50/50 [00:01<00:00, 35.33it/s]
100%|██████████| 50/50 [00:02<00:00, 18.83it/s]
100%|██████████| 50/50 [00:01<00:00, 30.00it/s]
100%|██████████| 50/50 [00:01<00:00, 26.



 Topic coherence ..
Topic 0 | Coherence=-262.76 | Top words= china ccp chinese red claw connection dragon expose focus globalnews dmhdpxfuci ttsampaio mask communist chinadaily
Topic 1 | Coherence=-374.29 | Top words= china company chinese canada europe people mask leeszla world pro globalnews phone ccp mike psyberchic
Average topic coherence for the top words is -318.529739420865


{'coherence': [-262.76479176544495, -374.2946870762851],
 'top_words': [array(['china', 'ccp', 'chinese', 'red', 'claw', 'connection', 'dragon',
         'expose', 'focus', 'globalnews', 'dmhdpxfuci', 'ttsampaio', 'mask',
         'communist', 'chinadaily'], dtype='<U15'),
  array(['china', 'company', 'chinese', 'canada', 'europe', 'people',
         'mask', 'leeszla', 'world', 'pro', 'globalnews', 'phone', 'ccp',
         'mike', 'psyberchic'], dtype='<U15')]}

In [29]:
count_0=0
for i in range(len(tweet_huawei_words)):
    if topics[i].argmax()==0:
        count_0+=1
print('Number of tweets under Topic 0: ', count_0)
count_1=0
for i in range(len(tweet_huawei_words)):
    if topics[i].argmax()==1:
        count_1+=1
print('Number of tweets under Topic 1: ', count_1)

Number of tweets under Topic 0:  1198
Number of tweets under Topic 1:  1206
