# Capstone Project: Social media sentiment analysis 
## Part 4: Topic Modeling

In [5]:
# Import libraries
import requests
import json
import pandas as pd
import numpy as np
import time
import random
import re
import csv

from sklearn.feature_extraction import stop_words
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import pandas.util.testing as tm
import spacy

# Gensim libraries
from gensim import corpora
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from sklearn.model_selection import GridSearchCV
from gensim.models.ldamulticore import LdaMulticore
from gensim.models.wrappers import LdaMallet
import pyLDAvis.gensim
from gensim.models import CoherenceModel, Word2Vec, LsiModel, KeyedVectors, fasttext,LdaModel
from gensim import matutils
import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()

# Enable logging for gensim - optional but important
# The root logger is configured at ERROR level, so only ERROR/CRITICAL records are
# emitted; lower the level (e.g. logging.INFO) to see training progress messages.
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline



# Preprocess Twitter comments for topic modeling

In [6]:
# Load the CSV files produced in Part 2.
# tweet_combined_clean holds the tweets for all brands; the other two frames are
# (judging by the file names) the negative and positive Samsung tweets.
tweet_combined_clean = pd.read_csv('./dataset/tweet_combined_clean_v1.csv')
samsung_tweet_neg = pd.read_csv('./dataset/samsung_tweet_neg.csv')
samsung_tweet_pos = pd.read_csv('./dataset/samsung_tweet_pos.csv')

In [7]:
# Unlike VADER, topic modeling requires extensive data cleaning. On top of NLTK's
# English stop words we also drop emotional words such as 'like' and 'love', the
# brand names 'samsung', 'apple' and 'huawei', URL fragments, single letters and
# Twitter handles that showed up in the clusters.

lemm = WordNetLemmatizer()

# Built once here instead of on every call: text_processer runs once per tweet, and
# rebuilding the set each time is wasted work. A dedicated name also avoids shadowing
# the `stop_words` module imported from sklearn at the top of the notebook.
TOPIC_STOP_WORDS = frozenset(stopwords.words("english")).union(
                      ['samsung','huawei','apple','http','like','really','want','good','say','love','better','much',
                       'day','lol','well','need','could','take','twt','add','maxwinebach','nhlblackhawk','smtshepossa',
                       'niantichelp','recognised','view','great','doh','samsungmobile','okay','check','feel','always',
                       'yes','stop','even','every','already','u','something','go','see','sure','shit','said','https','com','www','hi','please','co','thanks','one',
                       'think','got','also', 'make','know','use', 'would','get','look','never','still','mtshepossa','akinjoshua',
                       'pay','using','time','b','c','d','e','f','g','h','i','ksvaljek', 'right', 'used','godissfroot','nhlblackhawks','http',
                       'j','k','l','n','m','o','p','q','r','s','t','u','v','w','x','y','z', 'sorry', 'part', 'u', 'let','as', 'saying', 'bit', 
                       'update', 'techquotesdaily','oh', 'yeah','frecowang', 'bts', 'pak','ok','fuck','come','thing','south','settle','level', 
                       'took','actually','stand','im','watch', 'jezdez','offby','dirtytesla','universeice','as', 'thank', 'thanks', 'seem', 'seems',
                        'way','put','made','thought', 'jack'])

def text_processer(text):
    """Clean one tweet and return it as a space-separated string of lemmas.

    Steps: strip HTML markup, replace every non-letter character with a space,
    lower-case and split into words, drop the words in TOPIC_STOP_WORDS, then
    lemmatize what is left with WordNet.

    Parameters
    ----------
    text : str
        Raw tweet text. Non-string values (for example the NaN that
        pandas.read_csv produces for an empty cell) are treated as empty text.

    Returns
    -------
    str
        The cleaned words joined by single spaces; "" when nothing survives.
    """
    # Missing values would otherwise make BeautifulSoup raise a TypeError.
    if not isinstance(text, str):
        return ""

    # 1. Remove HTML. The parser is named explicitly so the result does not depend
    #    on which optional parsers are installed and no "guessed parser" warning is emitted.
    review_text = BeautifulSoup(text, "html.parser").get_text()

    # 2. Remove non-letters.
    letters_only = re.sub("[^a-zA-Z]", " ", review_text)

    # 3. Convert to lower case, split into individual words.
    words = letters_only.lower().split()

    # 4. Remove stop words (applied before lemmatization, so the list must contain
    #    the inflected forms that should be dropped, e.g. both 'thank' and 'thanks').
    filtered_words = [w for w in words if w not in TOPIC_STOP_WORDS]

    # 5. Lemmatize words.
    lemmed_words = [lemm.lemmatize(i) for i in filtered_words]

    return " ".join(lemmed_words)

## Gibbs Sampling Dirichlet Mixture Model 

The Gibbs Sampling Dirichlet Mixture Model (GSDMM) is an “altered” LDA algorithm that shows great results on short text topic modeling (STTM) tasks. It makes the initial assumption that each document belongs to exactly one topic (1 topic ↔ 1 document): the words within a document are all generated from the same single topic, not from a mixture of topics as in the original LDA.

Reference: https://towardsdatascience.com/short-text-topic-modeling-70e50a57c883

In [9]:
from gsdmm.gsdmm import MovieGroupProcess
from collections import Counter

### Samsung

In [22]:
# Select the rows whose brand code is 0 (the Samsung tweets, per this section's heading),
# clean each tweet's text and split it into a list of tokens: one token list per tweet.
tweet_samsung_clean = tweet_combined_clean.loc[tweet_combined_clean['brand'] == 0]
tweet_samsung_words = [
    cleaned_text.split()
    for cleaned_text in tweet_samsung_clean['text'].apply(text_processer)
]

In [29]:
# Init of the Gibbs Sampling Dirichlet Mixture Model algorithm
# K = upper bound on the number of topics (which we don't know a priori). In a real case we are not
#     aware of the exact number of topics, so we want to choose a higher value. Theoretically, GSDMM
#     should empty useless clusters and eventually find the exact number of clusters.
# alpha & beta: kept at 0.1 here. One might want to tune them to improve the topic allocation
#     regarding the completeness and homogeneity of the clusters.
# n_iters = number of iterations
# NOTE(review): K=2 caps the model at two clusters, which contradicts the advice above and the
#     eight-topic list written below this cell - confirm the intended K before re-running.

# The fit is stochastic; seed NumPy's global RNG so a re-run reproduces the same clusters.
# (assumes gsdmm samples through numpy.random - TODO confirm against the installed version)
np.random.seed(42)
mgp = MovieGroupProcess(K=2, alpha=0.1, beta=0.1, n_iters=50)

# Vocabulary size = number of distinct tokens across all Samsung tweets.
vocab = set(x for doc in tweet_samsung_words for x in doc)
n_terms = len(vocab)
y = mgp.fit(tweet_samsung_words, n_terms)

doc_count = np.array(mgp.cluster_doc_count)
print('Number of documents per topics :', doc_count)
print('*'*20)

# Topics sorted by document inside (at most the ten largest, biggest first)
top_index = doc_count.argsort()[-10:][::-1]
print('Most important clusters (by number of docs inside):', top_index)
print('*'*20)

# Collect the 15 most frequent words of every cluster.
# The per-cluster result gets its own name: the previous code reassigned `top_words`
# inside the loop and then appended the list to itself, losing the accumulator.
top_words = []

for word_distribution in mgp.cluster_word_distribution:
    cluster_top_words = Counter(word_distribution).most_common(15)
    top_words.append(cluster_top_words)
    print(cluster_top_words)

In stage 0: transferred 961 clusters with 2 clusters populated
In stage 1: transferred 537 clusters with 2 clusters populated
In stage 2: transferred 394 clusters with 2 clusters populated
In stage 3: transferred 352 clusters with 2 clusters populated
In stage 4: transferred 308 clusters with 2 clusters populated
In stage 5: transferred 287 clusters with 2 clusters populated
In stage 6: transferred 243 clusters with 2 clusters populated
In stage 7: transferred 245 clusters with 2 clusters populated
In stage 8: transferred 248 clusters with 2 clusters populated
In stage 9: transferred 253 clusters with 2 clusters populated
In stage 10: transferred 246 clusters with 2 clusters populated
In stage 11: transferred 275 clusters with 2 clusters populated
In stage 12: transferred 263 clusters with 2 clusters populated
In stage 13: transferred 236 clusters with 2 clusters populated
In stage 14: transferred 251 clusters with 2 clusters populated
In stage 15: transferred 255 clusters with 2 clust

This list was generated manually, and I assigned the topics myself. However, I could not tell many of the word clusters apart and ended up assigning similar topics to different clusters.
1) phones
2) samsung phone product
3) samsung phone
4) samsung tv
5) samsung phone
6) samsung advertisements
7) samsung phone
8) phone

### Apple

In [30]:
# Select the rows whose brand code is 1 (the Apple tweets, per this section's heading),
# clean each tweet and split it into a list of tokens.
tweet_apple_clean=tweet_combined_clean[tweet_combined_clean['brand']==1]
tweet_apple_words=tweet_apple_clean['text'].apply(text_processer)
tweet_apple_words=[d.split() for d in tweet_apple_words]

# The fit is stochastic; seed NumPy's global RNG so a re-run reproduces the same clusters.
# (assumes gsdmm samples through numpy.random - TODO confirm against the installed version)
np.random.seed(42)
mgp = MovieGroupProcess(K=2, alpha=0.1, beta=0.1, n_iters=50)
# Vocabulary size = number of distinct tokens across all Apple tweets.
vocab = set(x for doc in tweet_apple_words for x in doc)
n_terms = len(vocab)
y = mgp.fit(tweet_apple_words, n_terms)

doc_count = np.array(mgp.cluster_doc_count)
print('Number of documents per topics :', doc_count)
print('*'*20)

# Topics sorted by document inside (at most the ten largest, biggest first)
top_index = doc_count.argsort()[-10:][::-1]
print('Most important clusters (by number of docs inside):', top_index)
print('*'*20)

# Collect the 15 most frequent words of every cluster.
# The per-cluster result gets its own name: the previous code reassigned `top_words`
# inside the loop and then appended the list to itself, losing the accumulator.
top_words = []

for word_distribution in mgp.cluster_word_distribution:
    cluster_top_words = Counter(word_distribution).most_common(15)
    top_words.append(cluster_top_words)
    print(cluster_top_words)


In stage 0: transferred 852 clusters with 2 clusters populated
In stage 1: transferred 419 clusters with 2 clusters populated
In stage 2: transferred 343 clusters with 2 clusters populated
In stage 3: transferred 320 clusters with 2 clusters populated
In stage 4: transferred 323 clusters with 2 clusters populated
In stage 5: transferred 310 clusters with 2 clusters populated
In stage 6: transferred 281 clusters with 2 clusters populated
In stage 7: transferred 282 clusters with 2 clusters populated
In stage 8: transferred 286 clusters with 2 clusters populated
In stage 9: transferred 275 clusters with 2 clusters populated
In stage 10: transferred 278 clusters with 2 clusters populated
In stage 11: transferred 253 clusters with 2 clusters populated
In stage 12: transferred 252 clusters with 2 clusters populated
In stage 13: transferred 283 clusters with 2 clusters populated
In stage 14: transferred 261 clusters with 2 clusters populated
In stage 15: transferred 264 clusters with 2 clust

This list was generated manually. I assigned the topics myself, but for some clusters I could not come up with a feasible topic (marked as nil).
1) ipad
2) nil
3) iphone
4) Apple store
5) Music
6) Software
7) app
8) nil

### Huawei

In [31]:
# Select the rows whose brand code is 2 (the Huawei tweets, per this section's heading),
# clean each tweet and split it into a list of tokens.
tweet_huawei_clean=tweet_combined_clean[tweet_combined_clean['brand']==2]
tweet_huawei_words=tweet_huawei_clean['text'].apply(text_processer)
tweet_huawei_words=[d.split() for d in tweet_huawei_words]

# The fit is stochastic; seed NumPy's global RNG so a re-run reproduces the same clusters.
# (assumes gsdmm samples through numpy.random - TODO confirm against the installed version)
np.random.seed(42)
mgp = MovieGroupProcess(K=2, alpha=0.1, beta=0.1, n_iters=50)
# Vocabulary size = number of distinct tokens across all Huawei tweets.
vocab = set(x for doc in tweet_huawei_words for x in doc)
n_terms = len(vocab)
y = mgp.fit(tweet_huawei_words, n_terms)

doc_count = np.array(mgp.cluster_doc_count)
print('Number of documents per topics :', doc_count)
print('*'*20)

# Topics sorted by document inside (at most the ten largest, biggest first)
top_index = doc_count.argsort()[-10:][::-1]
print('Most important clusters (by number of docs inside):', top_index)
print('*'*20)

# Collect the 15 most frequent words of every cluster.
# The per-cluster result gets its own name: the previous code reassigned `top_words`
# inside the loop and then appended the list to itself, losing the accumulator.
top_words = []

for word_distribution in mgp.cluster_word_distribution:
    cluster_top_words = Counter(word_distribution).most_common(15)
    top_words.append(cluster_top_words)
    print(cluster_top_words)

In stage 0: transferred 936 clusters with 2 clusters populated
In stage 1: transferred 452 clusters with 2 clusters populated
In stage 2: transferred 316 clusters with 2 clusters populated
In stage 3: transferred 297 clusters with 2 clusters populated
In stage 4: transferred 326 clusters with 2 clusters populated
In stage 5: transferred 292 clusters with 2 clusters populated
In stage 6: transferred 250 clusters with 2 clusters populated
In stage 7: transferred 238 clusters with 2 clusters populated
In stage 8: transferred 256 clusters with 2 clusters populated
In stage 9: transferred 215 clusters with 2 clusters populated
In stage 10: transferred 229 clusters with 2 clusters populated
In stage 11: transferred 247 clusters with 2 clusters populated
In stage 12: transferred 248 clusters with 2 clusters populated
In stage 13: transferred 209 clusters with 2 clusters populated
In stage 14: transferred 217 clusters with 2 clusters populated
In stage 15: transferred 211 clusters with 2 clust

This list was generated manually. I assigned the topics myself, but for some clusters I could not come up with a feasible topic (marked as nil).
1) China communist party
2) trade war
3) China on global news
4) nil
5) nil
6) Huawei product
7) Covid-19 virus
8) phone