### Step 5. Topic Modeling of Submissions

#### Import required libraries and read submissions csv into a Pandas dataframe

In [1]:
import pandas as pd
from datasets import Dataset
import nltk, re
from bertopic import BERTopic
from bertopic.representation import MaximalMarginalRelevance
from sentence_transformers import SentenceTransformer


In [2]:
# Read the submissions CSV into a pandas DataFrame.
# NOTE(review): the same path is written by the `neg1_topic.to_csv(...)` cell
# further down, so running the notebook end to end replaces this input file.
tfcc_data = pd.read_csv('tfcc_submissions_neg1_topics.csv')

In [None]:
#### Import NLTK libraries for text preprocessing (tokenization, stop words, lemmatization)

In [3]:

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.tokenize import RegexpTokenizer
from nltk.stem.wordnet import WordNetLemmatizer

from collections import Counter
from nltk.util import ngrams
# Fetch the NLTK resources used by this notebook (skipped when already up to date).
nltk.download('punkt')  # tokenizer models behind word_tokenize / sent_tokenize
nltk.download('stopwords')  # stop-word lists read via stopwords.words('english')
nltk.download('wordnet')  # WordNet data backing WordNetLemmatizer
nltk.download('omw-1.4')  # Open Multilingual Wordnet; presumably needed by the WordNet reader in this NLTK version -- confirm

#Sentiment
#nltk.download('vader_lexicon')

#Creating frequency distribution
#from nltk.probability import FreqDist

#from nltk.sentiment import SentimentIntensityAnalyzer

#import plotly.express as px
#import matplotlib.pyplot as plt
#from matplotlib.pyplot import pie, axis, show
#import seaborn as sns

[nltk_data] Downloading package punkt to /home/steve/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/steve/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/steve/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/steve/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

#### Setup stop words and add additional words relevant to the subreddit (these can be found through the first one or two passes of topic modeling and may require some trial and error to identify)

Note: Remove work, time, day from stop words for first pass.  Add back in when running for -1 topic

In [4]:
# Words specific to this subreddit that are excluded in addition to the standard list.
subreddit_stop_words = [
    'call', 'customer', 'phone', 'company', 'caller', 'account',
    'calls', 'center', 'x200b', 'work', 'time', 'day',
]

# NLTK's English stop words followed by the subreddit-specific additions.
stop_words = stopwords.words('english') + subreddit_stop_words

print(len(stop_words))

191


#### Convert text to lower case, tokenize self text field, remove stop words, lemmatize tokens

In [5]:
# Work on an independent copy: a plain assignment only aliases the frame, so the
# preprocessing cells below would silently modify the data as loaded from disk.
data = tfcc_data.copy()

In [6]:
#Transforming all text to lower case
data['selftext'] = data['selftext'].astype(str).str.lower()
#data.head()

#Tokenizing selftext
tokenizer = RegexpTokenizer('\w+')
data['tokens'] = data['selftext'].apply(tokenizer.tokenize)
#data.head()

#Removning stop words using lambda
data['tokens'] = data['tokens'].apply(lambda x: [item for item in x if item not in stop_words])
#data.head()

#Converting to a string and keeping longer than 2 chars
data['tokens'] = data['tokens'].apply(lambda x: ' '.join([item for item in x if len(item) > 2]))
#data.head()

#Lemmatization of tokens
lemmatizer = WordNetLemmatizer()
data['lemmatized'] = data['tokens'].apply(lemmatizer.lemmatize)
data.head()

Unnamed: 0,id,title,selftext,author,score,num_comments,created_date,selftext_length,tokens,lemmatized,topic
0,yc4bm,"No brain, No pain.",i swear that convergys will hire any moron tha...,[deleted],28,10,2012-08-16 19:38:03,218,swear convergys hire moron walk door keep bodi...,swear convergys hire moron walk door keep bodi...,-1
1,yamv8,"""how fast does your modems go ?",starting off the awesome subreddit\n\ni used t...,[deleted],34,4,2012-08-16 01:24:57,200,starting awesome subreddit used tech support g...,starting awesome subreddit used tech support g...,-1
2,ymjc0,"Tech support agent, and yet I can't touch my c...",at my work we are not allowed to adjust the mo...,hanzors,40,19,2012-08-22 06:27:29,104,allowed adjust monitors plug cables anything t...,allowed adjust monitors plug cables anything t...,-1
3,yqy54,She done did all dem tings,after spending the past week in deepest darkes...,PoglaTheGrate,15,2,2012-08-24 11:01:32,287,spending past week deepest darkest northern ns...,spending past week deepest darkest northern ns...,-1
4,yql1h,"""Why can't I keep my stolen cable?!?!?""",i used to work in a 3rd party call center. one...,thisisrequired,33,6,2012-08-24 04:39:44,301,used 3rd party one campaigns calling people le...,used 3rd party one campaigns calling people le...,-1


#### Convert pandas dataframe to huggingface dataset

In [7]:
# Column-oriented mapping: each column name maps to the list of its values.
tfcc_data = data.to_dict(orient='list')

# Build the Hugging Face dataset from that mapping.
dataset = Dataset.from_dict(tfcc_data)

In [8]:
# Show the dataset summary (feature names and row count).
dataset

Dataset({
    features: ['id', 'title', 'selftext', 'author', 'score', 'num_comments', 'created_date', 'selftext_length', 'tokens', 'lemmatized', 'topic'],
    num_rows: 5316
})

#### Create topic model with BERTopic.  This takes about 1 min on a laptop with 8 GB GPU, 32 GB RAM

NOTE: when running the second time for the -1 topics, update diversity to 0.4

In [9]:
# Materialise the documents once as a plain list so the encoder and BERTopic
# receive exactly the same sequence of strings.
docs = list(dataset["lemmatized"])

# Sentence-embedding model shared by the explicit encode() call and BERTopic.
sentence_model = SentenceTransformer("all-MiniLM-L12-v2")
embeddings = sentence_model.encode(docs, show_progress_bar=False)

# Maximal Marginal Relevance re-ranks topic words; `diversity` is the value to
# adjust between passes (see the note above this cell).
representation_model = MaximalMarginalRelevance(diversity=0.4)
model = BERTopic(embedding_model=sentence_model, representation_model=representation_model)

# Pass the embeddings computed above. Without them BERTopic encodes every
# document again with the same model and the explicit encode() call is wasted.
topics, probs = model.fit_transform(docs, embeddings=embeddings)

In [10]:
# Topic summary table (columns Topic, Count, Name), kept for the display and
# export cells below. A single call suffices: a bare call placed before this
# assignment is neither displayed nor stored.
details = model.get_topic_info()

#### Update the dataframe to include the topic number for each submission

In [11]:
# Attach the fitted topic id to each submission. `topics` follows the row order
# of `data`, because the documents passed to fit_transform were built from it.
# This replaces the 'topic' column that was loaded from the CSV.
data['topic'] = topics

In [12]:
# Display the topic summary table.
details

Unnamed: 0,Topic,Count,Name
0,-1,3491,-1_get_need_name_service
1,0,165,0_name_thank_need_lady
2,1,119,1_support_tech_software_device
3,2,99,2_payments_refund_bank_card
4,3,89,3_insurance_claim_car_name
...,...,...,...
56,55,11,55_claims_weeks_client_pregnancy
57,56,11,56_prices_pandemic_milk_stores
58,57,11,57_code_item_tenant_marketing
59,58,11,58_masculine_lights_department_want


#### A large number of submissions are assigned topic -1, which BERTopic uses for outliers that do not fit any specific topic.  We'll save these to a separate CSV to look at later

In [13]:
# Select the submissions assigned topic -1.
is_neg1 = data['topic'] == -1
neg1_topic = data.loc[is_neg1]

# NOTE(review): this is the same path the notebook reads in its first data cell,
# so each run replaces its own input with the current pass's -1 rows.
neg1_topic.to_csv('tfcc_submissions_neg1_topics.csv', index=False)

#### Take a look at the top 20 topics

In [13]:
# Bar charts of topic words: 20 topics, 5 words per topic.
model.visualize_barchart(top_n_topics=20, n_words=5)


### Filter dataset to only the topics of interest for analysis.  In this case we'll keep top 20 topics which are 0 - 19.  Tokens and Lemmatized columns are no longer required and are dropped.

In [14]:
# The intermediate text columns are not needed in the exported dataset.
data = data.drop(columns=['tokens', 'lemmatized'])


In [15]:
# Keep topics 0 through 19 (both ends inclusive); topic -1 and topics 20+ are dropped.
data = data.loc[data['topic'].between(0, 19)]


In [16]:
# Display the filtered submissions (pandas truncates the middle rows of long frames).
data

Unnamed: 0,id,title,selftext,author,score,num_comments,created_date,selftext_length,topic
2,ymjc0,"Tech support agent, and yet I can't touch my c...",at my work we are not allowed to adjust the mo...,hanzors,40,19,2012-08-22 06:27:29,104,1
9,10mbtu,Passwords,"so i work for a major cellphone company, my se...",[deleted],19,12,2012-09-28 14:41:55,245,17
11,10qjhg,"""Treat me like a friend.""","got a customer the other day who, in the middl...",TroubleEntendre,37,9,2012-10-01 00:53:29,104,12
12,10vp4l,Any stories/rants from chat reps?,i am a chat rep for a major financial firm. i ...,solidsnake78,11,11,2012-10-03 15:58:30,300,9
14,11ddgv,Good universal script for transferring a call?,"not sure if this is the best place to ask, but...",beepbeep27,7,23,2012-10-12 16:16:57,107,19
...,...,...,...,...,...,...,...,...,...
5293,z8k8s6,"Be careful what you ask for, you just might ge...","i work in a call centre. it's a small-ish, fam...",ig0tst0ries,106,7,2022-11-30 06:52:58,312,18
5297,zdhg4r,Most irritating survey response,i had this call with this woman where i did ev...,Toritoise,280,64,2022-12-05 19:50:53,186,0
5302,zh95z0,How do you combat the urge to assume everyone’...,i work at a utility company. people call in ...,gameofthrones_addict,93,49,2022-12-09 21:23:33,355,2
5308,zp0w7a,"If anyone DOESNT need knives, it’s him.","before where i am now, i was customer service ...",bremariemantis,170,11,2022-12-18 15:19:39,252,14


### The submission counts for the topics

In [17]:
# First 20 rows of the topic summary.
# NOTE(review): row 0 is topic -1, so these rows cover -1 and topics 0-18;
# use head(21) if topic 19 should be included -- confirm intent.
details.head(20)

Unnamed: 0,Topic,Count,Name
0,-1,3491,-1_get_need_name_service
1,0,165,0_name_thank_need_lady
2,1,119,1_support_tech_software_device
3,2,99,2_payments_refund_bank_card
4,3,89,3_insurance_claim_car_name
5,4,66,4_bill_credit_people_payment
6,5,65,5_client_number_order_address
7,6,58,6_feel_jobs_life_centers
8,7,58,7_customers_service_screaming_like
9,8,57,8_service_amp_credit_isp


#### Export the filtered dataset and the topic counts for the top 20 topics.

In [18]:
# Write the per-topic counts. The '.csv' extension keeps the file name
# consistent with the other CSV exports in this notebook.
details.to_csv('tfcc_submissions_topic_counts_neg1.csv', index=False)

In [19]:
# Write the filtered submissions (topics 0-19) without the DataFrame index.
data.to_csv('tfcc_submissions_selected_topics_neg1.csv', index=False)

#### Run from top again on the -1 topics from first pass