### Step 5. Topic Modeling of Submissions

#### Import required libraries and read submissions csv into a Pandas dataframe

In [1]:
import pandas as pd
from datasets import Dataset
import nltk, re
from bertopic import BERTopic
from bertopic.representation import MaximalMarginalRelevance
from sentence_transformers import SentenceTransformer


#### Import CSV file from steps 2 - 4

In [2]:
tfcc_data = pd.read_csv('tfcc_submissions_for_top_modeling.csv')

In [None]:

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.tokenize import RegexpTokenizer
from nltk.stem.wordnet import WordNetLemmatizer

from collections import Counter
from nltk.util import ngrams
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

#### Setup stop words and add additional words relevant to the subreddit (these can be found through the first one or two passes of topic modeling and may require some trial and error to identify)

Note: Remove work, time, day from stop words for first pass.  Add back in when running for -1 topic

In [4]:
stop_words = stopwords.words('english')
stop_words.extend(['call', 'customer', 'phone', 'company', 'caller', 'account', 'calls','center','x200b', 'work', 'time', 'day'])

print(len(stop_words))

191


#### Convert text to lower case, tokenize self text field, remove stop words, lemmatize tokens

In [5]:
data = tfcc_data

In [6]:
#Transforming all text to lower case
data['selftext'] = data['selftext'].astype(str).str.lower()
#data.head()

#Tokenizing selftext
tokenizer = RegexpTokenizer('\w+')
data['tokens'] = data['selftext'].apply(tokenizer.tokenize)
#data.head()

#Removning stop words using lambda
data['tokens'] = data['tokens'].apply(lambda x: [item for item in x if item not in stop_words])
#data.head()

#Converting to a string and keeping longer than 2 chars
data['tokens'] = data['tokens'].apply(lambda x: ' '.join([item for item in x if len(item) > 2]))
#data.head()

#Lemmatization of tokens
lemmatizer = WordNetLemmatizer()
data['lemmatized'] = data['tokens'].apply(lemmatizer.lemmatize)
data.head()

Unnamed: 0,id,title,selftext,author,score,num_comments,created_date,selftext_length,tokens,lemmatized
0,yc4bm,"No brain, No pain.",i swear that convergys will hire any moron tha...,[deleted],28,10,2012-08-16 19:38:03,218,swear convergys hire moron walk door keep bodi...,swear convergys hire moron walk door keep bodi...
1,yamv8,"""how fast does your modems go ?",starting off the awesome subreddit\n\ni used t...,[deleted],34,4,2012-08-16 01:24:57,200,starting awesome subreddit used tech support g...,starting awesome subreddit used tech support g...
2,ydntm,"[I can't tell you] information line, this is P...",yay! 28th subscriber!\n\ni used to work for a...,PoglaTheGrate,29,4,2012-08-17 15:02:53,463,yay 28th subscriber used government informatio...,yay 28th subscriber used government informatio...
3,yicc4,Dishwasher blues,the stories i've heard amazes me but this one ...,[deleted],31,15,2012-08-20 05:05:34,431,stories heard amazes one takes cake tell peopl...,stories heard amazes one takes cake tell peopl...
4,ymjc0,"Tech support agent, and yet I can't touch my c...",at my work we are not allowed to adjust the mo...,hanzors,40,19,2012-08-22 06:27:29,104,allowed adjust monitors plug cables anything t...,allowed adjust monitors plug cables anything t...


#### Convert pandas dataframe to huggingface dataset

In [7]:
tfcc_data = data.to_dict('list')
    
# Create a Hugging Face dataset from the dictionary
dataset = Dataset.from_dict(tfcc_data)

In [8]:
dataset

Dataset({
    features: ['id', 'title', 'selftext', 'author', 'score', 'num_comments', 'created_date', 'selftext_length', 'tokens', 'lemmatized'],
    num_rows: 10129
})

#### Create topic model with BERTopic.  This takes about 1 min on a laptop with 8 GB GPU, 32 GB RAM

While the documentation for BERTopic indicates stop word removal and tokenization may not be necessary, I found better results when including these steps

NOTE: when running the second time for the -1 topics, update diversity to 0.4

In [9]:
sentence_model = SentenceTransformer("all-MiniLM-L12-v2")
embeddings = sentence_model.encode(dataset["lemmatized"], show_progress_bar=False)
representation_model = MaximalMarginalRelevance(diversity=0.4)
model = BERTopic(embedding_model=sentence_model, representation_model=representation_model)
topics, probs = model.fit_transform(dataset['lemmatized'])

In [10]:
model.get_topic_info()

details = model.get_topic_info()

#### Update the dataframe to include the topic number for each submission

In [11]:
data['topic'] = topics

In [12]:
details

Unnamed: 0,Topic,Count,Name
0,-1,4561,-1_get_like_number_need
1,0,1048,0_tech_cable_internet_support
2,1,893,1_card_bank_payment_name
3,2,655,2_insurance_clinic_need_car
4,3,544,3_job_feel_working_training
...,...,...,...
57,56,10,56_withdraw_hardship_401k_payroll
58,57,10,57_shipping_express_haircut_pennies
59,58,10,58_email_consent_total_accountant
60,59,10,59_gooddad_dazb_replacement_prepaid


#### There are a high number of topic -1, which are basically submissions that fall into a general category.  We'll save these to a separate CSV to look at later

In [13]:
neg1_topic = data.loc[(data['topic'] == -1)]
neg1_topic.to_csv('tfcc_submissions_neg1_topics.csv', index=False)

#### Take a look at the top 20 topics

In [14]:
model.visualize_barchart(top_n_topics=20, n_words=5)


### Filter dataset to only the topics of interest for analysis.  In this case we'll keep top 20 topics which are 0 - 19.  Tokens and Lemmatized columns are no longer required and are dropped.

In [15]:
data = data.drop(['tokens', 'lemmatized'], axis=1)


In [16]:
data = data.loc[(data['topic'] >= 0) & (data['topic'] <= 19)]


In [17]:
data

Unnamed: 0,id,title,selftext,author,score,num_comments,created_date,selftext_length,topic
1,yamv8,"""how fast does your modems go ?",starting off the awesome subreddit\n\ni used t...,[deleted],34,4,2012-08-16 01:24:57,200,0
3,yicc4,Dishwasher blues,the stories i've heard amazes me but this one ...,[deleted],31,15,2012-08-20 05:05:34,431,0
4,ymjc0,"Tech support agent, and yet I can't touch my c...",at my work we are not allowed to adjust the mo...,hanzors,40,19,2012-08-22 06:27:29,104,0
5,yp5ab,Old guy likes to call cable company call cente...,i worked in the call center for a cable compan...,BananaVisit,48,9,2012-08-23 15:04:54,223,0
12,zgoxq,Hurricanes,i was working at an online christian bookstore...,Ashleyrah,39,2,2012-09-06 19:11:23,154,4
...,...,...,...,...,...,...,...,...,...
10121,zwnowo,Why is it so difficult to figure out for custo...,i work for a utility company. i had a call ea...,gameofthrones_addict,57,16,2022-12-27 19:25:20,255,5
10122,zwohz7,Something I've been thinking about...,does anyone else think that if these call cent...,ghostof_lisasbabytoe,22,12,2022-12-27 19:58:39,225,3
10125,zxn7bt,Call Totals Giving me Anxiety,does anyone else have to make a certain amount...,BatBitch1016,45,7,2022-12-28 22:31:50,165,3
10126,zz7684,"kudos to you guys, I don't know how you do it.","i worked in retail for 7 years, recently took ...",Fact0ry0fSadness,211,54,2022-12-30 18:07:48,198,3


### The submission counts for the topics

In [18]:
details.head(20)

Unnamed: 0,Topic,Count,Name
0,-1,4561,-1_get_like_number_need
1,0,1048,0_tech_cable_internet_support
2,1,893,1_card_bank_payment_name
3,2,655,2_insurance_clinic_need_car
4,3,544,3_job_feel_working_training
5,4,268,4_delivery_store_orders_package
6,5,244,5_bill_meter_payments_due
7,6,144,6_tow_roadside_truck_assistance
8,7,143,7_people_like_say_voice
9,8,102,8_guest_reservations_hotels_booking


#### Export the filtered dataset and the topic counts for the top 20 topics.

In [19]:
details.to_csv('tfcc_submissions_topic_counts.csv', index=False)

In [20]:
data.to_csv('tfcc_submissions_selected_topics.csv', index=False)

#### Run from top again on the -1 topics from first pass