In [16]:
import pandas as pd
import numpy as np
import nltk
import numpy as np
import seaborn as sns
from tqdm import tqdm
import tomotopy as tp
import spacy

def combine(x):
        out = ""
        cols = ['title', 'pros', 'cons', "text"]
        for col in cols:
                if type(x[col]) == str:
                        out += x[col] + ". "
        return out

companies = ['uber', 'uber-drivers', 'ubereats', 'lyft', 'instacart', 'grubhub', 'doordash', 'postmates']
nlp = spacy.load('en_core_web_sm')

def text2lemma(text):
    doc = nlp(text)
    lemmas = [token.lemma_ for token in doc 
                         if not token.is_stop and token.text not in companies]

    return " ".join(lemmas)

def number_of_months(date):
    return (date.year - 2018) * 12 + date.month - 1

We model topics based on **all the data** from Indeed (over **all the periods**). 

**Output**:
    Pretrained model

In [17]:
paths = ["indeed_uber_reviews.csv", "indeed_Uber-Drivers_reviews.csv", 
         "indeed_Ubereats_reviews.csv", "indeed_Lyft_reviews.csv", "indeed_Instacart_reviews.csv",
         "indeed_Grubhub_reviews.csv", "indeed_Doordash_reviews.csv", "indeed_Postmates_reviews.csv"]
dataframes = []
for path in paths:
    _df = pd.read_csv(path)
    _df["company"] = path.split("_")[1].lower()
    dataframes.append(_df)
    
df = pd.concat(dataframes).drop_duplicates().set_index("id").drop(["url", "language"], axis=1)

df["datetime"] = pd.to_datetime(df["datetime"])
df['year'] = df['datetime'].dt.year
df['month'] = df['datetime'].dt.month
df = df[ (df["year"] > 2017) & (df["year"] < 2023)]

df["combined_text"] = df.apply(lambda x: combine(x), axis=1)
df['processed_text'] = df['combined_text'].apply(text2lemma)

In [20]:
corpus = tp.utils.Corpus(tokenizer=tp.utils.SimpleTokenizer())

for i, row in tqdm(df.iterrows()):
    text = row["processed_text"]
    corpus.add_doc(raw=text)

model = tp.PAModel(k1=1, k2=5,  tw=2, min_cf=10, corpus=corpus, seed=0)
model.train(1000, workers=4)
model.save("topic_modeling.model")

25228it [00:02, 11204.97it/s]
  model.train(1000, workers=4)


In [21]:
top_n = 10
for k in range(model.k2):
    print("TOPIC", k)
    print("Words:")
    print([item[0] for item in model.get_topic_words(k, top_n=top_n)])
    print("==========================")

TOPIC 0
Words:
['job', 'great', 'work', 'good', 'money', 'time', 'schedule', 'want', 'people', 'hour']
TOPIC 1
Words:
['pay', 'order', 'tip', 'delivery', 'hour', 'time', 'gas', 'money', 'day', 'low']
TOPIC 2
Words:
['people', 'uber', 'work', 'driver', 'great', 'lyft', 'company', 'drive', 'good', 'job']
TOPIC 3
Words:
['driver', 'customer', 'support', 'uber', 'company', 'pay', 'care', 'car', 'app', 'order']
TOPIC 4
Words:
['work', 'company', 'good', 'job', 'great', 'people', 'want', 'like', 'time', 'schedule']
