In [None]:
import os
import pandas as pd
import nltk
from nltk import word_tokenize
from gensim.models.wrappers import LdaMallet
import gensim.corpora as corpora
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [None]:
def load(path):
    """Read the review dataset from a CSV source.

    Parameters
    ----------
    path : str or file-like
        Passed unchanged to ``pandas.read_csv`` (local path, URL or an
        open text buffer).

    Returns
    -------
    pandas.DataFrame
        The parsed CSV contents, with pandas' default parsing options.
    """
    return pd.read_csv(path)

In [None]:
def processing(review_data):
    """Tokenise and filter the ``review`` column into noun-only token lists.

    Steps, in order:

    1. lower-case the text, replace every non-letter/non-whitespace character
       with a space and collapse runs of whitespace;
    2. tokenise with ``word_tokenize``;
    3. drop English stop words, tokens of 3 characters or fewer and tokens
       that are not purely alphabetic;
    4. lemmatise each token with ``WordNetLemmatizer``;
    5. keep only lemmas that ``nltk.pos_tag`` labels ``'NN'``;
    6. discard rows left with fewer than two tokens and reset the index.

    Parameters
    ----------
    review_data : pandas.DataFrame
        Must contain a ``review`` column. Missing values are treated as empty
        text. The input frame is NOT modified.

    Returns
    -------
    pandas.DataFrame
        A copy of the surviving rows with an added ``clean_review`` column
        (list of str per row) and a fresh 0..n-1 index.
    """
    # Build the expensive resources once instead of once per token / per row.
    stop_words = set(stopwords.words("english"))
    lemmatizer = WordNetLemmatizer()
    noun_cache = {}

    def is_noun(word):
        # NOTE(review): each lemma is tagged on its own (a one-word sentence),
        # so the tagger sees no context -- confirm this is intended.
        # The verdict for a given word never changes, hence the memo.
        if word not in noun_cache:
            noun_cache[word] = nltk.pos_tag([word])[0][1] == 'NN'
        return noun_cache[word]

    def clean_tokens(tokens):
        kept = [tok for tok in tokens
                if tok not in stop_words and len(tok) > 3 and tok.isalpha()]
        lemmas = [lemmatizer.lemmatize(tok) for tok in kept]
        return [lemma for lemma in lemmas if is_noun(lemma)]

    # fillna/astype keep word_tokenize from receiving NaN or non-string cells.
    text = review_data['review'].fillna('').astype(str).str.lower()
    text = text.str.replace(r'[^a-zA-Z\s]', ' ', regex=True)
    text = text.str.replace(r'\s{2,}', ' ', regex=True)

    # Work on a copy so the caller's frame is left untouched.
    cleaned = review_data.copy()
    cleaned['clean_review'] = text.apply(word_tokenize).apply(clean_tokens)

    # A document needs at least two tokens to be kept.
    has_enough_tokens = cleaned['clean_review'].map(len) > 1
    return cleaned[has_enough_tokens].reset_index(drop=True)

In [None]:
def matrix_creation(review_data):
    """Build the dictionary and bag-of-words corpus from the cleaned reviews.

    Parameters
    ----------
    review_data : pandas.DataFrame
        Must contain a ``clean_review`` column holding one token list per row.

    Returns
    -------
    tuple
        ``(texts, id2word, corpus)`` where ``texts`` is the ``clean_review``
        column itself, ``id2word`` is the ``corpora.Dictionary`` built from
        it, and ``corpus`` is a list with one ``id2word.doc2bow(...)`` result
        per document, in row order.
    """
    documents = review_data['clean_review']
    vocabulary = corpora.Dictionary(documents)
    bag_of_words = list(map(vocabulary.doc2bow, documents))
    return documents, vocabulary, bag_of_words

In [None]:
def load_mallet(system,folder_path):
    """Point ``MALLET_HOME`` at a Mallet install and return its launcher path.

    Parameters
    ----------
    system : str
        Operating-system family: ``'windows'``, ``'mac'`` or ``'linux'``
        (case-insensitive, surrounding whitespace ignored).
    folder_path : str
        Root folder of the Mallet distribution (the one containing ``bin``).

    Returns
    -------
    str
        ``folder_path`` + ``"\\bin\\mallet.bat"`` on Windows, otherwise
        ``folder_path`` + ``"/bin/mallet"``.

    Raises
    ------
    ValueError
        If ``system`` is not one of the supported names. The environment is
        left untouched in that case.
    """
    # Separators are spelled out per target system (not os.path.join) so the
    # result depends on the `system` argument, not on the machine running this.
    launcher_suffix = {
        'windows': "\\bin\\mallet.bat",
        'mac': "/bin/mallet",
        'linux': "/bin/mallet",
    }
    system_key = str(system).strip().lower()
    if system_key not in launcher_suffix:
        raise ValueError(
            "Unsupported system %r; expected one of %s"
            % (system, sorted(launcher_suffix))
        )

    # Only touch the environment once the arguments are known to be valid.
    os.environ['MALLET_HOME'] = folder_path
    return folder_path + launcher_suffix[system_key]

In [None]:
def create_mallet(mallet_path,num_topics,id2word,corpus,random_seed=10):
    """Train an ``LdaMallet`` topic model on the bag-of-words corpus.

    Parameters
    ----------
    mallet_path : str
        Path to the Mallet launcher (see ``load_mallet``).
    num_topics : int
        Number of topics to fit.
    id2word
        Dictionary mapping token ids to words, forwarded as ``id2word``.
    corpus
        Bag-of-words corpus, forwarded as ``corpus``.
    random_seed : int, optional
        Seed forwarded to ``LdaMallet``. Defaults to 10, the value that was
        previously hard-coded, so existing callers are unaffected.

    Returns
    -------
    The object returned by ``LdaMallet(...)``.

    Notes
    -----
    NOTE(review): ``gensim.models.wrappers`` is believed to be gone from
    gensim 4.x, so this probably needs gensim < 4 -- confirm against the
    environment's pinned version.
    """
    return LdaMallet(
        mallet_path=mallet_path,
        num_topics=num_topics,
        corpus=corpus,
        id2word=id2word,
        random_seed=random_seed,
    )

In [None]:
def topic_match(model,corpus,texts,data):
    """Attach each document's dominant topic number and label to ``data``.

    Parameters
    ----------
    model
        Topic model; ``model[corpus]`` must yield, per document, an iterable
        of ``(topic_num, proportion)`` pairs.
    corpus
        Bag-of-words corpus, one entry per row of ``data``.
    texts
        Token lists, one per document, in the same order as ``corpus``.
        Read positionally, so a list or a Series with any index works.
    data : pandas.DataFrame
        Frame the topics are attached to; must contain ``clean_review``.

    Returns
    -------
    pandas.DataFrame
        ``data`` without ``clean_review`` plus two columns:
        ``'Topic Number'`` and ``'Topic'``. A document with no tokens, or for
        which the model returns no topics, is labelled ``10`` / ``'Others'``.

    Raises
    ------
    KeyError
        If the dominant topic number has no entry in the label table below
        (the table covers topics 0-9), or ``data`` lacks ``clean_review``.
    """
    topic_labels = {0: 'App Responsiveness',
                    1: 'Money Growth (Interest Rates)',
                    2: 'Customer Services',
                    3: 'Services & Products',
                    4: 'User Interface',
                    5: 'Credit card',
                    6: 'Login & Account Setup',
                    7: 'Competition',
                    8: 'Safety',
                    9: 'Customer trust'}
    other_topic = (10, 'Others')

    documents = list(texts)  # positional access, independent of index labels
    records = []
    for position, row in enumerate(model[corpus]):
        # Exactly one record per document keeps the result aligned with `data`.
        if not documents[position] or not row:
            records.append(other_topic)
            continue
        # max() returns the first pair with the highest proportion, the same
        # pair a stable descending sort would put first.
        topic_num, _ = max(row, key=lambda pair: pair[1])
        topic_num = int(topic_num)
        records.append((topic_num, topic_labels[topic_num]))

    # Built in one go: DataFrame.append no longer exists in pandas 2.x and
    # was quadratic anyway. Contribution/keyword columns were always dropped
    # before returning, so they are no longer computed.
    topic_frame = pd.DataFrame(records, columns=['Topic Number', 'Topic'])
    if len(topic_frame) == len(data):
        # Pair rows by position even when `data` has a non-default index.
        topic_frame.index = data.index

    output = pd.concat([data, topic_frame], axis=1)
    return output.drop(['clean_review'], axis=1)

In [None]:
# Pipeline settings, gathered here so they can be changed in one place.
REVIEWS_URL = "https://github.com/rhyden-kx/SentimentPro/blob/main/data/Reviews.csv?raw=true"
MALLET_SYSTEM = "windows"
# NOTE(review): machine-specific absolute path; adjust to the local Mallet install.
MALLET_DIR = "C:\\Users\\user\\Downloads\\mallet-2.0.8\\mallet-2.0.8"
# topic_match only has labels for topics 0-9, so keep this at 10.
NUM_TOPICS = 10

# Load -> clean -> vectorise -> train -> label, each stage feeding the next.
review_data = load(REVIEWS_URL)
review_data = processing(review_data)
texts, id2word, corpus = matrix_creation(review_data)
mallet_path = load_mallet(MALLET_SYSTEM, MALLET_DIR)
mallet = create_mallet(mallet_path, NUM_TOPICS, id2word, corpus)
output = topic_match(mallet, corpus, texts, review_data)