In [1]:
import gensim
from gensim import corpora
from gensim.models import LdaModel
from gensim.models import CoherenceModel
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Downloading NLTK resources
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

# Loading data
tickets_df = pd.read_csv('.\input_to_unsupervised.csv',index_col="Ticket ID")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sksub\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\sksub\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sksub\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Preprocessing function
def preprocess_text(text):
    # Tokenization
    tokens = word_tokenize(text.lower())
    
    # Removing stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]
    
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    
    return tokens

In [3]:
# Preprocess the text data
tickets_df['tokens'] = tickets_df["Ticket Description"].apply(preprocess_text)

# Creating a dictionary representation of the documents
dictionary = corpora.Dictionary(tickets_df['tokens'])

# Creating a bag-of-words representation of the documents
corpus = [dictionary.doc2bow(tokens) for tokens in tickets_df['tokens']]

# Training the LDA model
lda_model = LdaModel(corpus=corpus,
                     id2word=dictionary,
                     num_topics=50,
                     random_state=42,
                     passes=10)

In [4]:
# Model Evaluation
coherence_model_lda = CoherenceModel(model=lda_model, texts=tickets_df['tokens'], dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print(f'Coherence Score: {coherence_lda}')

Coherence Score: 0.4425237348615802


In [5]:
# Assign topics to tickets
topics = lda_model.get_document_topics(corpus)

# Add topics to DataFrame
tickets_df['topic'] = [max(t, key=lambda x: x[1])[0] for t in topics]


In [6]:
# Change numbers to topic 
for topic_id, topic_words in lda_model.print_topics():
    tickets_df["topic"][tickets_df["topic"]==topic_id]=topic_words

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tickets_df["topic"][tickets_df["topic"]==topic_id]=topic_words
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tickets_df["topic"][tickets_df["topic"]==topic_id]=topic_words


In [10]:
#Saving the data
tickets_df.to_csv('.\outputs.csv')