In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/university-students-complaints-and-reports/Datasetprojpowerbi.csv


In [2]:
import pandas as pd
import string
import nltk
from nltk.corpus import stopwords
nltk.download("stopwords")
from nltk.tokenize import word_tokenize
nltk.download("punkt")
from nltk.stem import PorterStemmer
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
import numpy as np
import gensim
from gensim.models import LdaModel
from gensim.corpora import Dictionary
from gensim.models import CoherenceModel

df = pd.read_csv("/kaggle/input/university-students-complaints-and-reports/Datasetprojpowerbi.csv")

df["Reports_lower"] = df["Reports"].str.lower()

translator = str.maketrans(" "," ",string.punctuation)
df["Reports_nopunct"] = df["Reports_lower"].str.translate(translator)

english_stopwords = set(stopwords.words("english"))
custom_stopwords = {"im","really","student","like","feel","ive"}
combined_stopwords = english_stopwords.union(custom_stopwords)
def remove_stopwords(text):
    words = word_tokenize(text.lower())
    improved_text = [word for word in words if word not in combined_stopwords and word not in translator and not word.isdigit()]
    return " ".join(improved_text)

df["cleaned_Reports"] = df["Reports_nopunct"].apply(remove_stopwords)

stemmer = PorterStemmer()
def stem_text(text):
    words = word_tokenize(text)
    stemmed_words = [stemmer.stem(word) for word in words]
    return " ".join(stemmed_words)

df["stemmed_Reports"] = df["cleaned_Reports"].apply(stem_text)

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df["stemmed_Reports"])
tfidf_df = pd.DataFrame(X.toarray(), columns = vectorizer.get_feature_names_out())

sum_tfidf = np.array(tfidf_df.sum(axis=0)).flatten()
words_tfidf_df = pd.DataFrame(list(zip(tfidf_df,sum_tfidf)), columns=["word", "tfidf_score"])
most_frequent_words = words_tfidf_df.sort_values(by="tfidf_score", ascending=False)
print(" ")
print("Most Frequent Words incl. TfIdf Score:")
print(most_frequent_words.head(10))
print(" ")

lsa = TruncatedSVD(n_components=3)
X_lsa = lsa.fit_transform(X)
terms = vectorizer.get_feature_names_out()

number_top_words = 10
lsa_topics = []
for topic in lsa.components_:
    top_terms_index = topic.argsort()[:-number_top_words - 1:-1]
    top_terms = [terms[i] for i in top_terms_index]
    lsa_topics.append(top_terms)

topic_prevalence_sum = np.sum(X_lsa, axis=0)
sorted_topics_by_sum  = np.argsort(topic_prevalence_sum)[::-1]
sorted_topic_prevalence = topic_prevalence_sum[sorted_topics_by_sum]

print(" ")
print("Sorted Topics by Prevalence:")
for idx, topic_idx in enumerate(sorted_topics_by_sum):
    print(f"Topic #{topic_idx + 1} (Prevalence: {sorted_topic_prevalence[idx]:.4f})")
    
print(" ")
print("Extracted Topics:")
for i, topic in enumerate(lsa_topics):
    print(f"Topic {i}: {', '.join(topic)}")

tokenized_docs = [word_tokenize(text) for text in df["stemmed_Reports"]]
dictionary = Dictionary(tokenized_docs)
corpus = [dictionary.doc2bow(doc) for doc in tokenized_docs]

coherence_model_lsa = CoherenceModel(topics=lsa_topics, texts=tokenized_docs, dictionary=dictionary, coherence="c_v")
coherence_score = coherence_model_lsa.get_coherence()

print(" ")
print(f"Coherence Score: {coherence_score}")




[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
 
Most Frequent Words incl. TfIdf Score:
         word  tfidf_score
1103  student    36.850763
1228  univers    28.569606
8      access    28.494155
5      academ    28.289367
655     limit    27.049631
439      find    26.602705
786    option    26.039072
83      avail    25.734455
688      make    25.623847
744      need    25.107747
 
 
Sorted Topics by Prevalence:
Topic #1 (Prevalence: 163.5978)
Topic #2 (Prevalence: 30.2020)
Topic #3 (Prevalence: 11.5441)
 
Extracted Topics:
Topic 0: academ, access, limit, student, need, find, cours, difficult, avail, make
Topic 1: student, univers, option, offer, opportun, wish, job, campu, provid, avail
Topic 2: health, mental, balanc, work, stress, time, respons, manag, workload, commit
 
Coherence Score: 0.631858102