### Installing BERTopic and importing all necessary libraries

In [None]:
# Pinned to the release this notebook was last executed with (see the captured
# install log); %pip targets the environment of the running kernel.
%pip install -q bertopic==0.16.2

Collecting bertopic
  Downloading bertopic-0.16.2-py2.py3-none-any.whl (158 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m158.8/158.8 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
Collecting hdbscan>=0.8.29 (from bertopic)
  Downloading hdbscan-0.8.37-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m15.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting umap-learn>=0.5.0 (from bertopic)
  Downloading umap_learn-0.5.6-py3-none-any.whl (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.7/85.7 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
Collecting sentence-transformers>=0.4.1 (from bertopic)
  Downloading sentence_transformers-3.0.1-py3-none-any.whl (227 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.1/227.1 kB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
Collecting cython<3,>=0.27 (from hdbscan>=0.8.29->bertopic)
  Dow

In [None]:
import numpy as np
import pandas as pd
from bertopic import BERTopic
import nltk
from nltk.corpus import stopwords
from nltk import WordNetLemmatizer
from nltk.tokenize import word_tokenize
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
from nltk.stem import PorterStemmer
from nltk.stem import SnowballStemmer
import re

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


### Reading the Excel files

In [None]:
# Folder that holds the NSS student-comment workbooks.
DATA_DIR = "/content/drive/MyDrive/DissertationProject/Data/"
# Column names shared by every per-year frame once 2023 has been renamed.
COMMENT_COLUMNS = ['Positive comment', 'Negative comment']

# The 2023 workbook is read from disk once; `data` is bound to that same
# full-width frame instead of to a second, identical read of the file.
# Note the header row differs per workbook (row 2 for 2023, row 3 for 2021/2022).
data2023_full = pd.read_excel(DATA_DIR + "NSS2023_BrunelStudentComments_published2023-10-11.xlsx", header=[2])
data = data2023_full
data2022 = pd.read_excel(DATA_DIR + "NSS2022_StudentComments_ALL_pub2022_0922.xlsx", header=[3])
data2021 = pd.read_excel(DATA_DIR + "NSS2021_ StudentComments_ALL.xlsx", header=[3], sheet_name="NSS2021Comments")

# Keep only the two free-text columns. The 2023 sheet names them in the plural,
# so they are renamed to match the 2021/2022 spelling.
data2023 = data2023_full[['Positive comments', 'Negative comments']].rename(
    columns={'Positive comments': 'Positive comment', 'Negative comments': 'Negative comment'}
)
data2022 = data2022[COMMENT_COLUMNS]
data2021 = data2021[COMMENT_COLUMNS]


In [None]:
# Stack the three years into one frame. ignore_index=True gives the result a
# fresh 0..n-1 index instead of repeating each year's own row labels.
combined_data = pd.concat([data2021, data2022, data2023], ignore_index=True)
#combined_data.head()
#combined_data.tail()

# Choose the frame the rest of the notebook analyses:
#   data = combined_data   -> all three years together
#   data = data202n        -> a single year
data = data2023


In [None]:
# info() prints the dtype / non-null summary; the per-column count of missing
# values from isna().sum() is the value displayed for the cell.
data.info()
data.isna().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1022 entries, 0 to 1021
Data columns (total 2 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Positive comment  920 non-null    object
 1   Negative comment  870 non-null    object
dtypes: object(2)
memory usage: 16.1+ KB


Positive comment    102
Negative comment    152
dtype: int64

In [None]:
# NOTE(review): dropna() with its default how='any' removes a row when either
# column is missing, so a positive comment is discarded whenever its negative
# counterpart is empty (and vice versa). Dropping NaNs per column would keep
# those comments - confirm which behaviour is intended.
data = data.dropna()

In [None]:
# One Series of raw documents per sentiment column.
docs_pos, docs_neg = data['Positive comment'], data['Negative comment']
# Alternative: analyse both sentiments as a single corpus.
#docs = pd.concat([docs_pos, docs_neg])

In [None]:
# Peek at the last positive comments; gaps in the index are rows removed by dropna().
docs_pos.tail()

1015    It was good that along with the accounting deg...
1016    Tutors are very good in explaining and making ...
1018    The involvement of staff from industry was vit...
1019    Teaching staff wanted to give us extra help an...
1020    The 12-month placements were the best parts of...
Name: Positive comment, dtype: object

### Preprocessing

In [None]:
#Preprocessing


# English stop words as a set, so membership tests in the helpers below are fast.
stop = set(stopwords.words('english'))
stemmer = PorterStemmer()
# Instantiate the lemmatizer: the bare class object cannot be used to call
# lemmatize() on a word.
lemma = WordNetLemmatizer()

# To remove numbers
def remove_num(text):
    """Return ``text`` with every run of digits deleted (nothing is inserted in its place)."""
    digit_run = re.compile(r'\d+')
    return digit_run.sub('', text)

# To unify whitespaces
def unify_whitespaces(text):
    """Collapse every run of whitespace in ``text`` into a single space.

    Tabs, newlines and other whitespace characters are treated like spaces,
    so multi-line comments end up on one line. Leading/trailing whitespace is
    reduced to one space, not stripped.
    """
    return re.sub(r'\s+', ' ', text)

# To remove punctuation
def remove_punctuation(text, chars='?.;:!",'):
    """Delete every character of ``chars`` from ``text``.

    Parameters
    ----------
    text : str
        The string to clean.
    chars : str, optional
        Characters to delete. The default is the seven marks this notebook has
        always stripped (? . ; : ! " ,). Brackets, apostrophes, hyphens and
        "&" are NOT in the default; pass a wider set (for example
        ``string.punctuation``) to strip them as well.

    Returns
    -------
    str
        ``text`` without the listed characters; nothing is inserted in their place.
    """
    return text.translate(str.maketrans('', '', chars))

# To remove stopwords
def remove_stopwords(text):
    """Lower-case ``text``, drop every word found in the module-level ``stop`` set,
    and return the remaining words joined by single spaces."""
    kept = []
    for token in text.split():
        lowered = token.lower()
        if lowered in stop:
            continue
        kept.append(lowered)
    return ' '.join(kept)

# To Apply Stemming
def Stemming(text):
    """Tokenize ``text`` and return the Snowball (English) stems joined by spaces.

    Stop words are not filtered here, so the function does not touch the
    stop-word corpus at all.
    """
    snowball_stemmer = SnowballStemmer('english')
    return ' '.join(snowball_stemmer.stem(token) for token in nltk.word_tokenize(text))

# To Apply Lemmatizing
def Lemmatizing(text):
    """Tokenize ``text``, drop stop words, and return the WordNet lemmas joined by spaces.

    Stop words are looked up in the module-level ``stop`` set (built once from
    the English stop-word list) rather than re-reading the corpus for every
    document. The comparison is case-insensitive; kept tokens are lemmatized
    as they are, without lower-casing.
    """
    lemmatizer = WordNetLemmatizer()
    lemmatized_words = [
        lemmatizer.lemmatize(token)
        for token in word_tokenize(text)
        if token.lower() not in stop
    ]
    return ' '.join(lemmatized_words)




In [None]:
#Applying all the cleaning functions
def cleaning(data):
    """Run the full text-cleaning pipeline over a Series of comments.

    Parameters
    ----------
    data : pandas.Series
        Raw comments. Missing entries are dropped and remaining values are
        converted to ``str`` first, so NaN or numeric cells cannot crash the
        regex-based helpers.

    Returns
    -------
    pandas.Series
        A new Series (the input is not modified) that keeps the original index
        labels of the surviving rows. Steps, in order: remove digits, remove
        punctuation, collapse whitespace, lower-case and drop stop words, drop
        duplicate documents, lemmatize.
    """
    return (
        data
        .dropna()
        .astype(str)
        .apply(remove_num)
        .apply(remove_punctuation)
        .apply(unify_whitespaces)
        .apply(remove_stopwords)
        # Duplicates are removed after normalisation so that comments differing
        # only in case, digits or punctuation count as the same document.
        .drop_duplicates()
        # .apply(Stemming)  # alternative to lemmatizing, currently disabled
        .apply(Lemmatizing)
    )

# Clean each sentiment's documents independently (positive first, then negative).
docs_pos_processed, docs_neg_processed = cleaning(docs_pos), cleaning(docs_neg)

In [None]:
# Sanity check: first cleaned positive comments (lower-cased, stop words removed, lemmatized).
docs_pos_processed.head()

0                           practical project engaging
1    many professor passionate subject matter many ...
2    campus equipment good reliable tema 's project...
3    teaching staff content good level amount infor...
4                great exam online level stress person
Name: Positive comment, dtype: object

#### Choose your embedding model

In [None]:
# Topic model for the positive comments. The embedding model is selected by
# name; the commented-out line is the alternative checkpoint.
# NOTE(review): no random seed is passed here. BERTopic's default dimensionality
# reduction step is presumably stochastic, so topic ids and counts may change
# between runs - confirm and consider supplying a seeded model.
#model = BERTopic(embedding_model="all-mpnet-base-v2")
model = BERTopic(embedding_model="all-MiniLM-L12-v2")

Fitting the model on the documents

In [None]:
# fit_transform is documented to take a list of strings, so the cleaned Series
# (whose index has gaps after dropna/drop_duplicates) is converted explicitly.
topics, probs = model.fit_transform(docs_pos_processed.tolist())
# Build "word - word - word" labels without the numeric topic prefix and
# register them; they appear as the CustomName column of get_topic_info().
topic_labels = model.generate_topic_labels(topic_prefix=False, separator=' - ')
model.set_topic_labels(topic_labels)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/352 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
# Overview table of the positive-comment topics; the row with Topic == -1 is
# listed first with the largest count.
model.get_topic_info()

Unnamed: 0,Topic,Count,Name,CustomName,Representation,Representative_Docs
0,-1,308,-1_lecturer_student_good_course,lecturer - student - good,"[lecturer, student, good, course, really, well...",[teaching thorough considerate throughout staf...
1,0,67,0_staff_teaching_helpful_always,staff - teaching - helpful,"[staff, teaching, helpful, always, supportive,...","[teaching staff friendly helpful, teaching sta..."
2,1,65,1_course_module_lecture_seminar,course - module - lecture,"[course, module, lecture, seminar, well, learn...",[content within module great always new thing ...
3,2,39,2_resource_online_library_access,resource - online - library,"[resource, online, library, access, lecture, g...",[everything positive course easy access teache...
4,3,32,3_brunel_life_experience_also,brunel - life - experience,"[brunel, life, experience, also, student, desi...",[teaching staff game department incredible 're...
5,4,30,4_everything_fine_experience_positive,everything - fine - experience,"[everything, fine, experience, positive, nothi...","[everything good, everything seems fine conten..."
6,5,26,5_mental_health_support_wellbeing,mental - health - support,"[mental, health, support, wellbeing, service, ...","[mental health service really good, mental hea..."
7,6,26,6_tutor_personal_help_talk,tutor - personal - help,"[tutor, personal, help, talk, advice, support,...",[conversation discussion personal tutor crucia...
8,7,23,7_engaging_lecturer_lecture_interactive,engaging - lecturer - lecture,"[engaging, lecturer, lecture, interactive, int...",[staff always help lecturer lecture interactiv...
9,8,18,8_opportunity_work_valuable_allowed,opportunity - work - valuable,"[opportunity, work, valuable, allowed, skill, ...",[learned lot valuable skill developed understa...


In [None]:
# Words and weights for topic 2 ("resource - online - library" in the table above).
# NOTE(review): topic ids may differ on a re-run if the model is not seeded - confirm.
model.get_topic(2)

[('resource', 0.09514751650078064),
 ('online', 0.09257825174732207),
 ('library', 0.06383516870557547),
 ('access', 0.05047936916673139),
 ('lecture', 0.046971741056428526),
 ('good', 0.04336769676289636),
 ('recording', 0.04328156478871801),
 ('useful', 0.041620941933462816),
 ('brightspace', 0.03938694003609443),
 ('easy', 0.03927956255995771)]

In [None]:
# Example cleaned comments that the model reports as representative of topic 2.
model.get_representative_docs(2)

['everything positive course easy access teacher online resource',
 'library resource good',
 'year online even hybrid module online in-person come realise advantage particularly people physical disability mobility problem except able properly use lab due pandemic would good learning opportunity physically campus may good able interact another location live lecture useful expect come serious health issue accident final year come miss able participate interactive live online class particularly low-mobility day like learning get recording lecture despite able message lecturer subject lecture really miss feeling actually time lecture interacting peer & lecturer many lecturer kind patient really appreciate really great many event opportunity present probably would liked work worry spend time besides attending full-time university missed many workshop career event really wanted go']

In [None]:
# Display BERTopic's topic visualisation for the positive-comment model.
model.visualize_topics()

In [None]:
# Display BERTopic's bar-chart visualisation for the positive-comment model.
model.visualize_barchart()

In [None]:
# Topic ids with the number of documents assigned to each, largest first.
model.get_topic_freq()

Unnamed: 0,Topic,Count
0,-1,308
5,0,67
1,1,65
9,2,39
4,3,32
2,4,30
13,5,26
14,6,26
10,7,23
3,8,18


## For Negative Comments


In [None]:
# Same pipeline as for the positive comments, fitted on the negative ones.
#model_neg = BERTopic(embedding_model="all-mpnet-base-v2")
model_neg = BERTopic(embedding_model="all-MiniLM-L12-v2")
# fit_transform is documented to take a list of strings, so the cleaned Series
# (whose index has gaps after dropna/drop_duplicates) is converted explicitly.
topics_neg, probs_neg = model_neg.fit_transform(docs_neg_processed.tolist())
# NOTE(review): this rebinds the global `topic_labels` that the positive-comment
# cell also assigns; after this cell it holds the negative labels.
topic_labels = model_neg.generate_topic_labels(topic_prefix=False, separator=' - ')
model_neg.set_topic_labels(topic_labels)
model_neg.get_topic_info()

Unnamed: 0,Topic,Count,Name,CustomName,Representation,Representative_Docs
0,-1,243,-1_student_year_course_module,student - year - course,"[student, year, course, module, time, teaching...",[course always felt slow start ( beginning ter...
1,0,109,0_feedback_assessment_marking_exam,feedback - assessment - marking,"[feedback, assessment, marking, exam, mark, as...",[feedback really bad & professor take time mar...
2,1,84,1_lecturer_lecture_seminar_engaging,lecturer - lecture - seminar,"[lecturer, lecture, seminar, engaging, interac...","[sometimes lecture long engaging, lecturer eng..."
3,2,50,2_placement_staff_student_would,placement - staff - student,"[placement, staff, student, would, year, cours...",[staff awful response either get non-helpful r...
4,3,32,3_module_year_work_available,module - year - work,"[module, year, work, available, course, also, ...",[certain module international investment arbit...
5,4,31,4_covid_experience_year_due,covid - experience - year,"[covid, experience, year, due, feel, first, on...","[experience really good covid-, get full exper..."
6,5,28,5_negative_nothing_ca_sure,negative - nothing - ca,"[negative, nothing, ca, sure, none, really, pa...","[negative, nothing particularly negative, noth..."
7,6,26,6_online_book_tuition_lecture,online - book - tuition,"[online, book, tuition, lecture, learning, yea...","[online study, couple lecturer well prepared w..."
8,7,24,7_timetable_time_timetabling_deadline,timetable - time - timetabling,"[timetable, time, timetabling, deadline, march...",[everything including email timetabling etc on...
9,8,22,8_organisation_course_organised_club,organisation - course - organised,"[organisation, course, organised, club, year, ...",[issue time organisation wise would hear lot s...


In [None]:
# Display BERTopic's topic visualisation for the negative-comment model.
model_neg.visualize_topics()

In [None]:
# Display BERTopic's bar-chart visualisation for the negative-comment model.
model_neg.visualize_barchart()