In [None]:
%%capture
! pip install nltk --user
! pip install gensim --user
! pip install pyLDAvis --user
! pip install picky --user

In [None]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import gensim
import json
import nltk
import pickle
import pyLDAvis
import pyLDAvis.gensim
import string

from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords

from gensim import corpora
from gensim.models.ldamodel import LdaModel
from gensim.models.wrappers import LdaMallet
from gensim.models import CoherenceModel

from pprint import pprint

In [None]:
# `WordNetLemmatizer` needs the WordNet corpus and `stopwords.words('english')`
# needs the stopwords corpus; without the downloads a fresh environment raises
# LookupError as soon as either one is used.
nltk.download('wordnet')
nltk.download('stopwords')
pyLDAvis.enable_notebook()

In [None]:
def create_dir(path, name):
  """Ensure the directory ``{path}/{name}`` exists and return its path.

  Args:
    path: Parent directory; missing parents are created as well.
    name: Name of the sub-directory. Anything that can be formatted into a
      string works (the notebook passes integer question ids).

  Returns:
    The string ``f'{path}/{name}'``.

  Raises:
    OSError: If the directory cannot be created, e.g. because of missing
      permissions or because a regular file already occupies the path.
  """
  import os

  target = f'{path}/{name}'
  # exist_ok=True makes the call idempotent and removes the race between a
  # separate existence check and the creation. Errors propagate to the caller
  # instead of calling exit(), which would shut down the notebook kernel.
  os.makedirs(target, exist_ok=True)
  return target

In [None]:
# Make sure the folder that receives the per-question topic-model files exists.
create_dir('./data', 'open-ended')
# Load the raw survey export; the question columns are selected from it later.
survey_df = pd.read_csv('./data/MSD Survey.csv')

In [None]:
# Closed-ended survey questions, keyed by their position (0-14) in the list.
# NOTE(review): the texts presumably have to match the survey's column headers
# character for character -- confirm against the CSV before editing them.
close_ended_questions = dict(enumerate([
  "Since how many semesters have you been studying?",
  "How many hours per week do you spend on a homework?",
  "Are you satisfied with the topics taught in the lecture?",
  # One "intend to use" item per course topic; the topic is given in brackets.
  "Do you intend to use the topics you learned outside of the course? [Descriptive Statistics]",
  "Do you intend to use the topics you learned outside of the course? [Data Mining]",
  "Do you intend to use the topics you learned outside of the course? [Explorative Analysis]",
  "Do you intend to use the topics you learned outside of the course? [Prediction model]",
  "Do you intend to use the topics you learned outside of the course? [Natural Language Processing]",
  "Do you intend to use the topics you learned outside of the course? [Python]",
  "Do you intend to use the topics you learned outside of the course? [Jupyter Notebook]",
  "Do you intend to use the topics you learned outside of the course? [Experimentation]",
  "Do you intend to use the topics you learned outside of the course? [Surveys]",
  "Rate the difficulty of the lecture",
  "Would you attend this course again?",
  "Would you recommend this course to others?",
]))

# Open-ended (free-text) survey questions, keyed by their position (0-10) in
# the list. Trailing and doubled spaces inside the texts are kept as they are.
# NOTE(review): the texts presumably have to match the survey's column headers
# character for character -- confirm against the CSV before editing them.
open_ended_questions = dict(enumerate([
  "What do you study?",
  "What is your profession?",
  "Why did you choose the lecture MSD? ",
  "What did you miss in the lecture? ",
  "Reflect on the repository mining project (topic selection, were your expectations fulfilled, learnings, etc.)",
  "Why do you think that the weekly exercise sheets were a good preparation for the projects or not?",
  "What kind of extra material did you use to solve the weekly homeworks and the project?",
  "What did you like in the course?",
  "What did you not like in the course?",
  "How do you agree with the  following statement: The topics taught in the lecture can be applied in various application areas",
  "What would you suggest for the improvement of the lecture?",
]))

In [None]:
def create_question_df(survey_df, questions):
  """Return a copy of the survey columns that belong to ``questions``.

  Args:
    survey_df: DataFrame whose column labels are compared with the questions.
    questions: Either a dict mapping an id to a question text, or any
      container of column labels.

  Returns:
    A new DataFrame holding the matching columns in the order in which they
    appear in ``survey_df``. It has no columns if nothing matches.
  """
  # `q in some_dict` only looks at the keys, so a dict of id -> question text
  # never matched a column header. Compare against the texts as well; plain
  # key/element membership is kept so earlier matches are still selected.
  question_texts = set(questions.values()) if isinstance(questions, dict) else set()
  selected = [q for q in survey_df.columns if q in questions or q in question_texts]

  # One column selection instead of assigning column by column; .copy() keeps
  # the result independent of survey_df.
  return survey_df[selected].copy()

## Topic Modeling

In [None]:
# English stopword list from NLTK, held as a set for the membership tests in clean().
stop = set(stopwords.words('english'))
# Every character of string.punctuation; clean() drops these characters.
exclude = set(string.punctuation)
# Single WordNet lemmatizer instance reused by every clean() call.
lemma = WordNetLemmatizer()

# Remove stopwords and punctuation, lemmatize words
def clean(text):
  """Normalize one free-text answer for topic modeling.

  The text is lower-cased and split on whitespace, stopwords and punctuation
  characters are removed, and every remaining word is lemmatized.

  Args:
    text: The answer as a string.

  Returns:
    The cleaned words joined by single spaces.
  """
  # First pass while apostrophes are still present, so that stopwords which
  # contain punctuation themselves can match.
  words = [word for word in text.lower().split() if word not in stop]
  punc_free = ''.join(ch for ch in ' '.join(words) if ch not in exclude)
  # Second pass: a word such as "the," only turns into the stopword "the"
  # once the punctuation is gone, so it slipped through the first filter.
  words = [word for word in punc_free.split() if word not in stop]

  return ' '.join(lemma.lemmatize(word) for word in words)

In [None]:
def compute_lda_model(question_id, answers, num_topics=3, passes=50, random_state=None):
  """Train an LDA topic model on the answers to one question and store it.

  Three files are written to ``./data/open-ended/{question_id}``:
  ``corpus.pkl`` (pickled bag-of-words corpus), ``dictionary.gensim`` and
  ``model.gensim``.

  Args:
    question_id: Identifier used as the name of the output sub-directory.
    answers: List of tokenized answers (each answer is a list of words).
    num_topics: Number of topics passed to ``LdaModel``.
    passes: Number of training passes passed to ``LdaModel``.
    random_state: Forwarded to ``LdaModel``. Pass a fixed value to make the
      training repeatable; the default ``None`` keeps gensim's own default.
  """
  path = create_dir('./data/open-ended', question_id)

  dictionary = corpora.Dictionary(answers)
  doc_term_matrix = [dictionary.doc2bow(answ) for answ in answers]

  # The context manager closes (and thereby flushes) the file right after the
  # dump instead of leaving the handle open.
  with open(f'{path}/corpus.pkl', 'wb') as corpus_file:
    pickle.dump(doc_term_matrix, corpus_file)
  dictionary.save(f'{path}/dictionary.gensim')

  lda_model = LdaModel(
    doc_term_matrix,
    num_topics=num_topics,
    id2word=dictionary,
    passes=passes,
    random_state=random_state,
  )
  lda_model.save(f'{path}/model.gensim')

In [None]:
def display_lda_model(path, num_terms=10):
  """Load a stored LDA model and prepare its pyLDAvis visualization data.

  Args:
    path: Directory containing ``dictionary.gensim``, ``corpus.pkl`` and
      ``model.gensim``.
    num_terms: Passed to ``pyLDAvis.gensim.prepare`` as ``R``.

  Returns:
    The object returned by ``pyLDAvis.gensim.prepare``.
  """
  dictionary = gensim.corpora.Dictionary.load(f'{path}/dictionary.gensim')
  # NOTE: unpickling executes arbitrary code; only load corpus files that this
  # notebook produced itself. The context manager closes the file handle.
  with open(f'{path}/corpus.pkl', 'rb') as corpus_file:
    corpus = pickle.load(corpus_file)

  lda = LdaModel.load(f'{path}/model.gensim')
  lda_display = pyLDAvis.gensim.prepare(lda, corpus, dictionary, R=num_terms)

  return lda_display

In [None]:
# Subset of the survey restricted to the open-ended question columns.
open_ended_df = create_question_df(survey_df, open_ended_questions)

In [None]:
# Train and store one LDA model per open-ended question; the loop index is
# used as the name of the output folder.
for i, quest in enumerate(open_ended_df.columns):
  # dropna() skips every kind of missing value (None and any NaN), whereas the
  # identity test `is not np.nan` only recognizes the np.nan object itself.
  clean_text = [clean(answ).split() for answ in open_ended_df[quest].dropna()]
  compute_lda_model(i, clean_text)

## What do you study?

In [None]:
# Render the stored topic model from folder 0 (the first open-ended column).
pyLDAvis.display(display_lda_model('./data/open-ended/0'))