In [None]:
%%capture
import sys
!{sys.executable} -m pip install nltk
!{sys.executable} -m pip install gensim
!{sys.executable} -m pip install pyLDAvis
!{sys.executable} -m pip install picky

In [None]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import gensim
import json
import nltk
import pickle
import pyLDAvis
import pyLDAvis.gensim
import string

from plotly import graph_objs as go
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected = True)

from nltk.sentiment.util import demo_liu_hu_lexicon
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import opinion_lexicon, stopwords, sentiwordnet as swn
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

from gensim import corpora
from gensim.models import CoherenceModel
from gensim.models.ldamodel import LdaModel

from pprint import pprint

In [None]:
# NLTK data used further down in the notebook:
#   wordnet         -> WordNetLemmatizer
#   opinion_lexicon -> positive / negative word lists
#   stopwords       -> English stopword list
#   punkt           -> tokenizer models needed by word_tokenize
#   punkt_tab       -> the name newer NLTK releases look up for the same models;
#                      on older releases this download just reports an unknown id
for resource in ('wordnet', 'opinion_lexicon', 'stopwords', 'punkt', 'punkt_tab'):
  nltk.download(resource)

pyLDAvis.enable_notebook()

In [None]:
def create_dir(path, name):
  """Make sure the directory `<path>/<name>` exists and return its path.

  Args:
    path: Parent directory (created as well if it is missing).
    name: Name of the sub-directory; anything that can be formatted into a
      string works (the notebook passes integer question ids).

  Returns:
    The string f'{path}/{name}'.

  Raises:
    OSError: if the directory cannot be created, e.g. because a regular file
      with that name already exists or permissions are missing. The error is
      propagated instead of calling exit(), which would terminate the kernel
      and discard the traceback.
  """
  import os

  target = f'{path}/{name}'
  # exist_ok=True makes the call idempotent and avoids the race between a
  # separate "exists?" check and the actual creation.
  os.makedirs(target, exist_ok=True)

  return target

In [None]:
# Prepare the output folder for the open-ended analysis artefacts, then load
# the raw survey export into a DataFrame.
create_dir(path='./data', name='open-ended')
survey_df = pd.read_csv(filepath_or_buffer='./data/MSD Survey.csv')

In [None]:
# Closed-ended survey questions, keyed by their position (0-based).
# The texts double as column names of `survey_df`, so they must stay verbatim.
ce_questions = dict(enumerate([
  "Since how many semesters have you been studying?",
  "How many hours per week do you spend on a homework?",
  "Are you satisfied with the topics taught in the lecture?",
  "Do you intend to use the topics you learned outside of the course? [Descriptive Statistics]",
  "Do you intend to use the topics you learned outside of the course? [Data Mining]",
  "Do you intend to use the topics you learned outside of the course? [Explorative Analysis]",
  "Do you intend to use the topics you learned outside of the course? [Prediction model]",
  "Do you intend to use the topics you learned outside of the course? [Natural Language Processing]",
  "Do you intend to use the topics you learned outside of the course? [Python]",
  "Do you intend to use the topics you learned outside of the course? [Jupyter Notebook]",
  "Do you intend to use the topics you learned outside of the course? [Experimentation]",
  "Do you intend to use the topics you learned outside of the course? [Surveys]",
  "Rate the difficulty of the lecture",
  "Would you attend this course again?",
  "Would you recommend this course to others?",
]))

# Open-ended (free-text) survey questions, keyed by their position (0-based).
# The texts double as column names of `survey_df`; trailing and doubled spaces
# are part of those names and are kept on purpose.
oe_questions = dict(enumerate([
  "What do you study?",
  "What is your profession?",
  "Why did you choose the lecture MSD? ",
  "What did you miss in the lecture? ",
  "Reflect on the repository mining project (topic selection, were your expectations fulfilled, learnings, etc.)",
  "Why do you think that the weekly exercise sheets were a good preparation for the projects or not?",
  "What kind of extra material did you use to solve the weekly homeworks and the project?",
  "What did you like in the course?",
  "What did you not like in the course?",
  "How do you agree with the  following statement: The topics taught in the lecture can be applied in various application areas",
  "What would you suggest for the improvement of the lecture?",
]))

## Closed-Ended Questions Analysis

In [None]:
def plot_ce_question_stats(question):
  """Draw a pie chart of the answer frequencies for one survey question.

  `question` is used both as the `survey_df` column to read and as the title
  of the chart.
  """
  # value_counts().reset_index() yields a two-column table:
  # column 0 holds the distinct answers, column 1 how often each was given.
  answer_table = survey_df[question].value_counts().reset_index().values

  pie_trace = dict(values=answer_table[:, 1], labels=answer_table[:, 0], type="pie")
  iplot(dict(data=[pie_trace], layout=dict(title=question)))

In [None]:
# One pie chart per closed-ended question, in questionnaire order.
for q, ce_question in ce_questions.items():
  plot_ce_question_stats(ce_question)

# Topic Modeling

In [None]:
stop = set(stopwords.words('english'))
exclude = set(string.punctuation)
lemma = WordNetLemmatizer()
stemmer = PorterStemmer()


def clean(text):
  """Normalise one free-text answer into a space-separated string of tokens.

  Per whitespace-separated token the steps are:
    1. lower-case,
    2. trim leading/trailing punctuation,
    3. drop the token if it is an English stopword,
    4. remove any punctuation left inside the token,
    5. lemmatize, then Porter-stem.

  Stemming is applied per token. Passing the whole answer to the stemmer at
  once makes it treat the text as a single word, so at most the suffix of the
  last word would be altered.

  Args:
    text: The answer as a string.

  Returns:
    The surviving tokens joined by single spaces ('' if none survive).
  """
  tokens = []
  for raw in text.lower().split():
    # Trim surrounding punctuation before the stopword test so that tokens
    # such as "the," are recognised, while contractions like "don't" still
    # match their stopword entry.
    word = raw.strip(string.punctuation)
    if not word or word in stop:
      continue

    word = ''.join(ch for ch in word if ch not in exclude)
    if word:
      tokens.append(stemmer.stem(lemma.lemmatize(word)))

  return ' '.join(tokens)

In [None]:
def compute_lda_model(question_id, answers, num_topics=3, passes=50, random_state=42):
  """Fit an LDA topic model on tokenised answers and store it on disk.

  Three artefacts are written to ./data/open-ended/<question_id>/:
  `corpus.pkl` (pickled bag-of-words corpus), `dictionary.gensim` and
  `model.gensim`.

  Args:
    question_id: Identifier used as the name of the output sub-directory.
    answers: Iterable of token lists, one list per answer.
    num_topics: Number of topics passed to LdaModel.
    passes: Number of training passes passed to LdaModel.
    random_state: Seed passed to LdaModel so that repeated runs give the same
      topics; pass None for a non-deterministic fit.
  """
  path = create_dir('./data/open-ended', question_id)

  # Materialise once: the answers are traversed twice below, which would
  # silently produce an empty corpus for a one-shot iterator.
  answers = list(answers)

  dictionary = corpora.Dictionary(answers)
  doc_term_matrix = [dictionary.doc2bow(answ) for answ in answers]

  # Context manager so the file handle is closed (and flushed) deterministically.
  with open(f'{path}/corpus.pkl', 'wb') as corpus_file:
    pickle.dump(doc_term_matrix, corpus_file)
  dictionary.save(f'{path}/dictionary.gensim')

  lda_model = LdaModel(doc_term_matrix, num_topics=num_topics, id2word=dictionary,
                       passes=passes, random_state=random_state)
  lda_model.save(f'{path}/model.gensim')

In [None]:
def display_lda_model(path, num_terms=10):
  """Load a stored LDA model and prepare its pyLDAvis visualisation data.

  Args:
    path: Directory containing `dictionary.gensim`, `corpus.pkl` and
      `model.gensim`.
    num_terms: Forwarded to pyLDAvis as `R`.

  Returns:
    The object returned by `pyLDAvis.gensim.prepare`.
  """
  dictionary = gensim.corpora.Dictionary.load(f'{path}/dictionary.gensim')

  # NOTE: pickle can execute arbitrary code while loading; only point this at
  # corpus files produced by this notebook.
  with open(f'{path}/corpus.pkl', 'rb') as corpus_file:
    corpus = pickle.load(corpus_file)

  lda = LdaModel.load(f'{path}/model.gensim')
  # NOTE(review): newer pyLDAvis releases appear to expose this module as
  # `pyLDAvis.gensim_models` - confirm against the installed version.
  lda_display = pyLDAvis.gensim.prepare(lda, corpus, dictionary, R=num_terms)

  return lda_display

In [None]:
# Fit and store one topic model per open-ended question.
for i in oe_questions:
  # dropna() removes every kind of missing value; an identity test against
  # np.nan only catches that one singleton object. str() guards against
  # columns that pandas parsed as non-text.
  clean_text = [clean(str(answ)).split() for answ in survey_df[oe_questions[i]].dropna()]
  compute_lda_model(i, clean_text)

#### What do you study?

In [None]:
# Interactive topic view for question 0, loaded from the artefacts saved above.
pyLDAvis.display(display_lda_model(path='./data/open-ended/0'))

# Sentiment Analysis

In [None]:
def sentiment_obj(pos, neu, neg):
  """Bundle three sentiment scores into a dict keyed positive/neutral/negative."""
  return dict(positive=pos, neutral=neu, negative=neg)

# Hand-picked cue words that extend the general-purpose opinion lexicon.
# Only their Porter-stemmed forms are kept.
positive_vocab = list(map(stemmer.stem, [
  'interesting', 'informative', 'satisfied', 'learned', 'good', 'nice',
  'great', 'awesome', 'well', 'fantastic', 'enjoy', 'agree',
]))
negative_vocab = list(map(stemmer.stem, [
  'difficult', 'bad', 'terrible', 'useless', 'hate', 'long', 'boring',
]))

# Final lookup sets: lexicon entries plus the stemmed cue words.
positive_words = set(opinion_lexicon.positive()) | set(positive_vocab)
negative_words = set(opinion_lexicon.negative()) | set(negative_vocab)

In [None]:
def sentiment_of(text):
  """Score one answer as shares of positive, neutral and negative tokens.

  The answer is cleaned with `clean`, tokenised with `word_tokenize`, and each
  token is looked up in `positive_words` / `negative_words` (positive wins if
  a token is in both). Every other token counts as neutral.

  Args:
    text: The answer; missing values (NaN / None) are accepted.

  Returns:
    A `sentiment_obj` dict whose values are the three shares rounded to four
    decimals. Missing answers and answers with no tokens left after cleaning
    yield all zeros.
  """
  if not isinstance(text, str):
    # pd.isna covers None and every NaN object, not only the np.nan singleton.
    if pd.isna(text):
      return sentiment_obj(0, 0, 0)
    text = str(text)

  # Terminate the answer with a full stop before it is cleaned.
  if not text.endswith('.'):
    text += '.'

  tokenized = word_tokenize(clean(text))
  total = len(tokenized)
  if total == 0:
    # Nothing survived cleaning (e.g. the answer was only stopwords or
    # punctuation); avoid dividing by zero below.
    return sentiment_obj(0, 0, 0)

  pos = sum(1 for t in tokenized if t in positive_words)
  neg = sum(1 for t in tokenized if t not in positive_words and t in negative_words)
  neu = total - pos - neg

  return sentiment_obj(round(pos / total, 4),
                       round(neu / total, 4),
                       round(neg / total, 4))

In [None]:
def sentiment_of_answers(answers):
  """Average the per-answer sentiment shares over all `answers`.

  Each answer is scored with `sentiment_of`; the positive, neutral and
  negative shares are then averaged separately and rounded to four decimals.
  """
  scores = [sentiment_of(answer) for answer in answers]

  def mean_share(label):
    # Mean of one sentiment class across every scored answer.
    return round(np.mean([score[label] for score in scores]), 4)

  return sentiment_obj(mean_share('positive'),
                       mean_share('neutral'),
                       mean_share('negative'))

In [None]:
def store_sentiments():
  """Compute the mean sentiment per open-ended question and save it as JSON.

  The result maps each question text to its `sentiment_of_answers` dict; it is
  written to ./data/sentiments.json and also returned.
  """
  sentiments = {
    question: sentiment_of_answers(survey_df[question])
    for question in oe_questions.values()
  }

  with open('./data/sentiments.json', 'w+', encoding='utf-8') as out_file:
    json.dump(sentiments, out_file, indent=2)

  return sentiments

## The mean sentiments of answers

In [None]:
# Compute, persist and pretty-print the mean sentiment for every open-ended question.
pprint(object=store_sentiments())