In [1]:
import pandas as pd
import gensim #the library for Topic modelling
from gensim.models.ldamulticore import LdaMulticore
from gensim import corpora, models
import pyLDAvis.gensim #LDA visualization library

from nltk.corpus import stopwords
import string
from nltk.stem.wordnet import WordNetLemmatizer

import warnings
warnings.simplefilter('ignore')
from itertools import chain

In [2]:
def prepare_df(df):
    df["topic"] = df[df["text"].isin(topic)]

    df[["speaker", 'text']] = df["text"].str.split(":", expand=True)
    df["topic"] = df["topic"].fillna(method="ffill")
    df = df[df["speaker"].str.contains("P")]
    df = df[['speaker', 'topic', 'text']].reset_index(drop=True)
    
    return df.dropna()


def read_text():
    return pd.read_table("data/session-1-transcribed-annotated.txt",
                         header=None,
                         names=["text"]) 

def strip_spaces(df):
    df['text'] = df["text"].str.strip()
    return df

In [35]:
topic = ["USAGE", "TECHNOLOGY", "EMOTION", "COMFORT", "EXPERIENCE", "SATISFACTION"]

df = read_text()
df = prepare_df(df)
df = strip_spaces(df)

In [76]:
#clean the data
stop = set(stopwords.words('english'))
stop.update(["im", "yeah", "like", "yes", "maybe", "could", "also", "is", "mainly", "i'm"])
exclude = set(string.punctuation)
lemma = WordNetLemmatizer()


def clean(text):
    stop_free = ' '.join([word for word in text.lower().split() if word not in stop])
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
    normalized = ' '.join([lemma.lemmatize(word) for word in punc_free.split()])
    
    return normalized.split()

In [75]:
for word in df['text'][130].lower().split():
    if word not in stop:
        print(word)

feel
cozy.
chairs
staring
at,
kind
walls.
little
table.
think
cushions
comfortable.


In [77]:
df["clean_text"] = df['text'].apply(clean)

In [78]:
def create_dictionary(df):
    dictionary = corpora.Dictionary(df['clean_text'])
    doc_term_matrix = [dictionary.doc2bow(doc) for doc in df['clean_text'] ]
    
    print("Total no of Non Zeros in BOW:", dictionary.num_nnz)
    print("Length of document matrix:", len(doc_term_matrix))

    return dictionary, doc_term_matrix

In [79]:
df

Unnamed: 0,speaker,topic,text,clean_text
0,P*,USAGE,Mainly groups working together as it's an open...,"[group, working, together, open, environment, ..."
1,P*,USAGE,I also like quite a non-stressy work. Like jus...,"[quite, nonstressy, work, catchup, stuff, that]"
2,P,USAGE,You could also use it for a lunch or something.,"[use, lunch, something]"
3,P,TECHNOLOGY,How smart do you think this place is? How do y...,"[smart, think, place, is, think, technology, i..."
4,P,TECHNOLOGY,"It's not really smart, I think. There are some...","[really, smart, think, socket, know, corner, s..."
...,...,...,...,...
131,P*,SATISFACTION,Cottage like. Cottage like. Which is cozy. But...,"[cottage, like, cottage, like, cozy, specific,..."
132,P*,SATISFACTION,"I feel at peace, this space is I know Roomy.","[feel, peace, space, know, roomy]"
133,P,SATISFACTION,For me it's not comfortable so… uncomfortable.,"[comfortable, so…, uncomfortable]"
134,P,SATISFACTION,"Yeah, I just …the light…[from the artificial s...","[yeah, …the, light…from, artificial, spotlight..."


In [80]:
#create dictionary
dictionary, doc_term_matrix = create_dictionary(df)

Total no of Non Zeros in BOW: 1446
Length of document matrix: 130


In [84]:
lda = gensim.models.ldamodel.LdaModel
num_topics=3
ldamodel = lda(doc_term_matrix,
               num_topics=num_topics,
               id2word=dictionary,
               passes=50,
               minimum_probability=0)

In [85]:
ldamodel.print_topics(num_topics=3)

[(0,
  '0.035*"yeah" + 0.024*"space" + 0.018*"feel" + 0.015*"okay" + 0.012*"wall" + 0.010*"think" + 0.010*"feeling" + 0.010*"it" + 0.010*"yes" + 0.010*"one"'),
 (1,
  '0.035*"yeah" + 0.027*"think" + 0.020*"see" + 0.016*"space" + 0.016*"really" + 0.016*"know" + 0.014*"there" + 0.014*"bit" + 0.012*"people" + 0.012*"something"'),
 (2,
  '0.027*"feel" + 0.026*"think" + 0.021*"yeah" + 0.016*"bit" + 0.013*"people" + 0.013*"would" + 0.013*"kind" + 0.012*"place" + 0.011*"here" + 0.011*"nice"')]

In [86]:
lda_display = pyLDAvis.gensim.prepare(ldamodel, doc_term_matrix, dictionary, sort_topics=False, mds='mmds')
pyLDAvis.display(lda_display)