In [196]:
import pandas as pd

import gensim
import gensim #the library for Topic modelling
from gensim.models.ldamulticore import LdaMulticore
from gensim import corpora, models
import pyLDAvis.gensim #LDA visualization library

from nltk.corpus import stopwords
import string
from nltk.stem.wordnet import WordNetLemmatizer

import warnings
warnings.simplefilter('ignore')
from itertools import chain

# Data Preparation

In [197]:
def prepare_df(df):
    df["topic"] = df[df["text"].isin(topic)]

    df[["speaker", 'text']] = df["text"].str.split(":", expand=True)
    df["topic"] = df["topic"].fillna(method="ffill")
    df = df[df["speaker"].str.contains("P")]
    df = df[['speaker', 'topic', 'text']].reset_index(drop=True)
    
    return df.dropna()


def read_text():
    return pd.read_table("data/session-1-transcribed-annotated.txt",
                         header=None,
                         names=["text"]) 

def strip_spaces(df):
    df['text'] = df["text"].str.strip()
    return df

In [198]:
topic = ["USAGE", "TECHNOLOGY", "EMOTION", "COMFORT", "EXPERIENCE", "SATISFACTION"]

df = read_text()
df = prepare_df(df)
df = strip_spaces(df)

In [227]:
#clean the data
stop = set(stopwords.words('english'))
stop.update(["im", "yeah", "like", "yes", "maybe", "could", "also", "is", "mainly", "the", "sorry", "oh"])
exclude = set(string.punctuation)
stop.update(["..."])

lemma = WordNetLemmatizer()


def clean(text):
    punc_free = ''.join(ch for ch in text.lower() if ch not in exclude)
    stop_free = ' '.join([word for word in punc_free.lower().split() if word not in stop])
    normalized = ' '.join([lemma.lemmatize(word) for word in stop_free.split()])
    
    return normalized.split()

In [228]:
df["clean_text"] = df['text'].apply(clean)

In [229]:
def create_dictionary(df):
    dictionary = corpora.Dictionary(df['clean_text'])
    doc_term_matrix = [dictionary.doc2bow(doc) for doc in df['clean_text'] ]
    
    print("Total no of Non Zeros in BOW:", dictionary.num_nnz)
    print("Length of document matrix:", len(doc_term_matrix))

    return dictionary, doc_term_matrix

In [230]:
#create dictionary
dictionary, doc_term_matrix = create_dictionary(df)

Total no of Non Zeros in BOW: 1227
Length of document matrix: 121


In [231]:
lda = gensim.models.ldamodel.LdaModel
num_topics=16
ldamodel = lda(doc_term_matrix,
               num_topics=num_topics,
               id2word=dictionary,
               passes=50,
               minimum_probability=0)

In [232]:
# ldamodel.print_topics(num_topics=num_topics)

In [233]:
lda_display = pyLDAvis.gensim.prepare(ldamodel, doc_term_matrix, dictionary, sort_topics=False, mds='mmds')
pyLDAvis.display(lda_display)

# Articles in Relation to Clusters

In [206]:
lda_corpus = ldamodel[doc_term_matrix]

In [181]:
# [doc for doc in lda_corpus]

In [207]:
scores = list(chain(*[[score for topic_id,score in topic] \
                      for topic in [doc for doc in lda_corpus]]))

threshold = sum(scores)/len(scores)
print(threshold)

0.06250000038436299


In [208]:
df = df.reset_index(drop=False)

In [219]:
cluster1 = [j for i,j in zip(lda_corpus,df.index) if i[0][1] > threshold]
cluster2 = [j for i,j in zip(lda_corpus,df.index) if i[1][1] > threshold]
cluster3 = [j for i,j in zip(lda_corpus,df.index) if i[2][1] > threshold]
cluster4 = [j for i,j in zip(lda_corpus,df.index) if i[3][1] > threshold]
cluster5 = [j for i,j in zip(lda_corpus,df.index) if i[4][1] > threshold]
cluster6 = [j for i,j in zip(lda_corpus,df.index) if i[5][1] > threshold]
cluster7 = [j for i,j in zip(lda_corpus,df.index) if i[6][1] > threshold]
cluster8 = [j for i,j in zip(lda_corpus,df.index) if i[7][1] > threshold]
cluster9 = [j for i,j in zip(lda_corpus,df.index) if i[8][1] > threshold]
cluster10 = [j for i,j in zip(lda_corpus,df.index) if i[9][1] > threshold]
cluster11 = [j for i,j in zip(lda_corpus,df.index) if i[10][1] > threshold]
cluster12 = [j for i,j in zip(lda_corpus,df.index) if i[11][1] > threshold]
cluster13 = [j for i,j in zip(lda_corpus,df.index) if i[12][1] > threshold]
cluster14 = [j for i,j in zip(lda_corpus,df.index) if i[13][1] > threshold]
cluster15 = [j for i,j in zip(lda_corpus,df.index) if i[14][1] > threshold]
cluster16 = [j for i,j in zip(lda_corpus,df.index) if i[15][1] > threshold]

print(len(cluster1))
print(len(cluster2))
print(len(cluster3))
print(len(cluster4))
print(len(cluster5))
print(len(cluster6))
print(len(cluster7))
print(len(cluster8))
print(len(cluster9))
print(len(cluster10))
print(len(cluster11))
print(len(cluster12))
print(len(cluster13))
print(len(cluster14))
print(len(cluster15))
print(len(cluster16))

9
10
8
1
10
9
10
11
12
6
5
14
3
6
6
5


In [210]:
df.iloc[cluster1]

Unnamed: 0,index,speaker,topic,text,clean_text
10,10,P,TECHNOLOGY,"That's the only usage. Yeah, it's wonderful. H...","[thats, usage, wonderful, knew, many, people, ..."
17,17,P,EMOTION,"Yeah, work on project or something that's the ...","[work, project, something, thats, feeling, get..."
22,22,P,EMOTION,I actually feel a bit watched because of the b...,"[actually, feel, bit, watched, balcony, theyre..."
32,32,P*,COMFORT,Okay. There is also a little bit of stuffienss...,"[okay, little, bit, stuffienss, lot, there, li..."
39,40,P*,SATISFACTION,"I'm feeling restless with this face, because I...","[feeling, restless, face, feeling, anxiety, so..."
46,47,P,USAGE,"Yeah, then it's not a problem.",[problem]
69,70,P*,EMOTION,"Kind of separating the little... Isolating, ma...","[kind, separating, little, isolating]"
110,113,P*,COMFORT,It can also be like a very thin meshy kind of ...,"[thin, meshy, kind, fabric]"
120,125,P,SATISFACTION,I'm feeling familiar with this place and famil...,"[feeling, familiar, place, familiar, relaxing,..."


In [211]:
df.iloc[cluster2]

Unnamed: 0,index,speaker,topic,text,clean_text
0,0,P*,USAGE,Mainly groups working together as it's an open...,"[group, working, together, open, environment, ..."
4,4,P*,TECHNOLOGY,And on the floor? Yeah. And maybe I should go ...,"[floor, go, last, know, lot, technology, place]"
5,5,P*,TECHNOLOGY,First answer doesn't really look like there's ...,"[first, answer, doesnt, really, look, there, m..."
14,14,P*,EMOTION,"Well, I feel anxious because of the amount of ...","[well, feel, anxious, amount, background, nois..."
19,19,P,EMOTION,"yeah, I don't necessarily feel anxious but I d...","[dont, necessarily, feel, anxious, feel, distr..."
48,49,P,TECHNOLOGY,"A sensor beneath it wouldn't be that useful, b...","[sensor, beneath, wouldnt, useful, wouldnt, mo..."
90,93,P*,TECHNOLOGY,There is a motion sensor. Like the little bloc...,"[motion, sensor, little, block, end, cable, ri..."
91,94,P,TECHNOLOGY,It's looks like air freshener or something. Ye...,"[look, air, freshener, something, thats, somet..."
105,108,P,COMFORT,"Yeah, I could also not study here because ther...","[study, lecture, people, would, come, use, mac..."
112,115,P*,COMFORT,I do like that little boxy thing here. It feel...,"[little, boxy, thing, feel, homie, something, ..."


In [212]:
df.iloc[cluster3]

Unnamed: 0,index,speaker,topic,text,clean_text
2,2,P,USAGE,You could also use it for a lunch or something.,"[use, lunch, something]"
24,24,P,EMOTION,I like the plants here. Some are dying but sti...,"[plant, dying, still, add, something]"
36,37,P,EXPERIENCE,"For me, it's the lighting because it's really ...","[lighting, really, bright, room, dont, brightn..."
40,41,P,SATISFACTION,I'm also feeling content with this face right ...,"[feeling, content, face, right, even, noise, o..."
43,44,P,USAGE,I would also imagine that people just sit here...,"[would, imagine, people, sit, calm, separate, ..."
62,63,P,EMOTION,"Ah, alright. Maybe just more gray or something...","[ah, alright, gray, something, relaxing]"
73,74,P,COMFORT,And I think I'm distracted by it because I'm t...,"[think, distracted, thinking, use, coffee, rel..."
79,80,P,COMFORT,"Okay. Yeah, also the idea that noise is less.....","[okay, idea, noise, le, le, distracted, expose..."


In [213]:
df.iloc[cluster4]

Unnamed: 0,index,speaker,topic,text,clean_text
119,124,P,SATISFACTION,"Yeah, I just …the light…[from the artificial s...","[…the, light…from, artificial, spotlight, part..."


In [214]:
df.iloc[cluster5]

Unnamed: 0,index,speaker,topic,text,clean_text
8,8,P*,TECHNOLOGY,"Well, there are sensors underneath the tables ...","[well, sensor, underneath, table, see, people,..."
12,12,P*,TECHNOLOGY,Is there also something for like the daylight?...,"[something, daylight, thats, nice, building, r..."
20,20,P,EMOTION,"Yeah, for fooling around or something, this wo...","[fooling, around, something, would, fun]"
44,45,P,USAGE,"And I could also see myself work here, because...","[see, work, look, bit, outside, chair, large, ..."
53,54,P*,EMOTION,"Yes. You can feel... Calm. Again, though, but ...","[feel, calm, though, rest, something]"
65,66,P*,EMOTION,Because that's generally quite calming when it...,"[thats, generally, quite, calming, storm, blue..."
66,67,P*,EMOTION,"Again, I already asked these, but they identif...","[already, asked, identify, specific, object, m..."
85,88,P,USAGE,"I think it was in March, some like three month...","[think, march, three, month, ago, think, watch..."
93,96,P,TECHNOLOGY,I think it has a positive impact on your emoti...,"[think, positive, impact, emotion, well, feel,..."
116,121,P*,SATISFACTION,Cottage like. Cottage like. Which is cozy. But...,"[cottage, cottage, cozy, specific, type, cozy,..."


In [215]:
df.iloc[cluster6]

Unnamed: 0,index,speaker,topic,text,clean_text
13,13,P,TECHNOLOGY,There are also no blinds there.,[blind]
30,30,P,COMFORT,I would like the seating. I just have to watch...,"[would, seating, watch, valid, point, guess, t..."
34,35,P*,EXPERIENCE,Noise. Noise. Okay. Positive or negative? For ...,"[noise, noise, okay, positive, negative, negat..."
35,36,P,EXPERIENCE,I think the plants are really healthy. Okay. S...,"[think, plant, really, healthy, okay, plant, r..."
49,50,P,TECHNOLOGY,"There are probably some sensors, I guess, like...","[probably, sensor, guess, temperature, humidit..."
51,52,P,TECHNOLOGY,I hope it's also there. Fire sensor? Fire sens...,"[hope, fire, sensor, fire, sensor, see, saw, o..."
72,73,P*,COMFORT,"Yeah, I guess you. Okay.","[guess, okay]"
94,97,P,TECHNOLOGY,Yes. I am feeling happy. I guess.,"[feeling, happy, guess]"
113,116,P*,COMFORT,Why not like put some old journals and stuff i...,"[put, old, journal, stuff, make, homie, whatev..."
