# NLP - Topic Modelling, Aspect Identification

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import gensim
from gensim.models import LdaModel, LsiModel, HdpModel
from gensim.models.wrappers import LdaMallet
from gensim.corpora import Dictionary
import pyLDAvis.gensim


import os, re, operator, warnings
warnings.filterwarnings('ignore')  # Let's not pay heed to them right now
%matplotlib inline


In [6]:
!pwd

/Users/smiley/PycharmProjects/Travel_Insights/notebooks


## 1. Load Data

In [2]:
# Text Corpus
text_corpus=pd.read_csv("../data/text_corpus.csv")

# NLP Processed, TODO: move this to 1st step of data
nlp_df=pd.read_csv("../data/nlp_processed_corpus.csv")


comments_df=pd.read_csv("../data/grouped_comments.csv")

In [3]:
#cleaning and merge data earlier
text_corpus.head()  #grouped_by_submission dataframe

Unnamed: 0,submission_id,submission_title,text
0,8h6aao,Wife and I hate big social events and love tra...,Wife and I hate big social events and love tra...
1,95l2e6,The exact moment I took a step too close to th...,The exact moment I took a step too close to th...
2,8yj2tg,Wandering around Kyoto at night,Wandering around Kyoto at nightKyoto is amazin...
3,8i4939,I heard this place had stunning views but I ju...,I heard this place had stunning views but I ju...
4,85awza,Went to the top of the Eiffel Tower and there ...,Went to the top of the Eiffel Tower and there ...


In [70]:
#should be a cleaned version
text_corpus['text'][0][:1000]

"wife hate social event traveling. rather normal wedding traveled switzerland vow private. dayit's strange swear wife congratulation awful stacked crate flower arrangement top.i guess type antisocial show wedding. broke sentence middle helped delivery. great. box eh? hope bridesmaids... couple? fall cliff? stacked stack stacks? congratulation marriage bonerdude username out? outside box smart. wedding garbage waste money. traveling valuable. marriage legal country? relate joke brother terrible social anxiety sil ended canceling wedding courthouse instead. idea complementing sentence structure joke figure liner charm it! try. you. sturdy. built. pounding. handle lose grip. stack down. congratulation mr. mrs. bonerdude guest wedding. emptying wine crate chatting away. drunk uncle aunt grab quick dance. party. overflow bonerdude taking dude name keeping maiden name xxsniperxx? upvote upvotes moment modern they're hyphenating. mrs. xxdude sniperxx xxsniperxx redditor saving born resist urg

In [176]:
text_corpus['text'][1]

str

In [5]:
comments_df.head()  #grouped_by_comments dataframe

Unnamed: 0,comments_id,submission_id,comment
0,dyhi85d,8h6aao,"It's strange, but I swear you and your new wif..."
1,dyhfwly,8h6aao,"That is really nice, u/bonerdude420Is she taki..."
2,dyhdpyi,8h6aao,Taken near Murren in Oct 2016. Only people the...
3,dyhi3r7,8h6aao,Way back in 2002 before destination weddings w...
4,dyhfqeg,8h6aao,We did something very similar. Got [married ou...


In [71]:
comments_df['comment'][0][:1000]

"It's strange, but I swear you and your new wife (congratulations btw), look an awful lot like stacked crates with flower arrangements on top.I guess she's my type,Or they are so antisocial that they didn't show up for their own wedding.,The way you broke up the sentence in the middle really helped with the delivery. That was great.,Nice box, eh?,I hope she’s not the mountain :/,You should have seen the bridesmaids...,Where is the couple? Did they fall off the cliff?,stacked with stacks on stacks?,congratulations on your marriage bonerdude420. username may or may not check out?,Some people just can’t think outside the box,So smart.  Wedding are such a garbage waste of money.  Traveling is so much more valuable. Is the marriage legal in your home country?,I can relate to this,You joke but my brother has really terrible social anxiety and he and my now SIL ended up canceling their wedding and just went to the courthouse instead.,I really like this idea of complementing the good sentence 

In [None]:
#!python -m spacy download en_core_web_sm

## 2. Text Cleaning

In [7]:
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
stopWords = stopwords.words('english')
lt = WordNetLemmatizer()

In [44]:
#Topic modelling data cleaning 

def tokenize(text, lower=True):
    if lower:
        return [token.strip().lower() for token in text.split()]
    else:
        return [token.strip() for token in text.split()]

def remove_stop_words( text):
    return [word for word in text if word not in stopWords]

def lemmatize(text):
    return [(lt.lemmatize(x)) for x in text]

def clean_text(text,token=True):
    # regex to remove URL from string
    text = re.sub('http://\S+|https://\S+', '', text)
    # regex to remove special characters from string except [: ? ! . , ']
    text = re.sub(r"[^a-zA-Z?!'.]", " ", text)

    tokenized_text = tokenize(text)
    lemmatized_text = lemmatize(tokenized_text)
    cleaned_text =remove_stop_words(lemmatized_text)
    
    if token:
        return [val for val in cleaned_text]
    else:
        return ' '.join(token for token in cleaned_text)

In [45]:
# Find words frequency.
all_words = []
for i in text_corpus['text']:
    all_words.extend(i.split())
fdist = nltk.FreqDist(all_words)

In [46]:
#list of common words
word_list=[x[0] for x in fdist.most_common()]

In [47]:
stopWords.extend([ "[REMOVED]", "[deleted]", 'lol', 'wtf', 'ha' , 'wa', "i'd", "btw" ])

stopWords.extend(word_list[:100])

In [50]:
#cleaning text_corpus
text_corpus['text']=text_corpus['text'].apply(lambda x: clean_text(x, False))

In [51]:
text_corpus.head()

Unnamed: 0,submission_id,submission_title,text
0,8h6aao,Wife and I hate big social events and love tra...,wife hate social event traveling. rather norma...
1,95l2e6,The exact moment I took a step too close to th...,exact moment step close border korea push butt...
2,8yj2tg,Wandering around Kyoto at night,wandering kyoto nightkyoto amazing. moment urb...
3,8i4939,I heard this place had stunning views but I ju...,stunning prepared this. jaw dropped. lauterbru...
4,85awza,Went to the top of the Eiffel Tower and there ...,top eiffel tower happened rainbow parislooks t...


In [48]:
#cleaning comments corpus
comments_df['tokens']= comments_df['comment'].apply(lambda x : clean_text(x))

In [49]:
comments_df.head()

Unnamed: 0,comments_id,submission_id,comment,tokens
0,dyhi85d,8h6aao,"It's strange, but I swear you and your new wif...","[strange, swear, wife, congratulation, awful, ..."
1,dyhfwly,8h6aao,"That is really nice, u/bonerdude420Is she taki...","[bonerdude, taking, dude, name, keeping, maide..."
2,dyhdpyi,8h6aao,Taken near Murren in Oct 2016. Only people the...,"[taken, murren, oct, photographer, artist, cel..."
3,dyhi3r7,8h6aao,Way back in 2002 before destination weddings w...,"[destination, wedding, chose, wedding, date, b..."
4,dyhfqeg,8h6aao,We did something very similar. Got [married ou...,"[similar., married, outdoors, sea, finland, br..."


## 3 . Identifying Aspects

### Rule based matching with "Spacy Matcher"
Defining Tagging rules https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3888732/#sec2

Rules based on part of speech tags

Pattern first word second word third word

    Pattern 1 NN amod NNP
    Pattern 2 NN NN
    Pattern 3 NN GPE
    Pattern 4 VBG ROOT VBG
   
   
Example sentences:

1. ('kayak', 'NN', 'amod', 'island', 'NN'),
2. ('climbing', 'NN', 'xcomp', 'sobbing', 'VBG'),
3. ('honeymoon', 'VBP', 'ROOT', 'honeymoon', 'VBP'),
4. ('snorkeling', 'VBG', 'ROOT', 'snorkeling', 'VBG'),



In [28]:
import spacy
from spacy.matcher import Matcher
nlp = spacy.load('en_core_web_sm')


In [29]:
matcher = Matcher(nlp.vocab)

# Pattern
pattern1 = [{'TAG': 'NN' }, {'DEP': 'amod'}, {'TAG': 'NNP' }]
pattern2 = [{'TAG': 'NN' }, {'TAG': 'NN' }]
pattern3 = [{'TAG': 'NN' } , {'ENT_TYPE':'GPE'}]
pattern4 = [{'TAG': 'VBG' }, {'DEP' : 'ROOT'}, {'TAG':'VBG' }]

# This is a list of names of patterns used. If a new pattern is created add the name of pattern to this list
patternsP=[pattern1, pattern2, pattern3, pattern4]

# add all the patterns created above
for i in patternsP: 
    matcher.add(str(i),None,i) # adds pattern

In [92]:
# generate dictionary of most frequest phrases


# reason why I am adding a set is because there are multiple patterns which are matching
sent=set()

def generate_frequent_phrases(text,submission_id):
    """
    text - the text to be analyzed, 
    """
    doc=nlp(text)
    phrases=dict() 
    # generate matches
    matches = matcher(doc) 
    
    # capturing text from matcher
    for match_id, start, end in matches:      
        sent.add(doc[start:end])
    
    phrases[submission_id]=sent
    
    return phrases
    

In [93]:
# Generate Phrases
text_corpus['phrases']=text_corpus.apply(lambda x: generate_frequent_phrases(x.text, x.submission_id), axis=1)

In [97]:
#Phrases with submission id as key and values as a set of phrases
#phrases

In [94]:
text_corpus.head()

Unnamed: 0,submission_id,submission_title,text,phrases
0,8h6aao,Wife and I hate big social events and love tra...,wife hate social event traveling. rather norma...,"{'8h6aao': {(strip, life), (shrine, market), (..."
1,95l2e6,The exact moment I took a step too close to th...,exact moment step close border korea push butt...,"{'95l2e6': {(strip, life), (shrine, market), (..."
2,8yj2tg,Wandering around Kyoto at night,wandering kyoto nightkyoto amazing. moment urb...,"{'8yj2tg': {(strip, life), (shrine, market), (..."
3,8i4939,I heard this place had stunning views but I ju...,stunning prepared this. jaw dropped. lauterbru...,"{'8i4939': {(strip, life), (shrine, market), (..."
4,85awza,Went to the top of the Eiffel Tower and there ...,top eiffel tower happened rainbow parislooks t...,"{'85awza': {(strip, life), (shrine, market), (..."


In [34]:
#to check patterns of sentences manually
#doc=nlp(text_corpus['text'][1]) 
#[(token.text, token.tag_, token.dep_, token.head.text, token.head.tag_) for token in doc]

## 4 : Topic Modelling

In [190]:
comments_df.head()

Unnamed: 0,comments_id,submission_id,comment,tokens
0,dyhi85d,8h6aao,"It's strange, but I swear you and your new wif...","[strange, swear, wife, congratulation, awful, ..."
1,dyhfwly,8h6aao,"That is really nice, u/bonerdude420Is she taki...","[bonerdude, taking, dude, name, keeping, maide..."
2,dyhdpyi,8h6aao,Taken near Murren in Oct 2016. Only people the...,"[taken, murren, oct, photographer, artist, cel..."
3,dyhi3r7,8h6aao,Way back in 2002 before destination weddings w...,"[destination, wedding, chose, wedding, date, b..."
4,dyhfqeg,8h6aao,We did something very similar. Got [married ou...,"[similar., married, outdoors, sea, finland, br..."


In [None]:
bigram = gensim.models.Phrases(comments_df.query('submission_id=="8h6aao"')['tokens'])
texts = [bigram[line] for line in comments_df.query('submission_id=="8h6aao"')['tokens']]

dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

###  LSI 

LSI stands for Latent Semantic Indeixing - it is a popular information retreival method which works by decomposing the original matrix of words to maintain key topics. Gensim's implementation uses an SVD.

In [231]:
lsimodel = LsiModel(corpus=corpus, num_topics=5, id2word=dictionary)

In [510]:
lsimodel.show_topics(num_topics=2)  # Showing only the top 2 topics

[(0,
  '0.696*"wedding" + 0.153*"married" + 0.132*"ceremony" + 0.131*"wife" + 0.113*"super" + 0.102*"dinner" + 0.093*"wedding." + 0.091*"eloped" + 0.087*"care" + 0.081*"together"'),
 (1,
  '-0.368*"name" + -0.230*"creamyanus" + -0.182*"show" + -0.163*"future." + -0.162*"tradition" + -0.141*"change" + -0.136*"cooler" + -0.110*"wedding." + -0.094*"amount" + -0.091*"large"')]

### HDP 

HDP, the Hierarchical Dirichlet Process is an unsupervised topic model which figures out the number of topics on it's own.

In [196]:
hdpmodel = HdpModel(corpus=corpus, id2word=dictionary)

In [509]:
hdpmodel.show_topics(2)

[(0,
  "0.006*wedding + 0.004*others + 0.004*stranger + 0.003*you...it's + 0.003*week. + 0.003*angry + 0.003*lose + 0.003*stayed + 0.003*remove + 0.003*officiated + 0.003*station. + 0.003*sensible + 0.003*sticking + 0.002*mentality + 0.002*this! + 0.002*referred + 0.002*expect.better + 0.002*ha! + 0.002*portfolio + 0.002*soooooo"),
 (1,
  "0.005*wedding + 0.004*photo...the + 0.004*eloping. + 0.004*one.sounds + 0.004*stacks?it's + 0.004*candles.my + 0.003*community + 0.003*nights. + 0.003*hurt + 0.003*sea + 0.003*poor + 0.003*detail + 0.003*attitude + 0.003*whether + 0.003*weird + 0.003*her. + 0.003*married + 0.003*grab + 0.002*accept + 0.002*dinner")]

### LDA

LDA, or Latent Dirichlet Allocation is arguably the most famous topic modelling algorithm out there. Out here we create a simple topic model with 10 topics.

In [198]:
ldamodel = LdaModel(corpus=corpus, num_topics=5, id2word=dictionary)

In [200]:
for idx, topic in ldamodel.print_topics(-1):
    print("Topic: {} \nWords: {}".format(idx, topic ))
    print("\n")

Topic: 0 
Words: nan*"slightest" + nan*"so..." + nan*"riddled" + nan*"rule" + nan*"sense" + nan*"sig" + nan*"it'll" + nan*"trough" + nan*"std" + nan*"ha!"


Topic: 1 
Words: nan*"slightest" + nan*"so..." + nan*"riddled" + nan*"rule" + nan*"sense" + nan*"sig" + nan*"it'll" + nan*"trough" + nan*"std" + nan*"ha!"


Topic: 2 
Words: nan*"slightest" + nan*"so..." + nan*"riddled" + nan*"rule" + nan*"sense" + nan*"sig" + nan*"it'll" + nan*"trough" + nan*"std" + nan*"ha!"


Topic: 3 
Words: nan*"slightest" + nan*"so..." + nan*"riddled" + nan*"rule" + nan*"sense" + nan*"sig" + nan*"it'll" + nan*"trough" + nan*"std" + nan*"ha!"


Topic: 4 
Words: nan*"slightest" + nan*"so..." + nan*"riddled" + nan*"rule" + nan*"sense" + nan*"sig" + nan*"it'll" + nan*"trough" + nan*"std" + nan*"ha!"




### SVD & Tfidf vectorizer

In [203]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(stop_words=stopWords, 
                             max_features= 1000, # keep top 1000 terms 
                             max_df = 0.5, 
                             smooth_idf=True)

X = vectorizer.fit_transform(comments_df.query('submission_id=="8h6aao"')['comment'])

X.shape # check shape of the document-term matrix

(172, 1000)

In [204]:
from sklearn.decomposition import TruncatedSVD

# SVD represent documents and terms in vectors 
svd_model = TruncatedSVD(n_components=5, algorithm='randomized', n_iter=100, random_state=122)

svd_model.fit(X)

len(svd_model.components_)

5

In [205]:
terms = vectorizer.get_feature_names()

for i, comp in enumerate(svd_model.components_):
    terms_comp = zip(terms, comp)
    sorted_terms = sorted(terms_comp, key= lambda x:x[1], reverse=True)[:7]
    print("Topic "+str(i)+": ")
    for t in sorted_terms:
        print(t[0])
    print(" ")

Topic 0: 
wedding
friends
us
watching
weddings
married
share
 
Topic 1: 
watching
share
media
social
moment
giant
happen
 
Topic 2: 
shoot
honestly
watching
fun
selective
conflict
giants
 
Topic 3: 
sarcasm
drama
queen
fuck
taking
seriously
sarcastic
 
Topic 4: 
makeup
done
marrying
vain
women
professionally
invite
 


In [None]:
"""
# Train model and save the model results.

#pickle API for serializing standard Python objects
import pickle
pickle.dump(ldamodel, open("PATH/model/lda.pickle", "wb"))
loaded_model = pickle.load(open(filename, 'rb'))
result = loaded_model.score(X_test, Y_test)

#joblib API for efficiently serializing Python objects with NumPy arrays.
from sklearn.externals import joblib
joblib.dump(ldamodel, 'ldamodel.pkl')
loaded_model=open('ldamodel.pkl','rb')
topic_model = joblib.load(loaded_model)

"""

In [138]:
def preprocess(text):
    text_tokens= text.split()
    text_tokens=[i.strip().lower() for i in text_tokens]
    text_tokens=[[i for i in text_tokens if i not in stopWords]]
    return text_tokens

###  pyLDAvis

Using this library to visualise our topic models. What make pyLDAvis special is that it gives a UI to user (top right in below diagram) that allow to set the value of lambda which lambda values 0 - Negative and 1- Positive. One can see from best to worst topics

In [None]:
pyLDAvis.enable_notebook()
pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)

## 5. Best Topic Model - LSI

In [511]:

def generate_topic(df):
    bigram = gensim.models.Phrases(df)
    #texts
    texts = [bigram[line] for line in df]
    
    dictionary = Dictionary(texts)
    
    corpus = [dictionary.doc2bow(text) for text in texts]
    
    lsimodel = LsiModel(corpus=corpus, num_topics=5, id2word=dictionary)
    results=lsimodel.print_topics()
    
    topics=list()
    for i, topic in enumerate(results):
        out=[]
        for word in topic[1].split('+'): 
            res=word.split('*')
            out.append(( float(res[0]), res[1]))
        sorted_list=sorted(out, key=lambda x: x[0], reverse=True)
        topics.extend([i[1].strip()[1:-1]for i in sorted_list[:5]])
    return ','.join([i for i in topics])

In [512]:
topics_dict={"submission_id":[],"topics":[]}
topics_dict["submission_id"]=list(text_corpus.submission_id)

In [513]:
for sub_id in topics_dict["submission_id"]:
    topics_dict["topics"].append(generate_topic(comments_df.query('submission_id=="{0}"'.format(sub_id))['tokens']))

In [514]:
topic_df=pd.DataFrame(topics_dict)

In [515]:
topic_df.head(3)

Unnamed: 0,submission_id,topics
0,8h6aao,"wedding,married,ceremony,wife,super,name,cream..."
1,95l2e6,"nk,korean,guard,sk,border,called,auto,?,team,r..."
2,8yj2tg,"kyoto,safe,kyoto.,tokyo,japanese,us.,president..."


In [516]:
cnt=1
for i in range(0,len(a),5):    
    print(cnt, a[i:i+5])
    cnt+=1

1 ['wedding', 'married', 'ceremony', 'wife', 'super']
2 ['large', 'amount', 'wedding.', 'cooler', 'change']
3 ['paperwork', 'legal', 'care', 'piece_paper', 'italy']
4 ['wedding', 'parent', 'saying', 'married', 'fight']
5 ['wedding.', 'legal', 'australia', 'similar', 'spain']


In [517]:
topic_df.to_csv("./topics_by_submissionid.csv", index=False)

## Chiku's Part

In [67]:
#!/usr/bin/env python3
# Required libraries.
import pandas as pd
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import pickle
import gensim
import time
from gensim import corpora, models, similarities

import logging

In [None]:
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
stopWords = set(stopwords.words('english'))
lt = WordNetLemmatizer()

In [8]:
# Set up tokenizer.
tokenizer = RegexpTokenizer(r'\w+')

# Set up stop words.
stop = set(stopwords.words('english'))

# Set up stemmer.
p_stemmer = PorterStemmer()


In [9]:
def LDA_Clean_Text(review_main_text_list):
    # List for the review texts that are tokenized, stop word deleted and stemmed.
    cleaned_up_review_list = []
    # For every review.
    for document in review_main_text_list:
        # Use the lowercase of all letters.
        raw = document.lower()
        # Tokenization
        tokens = tokenizer.tokenize(raw)
        # Manually deleting stop words.
        j = 0

        while j < len(tokens):
            if tokens[j] in stop:
                del tokens[j]
            else:
                j += 1
        # Stem each word.
        #cleaned_text = [p_stemmer.stem(i) for i in tokens]
        cleaned_text = [i for i in tokens]
        # Add cleaned review text to list.
        cleaned_up_review_list.append(cleaned_text)

    return cleaned_up_review_list


In [38]:
def LDA():
    """
    Generate Topics from the reviews corpora
    :param reviews: Reviews is a dataset of text reviews for doctors based on location_id
    :return:
    """
    # Creating a list of all reviews
    #review_main_text_list = [review for review in df.query('submission_id=="8h6aao"')['comment']]
    review_main_text_list = [review for review in df.query('submission_id=="8h6aao"')['comment']]

    # Step 2: Cleaning the reviews text
    cleaned_up_review_list = LDA_Clean_Text(review_main_text_list)

    # Checking word frequency with NLTK
    all_words = []
    for i in cleaned_up_review_list:
        for j in i:
            all_words.append(j)

    # frequency distribution of words
    fdist = nltk.FreqDist(all_words)

    # Print the most common words.
    print(fdist.most_common())

    # Step3: Filter out the words that we want to ignore.
    #cleaned_up_review_list2 = LDA_words_to_ignore(cleaned_up_review_list)

    # Generate dictionary and corpus from the remaining words.
    dictionary = gensim.corpora.Dictionary(cleaned_up_review_list)
    corpus = [dictionary.doc2bow(word) for word in cleaned_up_review_list]

    # Step 4: Generate LDA Model
    # Setting parameters for LDA:
    no_of_topics = 15
    passes_in = 100
    ldamodel = Train_LDA(corpus, dictionary, no_of_topics, passes_in)

    # Step 5: Check resulting topics.
    topic_list = ldamodel.print_topics(num_topics=no_of_topics, num_words=25)

    for index, i in enumerate(topic_list):
        str1 = str(i[1])
        for c in "0123456789+*\".":
            str1 = str1.replace(c, "")
        str1 = str1.replace("  ", " ")
        print(str1)

In [42]:
def Train_LDA(corpus, dictionary, no_of_topics=15, passes_in=10):
    # Step 4: Train LDA model.
    ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=no_of_topics, id2word=dictionary, passes=passes_in,
                                               alpha='asymmetric')
    # Saving the results
    #pickle.dump(ldamodel, open("../model/lda3.pickle", "wb"))
    #pickle.dump(dictionary, open("../model/dictionary3.pickle", "wb"))
    #pickle.dump(corpus, open("../model/corpus3.pickle", "wb"))

    return ldamodel

In [36]:
type(stop)

set

In [None]:
stopWords.extend([[REMOVED], [deleted], 'lol', 'wtf', 'ha' , 'wa', "i'd", ])

stopWords.extend(word_list)

In [None]:
class CleaningText:
    def tokenize(self, text, lower=True):
        if lower:
            return [token.strip().lower() for token in text.split()]
        else:
            return [token.strip() for token in text.split()]

    def remove_stop_words(self, text):
        return [word for word in text if word not in stopWords]

    def lemmatize(self, text):
        return [(lt.lemmatize(x)) for x in text]

    def clean_text(self, text):
        # regex to remove URL from string
        text = re.sub('http://\S+|https://\S+', '', text)
        # regex to remove special characters from string except [: ? ! . , ']
        text = re.sub(r"[^a-zA-Z?!'.]", " ", text)

        tokenized_text = self.tokenize(text, True)
        lemmatized_text = self.lemmatize(tokenized_text)
        cleaned_text = self.remove_stop_words(lemmatized_text)

        return ' '.join(token for token in cleaned_text)

In [21]:
df = pd.read_csv("../data/aggregated_data/group_by_comments.csv")
# 8h6aao

In [22]:
df.submission_id[0]

'8h6aao'

In [28]:
def clean_data():
    # Step 1: Clean the input data
    # Loading the dataset
    df = pd.read_csv("../data/aggregated_data/group_by_comments.csv")
    print(df.columns)
    # Dropping unwanted column
    #data.drop(data.columns[[0]], axis=1, inplace=True)

    # Taking subset of columns
    #df = data[['Location', 'Description', 'Rating']]

    # drop na values
    df = df.dropna()
    # resetting index after deleting NA rows
    df = df.reset_index(drop=True)

    # Call the LDA Function
    LDA()

In [100]:
#clean_data()