In [2]:
#!pip install google-cloud-translate
#!pip install --upgrade google-cloud-translate --force-reinstall
#reset session
#!pip install requests
#reset session
#!pip install gensim

In [3]:
from google.cloud import translate                                                                                 #lib to use google translation api
from nltk.tokenize import word_tokenize                                                                            #lib to use tokenize function of nltk
from nltk.corpus import stopwords                                                                                  #lib to use stop_words corpus of nltk
from nltk.stem import PorterStemmer                                                                                #lib to use Porter stemmer for stemming
import pandas as pd                                                                                                #lib to work with dataframes

In [4]:
# Where the exported comments live and how many leading rows are analysed.
DATA_CSV_PATH = '/content/datalab/data_csv/data.csv'
N_COMMENTS = 573

data_raw = pd.read_csv(DATA_CSV_PATH)                                                                              #complete CSV, kept untouched
head = data_raw.head(N_COMMENTS)                                                                                   #first N_COMMENTS rows; their "Comment" column is what gets translated

In [5]:
translated_comments = []                                                                                              #API results, one entry appended per translate_text call
_translate_client = None                                                                                              #shared translation client, created on first use

def translate_text(text,target='en'):
    """
    Translate one comment and record the API result.

    The value returned by the translation client is appended to the
    module-level list ``translated_comments``; the function itself
    returns nothing.

    args: text: str, or UTF-8 encoded bytes, holding the comment
          target: str, default='en', passed to the API as target_language
    """
    global _translate_client
    # Only byte strings need decoding. Calling .decode() on text that is
    # already unicode fails (no such method on Python 3 str; implicit ASCII
    # encode on Python 2 unicode), so leave such values untouched.
    if isinstance(text, bytes):
        text = text.decode('utf-8')
    # Build the client once and reuse it instead of creating a new one for
    # every single comment.
    if _translate_client is None:
        _translate_client = translate.Client()
    result = _translate_client.translate(text,target_language=target)
    translated_comments.append(result)

In [6]:
# Call translate_text once per value of the "Comment" column. The call is made for its
# side effect (each result is appended to translated_comments); the Series returned by
# apply is bound to `_` and not used.
_ = head["Comment"].apply(translate_text)

In [7]:
# Turn the list of accumulated translation results into a DataFrame; the following
# cells read and overwrite its "translatedText" column.
temp = pd.DataFrame(translated_comments)

In [8]:
def tokenize(string):
  """
  Split the text of one comment into tokens using NLTK's word_tokenize.

  args: string: str, text of a single comment
  returns: whatever word_tokenize returns for that text
  """
  return word_tokenize(string)
temp["translatedText"] = temp["translatedText"].apply(tokenize)                                                       #each comment string is replaced by its tokens

In [9]:
def lower(strings):
  """
  Lower-case every token of one comment.

  args: strings: iterable of str (the tokens of a single comment)
  returns: list with the lower-cased tokens, in the same order
  """
  lowered = []
  for token in strings:
    lowered.append(token.lower())
  return lowered
# Overwrite the "translatedText" column with the lower-cased token lists.
temp["translatedText"] = temp["translatedText"].apply(lower)

In [10]:
stop_words = set(stopwords.words('english'))                                                                          #NLTK's English stop words, as a set

def stop_words_remove(strings):
  """
  Drop the tokens that are contained in the ``stop_words`` set.

  args: strings: iterable of str (the tokens of a single comment)
  returns: list of the tokens that are not stop words, order preserved
  """
  kept = []
  for token in strings:
    if token not in stop_words:
      kept.append(token)
  return kept
temp["translatedText"] = temp["translatedText"].apply(stop_words_remove)

In [11]:
# Hand-picked tokens removed in addition to the NLTK stop words: punctuation, emoji
# sequences and a few frequent words. Some entries are repeated (u'i', u'quiqu', u'');
# the repeats do not change the result of a membership test.
# NOTE(review): u'39' and u'quot' look like leftovers of HTML entities (&#39;, &quot;)
# in the translated text -- confirm against the raw API output.
my_stop_words = [u'@',u',',u'.',u'quiqu',u'🙃',u'😍😢',u'😘😍👌',u'💜',u'i',u'🇨🇱',u'🔝🔝🔝',u'?',u'😍😍😍😍👏👏👏👏',u'😔',u'#',u'❤😍',u'🇨🇦',u'😍🌹😍🌹😍🌹😍🌹😍🌹😍🌹😍🌹😍🌹😍🌹😍🌹😍🌹😍🌹😍🌹😍🌹😍🌹😍🌹',u'😍😍😍❤️',u'😊',u'😭😭❤️❤️',u'🤣',u'❤️👏🏼',u'🤤',u'💜💜💜',u'🔊',u'❤️💘',u'\ufe0f',u'👏👏',u'🔥🔥🔥🔥🔥',u'😍😍😍🔝🔝❤️',u'❄️🇨🇦❄️',u'👇🏼🙅',u';',u'✔',u'!',u'&',u'i',u'39',u'*',u'much',u'',u'💚',u'☹️',u'❤️🖤',u'give',u'know',u'like',u'see',u'mona❤️❤️❤️',u'jo',u'awwww😍',u'🌹',u'hermosas😍',u'...',u':',u'(', u')',u'..',u'😍',u'😍😍',u'😍😍😍',u'♥',u'❤️',u'😘',u'💖',u'😂',u'love',u'💙',u'beauti',u'❤️🔥',u'quot',u'❤️❤️❤️❤️',u'quiqu',u'😭😭',u'❤️❤️❤️',u'😍😍😍😍',u'❤',u'💕',u'i',u'8',u'',u'would',u'😍❤️',u'😭',u'😂😂']

def my_stop_words_remove(strings):
  """
  Drop the tokens that are listed in ``my_stop_words``.

  args: strings: iterable of str (the tokens of a single comment)
  returns: list of the tokens not found in my_stop_words, order preserved
  """
  # Look tokens up in a set built from the current list instead of scanning the
  # whole list (which also contains duplicates) once per token. Building it here
  # keeps it in sync with any later edit of my_stop_words.
  custom_stop_words = frozenset(my_stop_words)
  return [string for string in strings if string not in custom_stop_words]
# Overwrite the "translatedText" column with the token lists after custom stop-word removal.
# NOTE(review): my_stop_words has entries that look like Porter stems (e.g. u'beauti'),
# but the stemming cell comes after this one, so such entries may never match here --
# confirm the intended order of the two steps.
temp["translatedText"] = temp["translatedText"].apply(my_stop_words_remove)

In [13]:
ps = PorterStemmer()                                                                                                  #single stemmer instance shared by all calls
def stemmer(strings):
  """
  Apply the Porter stemmer ``ps`` to every token of one comment.

  args: strings: iterable of str (the tokens of a single comment)
  returns: list with the stemmed tokens, in the same order
  """
  stemmed = []
  for token in strings:
    stemmed.append(ps.stem(token))
  return stemmed
temp["translatedText"] = temp["translatedText"].apply(stemmer)

In [14]:
processed_comments = temp["translatedText"]                                                                            #token lists after all preprocessing cells; input to the topic models

In [31]:
import gensim                                                                                                          #lib to apply topic modelling                                               
import gensim.corpora.dictionary                                                                                       #lib to map between words and their integer ids
# Build the gensim Dictionary (word <-> integer id mapping) from the processed token lists
# and write it to 'dictionary.dict'; a later cell loads it back from that file.
dictionary = gensim.corpora.Dictionary(processed_comments)
dictionary.save('dictionary.dict')

In [33]:
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_comments]                                                   #bag-of-words representation of every comment
# Use the fully qualified name: the bare `corpora` alias is only imported in a later
# cell, so referring to it here raises NameError when the notebook is run top to bottom.
gensim.corpora.MmCorpus.serialize('corpus.mm', bow_corpus)                                                             #written to disk; a later cell reads 'corpus.mm' back

In [34]:
# LDA on the bag-of-words corpus. Training is randomised; pass a fixed seed so the topics
# (and the saved 'topic.model') do not change arbitrarily from run to run. With the
# multicore trainer some variation between runs may still remain.
lda_model_bow = gensim.models.LdaMulticore(bow_corpus, num_topics=10, id2word=dictionary, random_state=42)
lda_model_bow.save('topic.model')                                                                                      #a later cell loads the model back from this file

In [35]:
#for i in range(0, lda_model_bow.num_topics):                                                                           #printing different topics in the document
#  print lda_model_bow.print_topic(i) 
#lda_model_bow.show_topics()

In [36]:
from gensim import corpora, models                                                                                      #lib to import modules to use tf,idf models

tfidf = models.TfidfModel(bow_corpus)                                                                                   #TF-IDF weighting fitted on the bag-of-words corpus
corpus_tfidf = tfidf[bow_corpus]                                                                                        #the same corpus, re-weighted with TF-IDF
# LDA on the TF-IDF corpus. Training is randomised; a fixed seed keeps the topics from
# changing arbitrarily between runs (the multicore trainer may still vary slightly).
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=5, id2word=dictionary, random_state=42)

In [37]:
#for i in range(0, lda_model_tfidf.num_topics):                                                                          #printing different topics in the document
# print lda_model_tfidf.print_topic(i)
#lda_model_tfidf.show_topics()

In [41]:
#!pip install pyLDAvis
import pyLDAvis.gensim                                                                                                 #gensim adapter; its prepare() is called in a later cell
# Presumably switches pyLDAvis to render inside the notebook -- see the pyLDAvis docs.
pyLDAvis.enable_notebook()

In [43]:
# Reload the three artefacts that earlier cells wrote to disk and hand them to pyLDAvis.
d = gensim.corpora.Dictionary.load('dictionary.dict')                                                                  #dictionary saved from `dictionary`
c = gensim.corpora.MmCorpus('corpus.mm')                                                                               #bag-of-words corpus serialized from `bow_corpus`
lda = gensim.models.LdaModel.load('topic.model')                                                                       #model saved from `lda_model_bow` (an LdaMulticore instance)
data=pyLDAvis.gensim.prepare(lda,c,d)                                                                                  #visualisation data for the loaded model, corpus and dictionary

In [44]:
# Bare expression as the last line of the cell: the prepared pyLDAvis object is shown as the cell output.
data