In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import spacy
import gensim

paramiko missing, opening SSH/SCP/SFTP paths will be disabled.  `pip install paramiko` to suppress


In [2]:
data = pd.read_csv('only_translated.txt')

In [3]:
data = data['Translated'].dropna()
data.shape

(408602,)

In [4]:
import nltk
from nltk import FreqDist

In [5]:
def freq_words(x, terms = 30):
  all_words = ' '.join([text for text in x])
  all_words = all_words.split()
  
  fdist = FreqDist(all_words)
  words_df = pd.DataFrame({'word':list(fdist.keys()), 'count':list(fdist.values())})
  
  # selecting top 20 most frequent words
  d = words_df.nlargest(columns="count", n = terms) 
  plt.figure(figsize=(20,5))
  ax = sns.barplot(data=d, x= "word", y = "count")
  ax.set(ylabel = 'Count')
  plt.show()

In [6]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/marvin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
# function to remove stopwords
def remove_stopwords(rev):
  rev_new = " ".join([i for i in rev if i not in stop_words])
  return rev_new

In [8]:
# remove short words (length < 3)
data = data.apply(lambda x: ' '.join([w for w in x.split() if len(w)>2]))

# remove stopwords from the text
data = [remove_stopwords(r.split()) for r in data]

# make entire text lowercase
data = [r.lower() for r in data]

In [9]:
nlp = spacy.load('en', disable=['parser', 'ner'])
def lemmatization(texts, tags=['NOUN', 'ADJ']):
    output = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        output.append([token.lemma_ for token in doc if token.pos_ in tags])
    return output

In [10]:
tokenized_data = pd.Series(data).apply(lambda x: x.split())
print(tokenized_data[0])

['can', 'comments', 'used?', 'can', 'still', 'used', 'before,', 'obviously', 'now?']


In [11]:
data_2 = lemmatization(tokenized_data)
print(data_2[0])

['comment']


In [12]:
# de-tokenization
detokenized_doc = []
for i in range(len(data_2)):
    t = ' '.join(data_2[i])
    detokenized_doc.append(t)
    
data_2 = detokenized_doc

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(stop_words='english', 
                             max_features= 1000, # keep top 1000 terms 
                             max_df = 0.5, 
                             smooth_idf=True)

X = vectorizer.fit_transform(data_2)

X.shape # check shape of the document-term matrix

(408602, 1000)

In [14]:
from sklearn.decomposition import TruncatedSVD

# SVD represent documents and terms in vectors 
svd_model = TruncatedSVD(n_components=20, algorithm='randomized', n_iter=100, random_state=122)

svd_model.fit(X)

len(svd_model.components_)

20

In [15]:
terms = vectorizer.get_feature_names()

for i, comp in enumerate(svd_model.components_):
    terms_comp = zip(terms, comp)
    sorted_terms = sorted(terms_comp, key= lambda x:x[1], reverse=True)[:7]
    print("Topic "+str(i)+": ")
    for t in sorted_terms:
        print(t[0])
    print(" ")

Topic 0: 
good
app
application
map
navigation
great
gps
 
Topic 1: 
app
great
nice
excellent
useful
map
love
 
Topic 2: 
nice
good
aap
working
root
graphic
updating
 
Topic 3: 
excellent
application
nice
useful
map
thank
time
 
Topic 4: 
great
nice
excellent
application
good
help
program
 
Topic 5: 
helpful
map
application
useful
thank
easy
accurate
 
Topic 6: 
useful
application
map
use
time
easy
route
 
Topic 7: 
map
application
update
time
love
route
use
 
Topic 8: 
application
super
love
app
time
gps
update
 
Topic 9: 
love
waze
use
easy
time
traffic
accurate
 
Topic 10: 
super
love
useful
excellent
great
navigation
easy
 
Topic 11: 
time
use
update
bad
easy
route
gps
 
Topic 12: 
awesome
easy
use
thank
accurate
super
great
 
Topic 13: 
use
easy
comment
accurate
application
map
app
 
Topic 14: 
bad
easy
use
map
awesome
love
application
 
Topic 15: 
cool
bad
thank
program
thing
bro
lot
 
Topic 16: 
thank
bad
lot
time
route
road
traffic
 
Topic 17: 
update
gps
thank
location
signal
w

In [16]:
import umap

X_topics = svd_model.fit_transform(X)
embedding = umap.UMAP(n_neighbors=150, min_dist=0.5, random_state=12).fit_transform(X_topics)

plt.figure(figsize=(7,5))
plt.scatter(embedding[:, 0], embedding[:, 1], 
c = dataset.target,
s = 10, # size
edgecolor='none'
)
plt.show()

AttributeError: module 'umap' has no attribute 'UMAP'

In [17]:
# get document_topic_matrix with SVD
document_topic_matrix = svd_model.transform(X)

document_topic_matrix.shape #check shape of the document-topic matrix

(408602, 20)

(408648, 17)