In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

import pickle
import matplotlib.pyplot as plt

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

from nltk.probability import FreqDist

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.decomposition import NMF
from sklearn.decomposition import PCA

from sklearn.manifold import TSNE

from sklearn.pipeline import Pipeline

from gensim import corpora, models, similarities, matutils

import re
import string

from wordcloud import WordCloud, STOPWORDS

from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors



# call glove2word2vec script
# default way (through CLI): python -m gensim.scripts.glove2word2vec --input <glove_file> --output <w2v_file>
from gensim.scripts.glove2word2vec import glove2word2vec

In [2]:
# Load the merged dataframe (presumably already cleaned, going by the file name).
# NOTE(review): unpickling can execute arbitrary code — only load pickle files
# that you produced yourself or otherwise trust.
merged_pickle_path = "./data/df_merged_cleaned.pkl"
df_merged = pd.read_pickle(merged_pickle_path)

### Tokenization

In [3]:
# English stop-word list from NLTK, held as a set.
# NOTE(review): no later cell visible here reads `stop_words` (the TF-IDF
# vectorizer is configured with stop_words="english" instead) — confirm it is needed.
stop_words = {*stopwords.words('english')}

In [4]:
# Run NLTK's word tokenizer over every entry of the `text` column and store the
# result per document in a new `tokenized_text` column.
raw_text = df_merged['text']
df_merged['tokenized_text'] = raw_text.map(word_tokenize)

### Lemmatization

In [5]:
lemmatizer = WordNetLemmatizer()


def _lemmatize_tokens(tokens):
    """Return a list with every token of ``tokens`` run through ``lemmatizer``.

    The lemmatizer is called with its default part-of-speech argument.
    """
    return list(map(lemmatizer.lemmatize, tokens))


# One lemmatized token list per document.
df_merged['lemmatized_text'] = df_merged['tokenized_text'].apply(_lemmatize_tokens)

In [8]:
# TF-IDF settings: unigrams only, scikit-learn's built-in English stop-word
# list, and document-frequency cut-offs given as proportions (0.05 and 0.35).
tfidf_settings = dict(
    stop_words="english",
    use_idf=True,
    ngram_range=(1, 1),
    min_df=0.05,
    max_df=0.35,
)
vectorizer_tfidf = TfidfVectorizer(**tfidf_settings)

In [10]:
# Collapse each lemmatized token list back into a single string (tokens
# separated by ', ') so the column can be passed to the text vectorizer.
df_merged['lemma_text_string'] = df_merged['lemmatized_text'].map(
    lambda tokens: ', '.join(tokens)
)

In [11]:
# Fit the vectorizer on the lemmatized documents and build their TF-IDF matrix
# in one step.
lemma_documents = df_merged['lemma_text_string']
tfidf = vectorizer_tfidf.fit_transform(lemma_documents)

In [12]:
number_topics = 20
lda = LatentDirichletAllocation(n_components=number_topics, random_state=0)

# NOTE(review): LDA is fitted on TF-IDF weights here. LDA is usually fed raw
# term counts (e.g. from CountVectorizer); the near-identical topics in this
# cell's saved output suggest that is worth trying — confirm before changing.
topics = lda.fit_transform(tfidf)

# Resolve the vocabulary once rather than once per listed word. Prefer the
# current accessor and fall back to the legacy one on older scikit-learn.
if hasattr(vectorizer_tfidf, "get_feature_names_out"):
    feature_names = list(vectorizer_tfidf.get_feature_names_out())
else:
    feature_names = list(vectorizer_tfidf.get_feature_names())

top_n_words = 10
t_words, word_strengths = {}, {}
for t_id, t in enumerate(lda.components_):
    # Indices of the `top_n_words` largest weights of this topic, strongest first.
    top_idx = t.argsort()[:-top_n_words - 1:-1]
    t_words[t_id] = [feature_names[i] for i in top_idx]
    word_strengths[t_id] = t[top_idx]
t_words

{0: ['man',
  'tear',
  'song',
  'awesome',
  'robot',
  'capture',
  'dance',
  'cause',
  'raise',
  'rain'],
 1: ['man',
  'tear',
  'song',
  'awesome',
  'robot',
  'capture',
  'dance',
  'cause',
  'raise',
  'rain'],
 2: ['man',
  'tear',
  'song',
  'awesome',
  'robot',
  'capture',
  'dance',
  'cause',
  'raise',
  'rain'],
 3: ['man',
  'tear',
  'song',
  'awesome',
  'robot',
  'capture',
  'dance',
  'cause',
  'raise',
  'rain'],
 4: ['man',
  'tear',
  'song',
  'awesome',
  'robot',
  'capture',
  'dance',
  'cause',
  'raise',
  'rain'],
 5: ['man',
  'tear',
  'song',
  'awesome',
  'robot',
  'capture',
  'dance',
  'cause',
  'raise',
  'rain'],
 6: ['man',
  'tear',
  'song',
  'awesome',
  'robot',
  'capture',
  'dance',
  'cause',
  'raise',
  'rain'],
 7: ['woman',
  'brain',
  'city',
  'kid',
  'data',
  'water',
  'design',
  'family',
  'community',
  'information'],
 8: ['man',
  'tear',
  'song',
  'awesome',
  'robot',
  'capture',
  'dance',
  'caus

In [13]:
number_topics = 20
pca = PCA(n_components=number_topics, random_state=0)

# PCA is given a dense array: densify the sparse TF-IDF matrix directly
# (a DataFrame round-trip adds nothing).
topics = pca.fit_transform(tfidf.toarray())

# Resolve the vocabulary once rather than once per listed word. Prefer the
# current accessor and fall back to the legacy one on older scikit-learn.
if hasattr(vectorizer_tfidf, "get_feature_names_out"):
    feature_names = list(vectorizer_tfidf.get_feature_names_out())
else:
    feature_names = list(vectorizer_tfidf.get_feature_names())

top_n_words = 10
t_words, word_strengths = {}, {}
for t_id, t in enumerate(pca.components_):
    # Indices of the `top_n_words` largest loadings of this component,
    # strongest first (large negative loadings are therefore not listed).
    top_idx = t.argsort()[:-top_n_words - 1:-1]
    t_words[t_id] = [feature_names[i] for i in top_idx]
    word_strengths[t_id] = t[top_idx]
t_words

{0: ['woman',
  'men',
  'girl',
  'family',
  'mother',
  'kid',
  'man',
  'young',
  'boy',
  'father'],
 1: ['brain',
  'cell',
  'cancer',
  'patient',
  'woman',
  'disease',
  'body',
  'drug',
  'gene',
  'doctor'],
 2: ['cancer',
  'cell',
  'patient',
  'disease',
  'health',
  'drug',
  'government',
  'africa',
  'dollar',
  'food'],
 3: ['woman',
  'water',
  'ocean',
  'planet',
  'earth',
  'animal',
  'sea',
  'men',
  'specie',
  'ice'],
 4: ['city',
  'building',
  'design',
  'cancer',
  'cell',
  'patient',
  'architecture',
  'community',
  'car',
  'neighborhood'],
 5: ['brain',
  'city',
  'woman',
  'men',
  'building',
  'space',
  'image',
  'region',
  'pattern',
  'architecture'],
 6: ['woman',
  'robot',
  'data',
  'men',
  'computer',
  'machine',
  'cell',
  'design',
  'cancer',
  'universe'],
 7: ['robot',
  'kid',
  'student',
  'food',
  'brain',
  'water',
  'animal',
  'machine',
  'teacher',
  'woman'],
 8: ['robot',
  'music',
  'city',
  'war',


In [14]:

number_topics = 20
nmf = NMF(n_components=number_topics, random_state=0)

# Factorize the TF-IDF matrix; `topics` holds the per-document topic weights.
topics = nmf.fit_transform(tfidf)

# Resolve the vocabulary once rather than once per listed word. Prefer the
# current accessor and fall back to the legacy one on older scikit-learn.
if hasattr(vectorizer_tfidf, "get_feature_names_out"):
    feature_names = list(vectorizer_tfidf.get_feature_names_out())
else:
    feature_names = list(vectorizer_tfidf.get_feature_names())

top_n_words = 10
t_words, word_strengths = {}, {}
for t_id, t in enumerate(nmf.components_):
    # Indices of the `top_n_words` largest weights of this topic, strongest first.
    top_idx = t.argsort()[:-top_n_words - 1:-1]
    t_words[t_id] = [feature_names[i] for i in top_idx]
    word_strengths[t_id] = t[top_idx]
t_words

{0: ['god',
  'man',
  'family',
  'father',
  'mother',
  'oh',
  'book',
  'felt',
  'night',
  'yeah'],
 1: ['woman',
  'men',
  'girl',
  'sex',
  'gender',
  'female',
  'boy',
  'male',
  'mother',
  'violence'],
 2: ['cell',
  'dna',
  'gene',
  'cancer',
  'molecule',
  'body',
  'drug',
  'protein',
  'organ',
  'blood'],
 3: ['company',
  'business',
  'dollar',
  'money',
  'market',
  'economy',
  'cost',
  'product',
  'value',
  'economic'],
 4: ['ocean',
  'fish',
  'sea',
  'animal',
  'ice',
  'specie',
  'island',
  'boat',
  'deep',
  'surface'],
 5: ['design',
  'building',
  'designer',
  'architecture',
  'material',
  'project',
  'architect',
  'designed',
  'space',
  'structure'],
 6: ['brain',
  'behavior',
  'memory',
  'body',
  'activity',
  'pattern',
  'signal',
  'arm',
  'study',
  'mental'],
 7: ['data',
  'information',
  'computer',
  'machine',
  'internet',
  'web',
  'phone',
  'algorithm',
  'digital',
  'online'],
 8: ['robot',
  'machine',
  '

In [19]:
# Chain the TF-IDF vectorizer and the NMF model so text can be mapped to topic
# weights with a single call. The very same objects used in the earlier cells
# are reused here; nothing is refitted by building the pipeline.
pipeline_steps = [('tfidf', vectorizer_tfidf), ('nmf', nmf)]
pipe = Pipeline(steps=pipeline_steps)

In [21]:
# Topic weights for every document, one column per topic. Each column is
# labelled with the string form of that topic's top-word list, so `t_words`
# must still be the dictionary built from the NMF components.
doc_topic_weights = pipe.transform(df_merged['lemma_text_string'])
topic_labels = [str(t_words[topic_id]) for topic_id in range(number_topics)]
df_topics = pd.DataFrame(doc_topic_weights, columns=topic_labels)
df_topics.head(50)

Unnamed: 0,"['god', 'man', 'family', 'father', 'mother', 'oh', 'book', 'felt', 'night', 'yeah']","['woman', 'men', 'girl', 'sex', 'gender', 'female', 'boy', 'male', 'mother', 'violence']","['cell', 'dna', 'gene', 'cancer', 'molecule', 'body', 'drug', 'protein', 'organ', 'blood']","['company', 'business', 'dollar', 'money', 'market', 'economy', 'cost', 'product', 'value', 'economic']","['ocean', 'fish', 'sea', 'animal', 'ice', 'specie', 'island', 'boat', 'deep', 'surface']","['design', 'building', 'designer', 'architecture', 'material', 'project', 'architect', 'designed', 'space', 'structure']","['brain', 'behavior', 'memory', 'body', 'activity', 'pattern', 'signal', 'arm', 'study', 'mental']","['data', 'information', 'computer', 'machine', 'internet', 'web', 'phone', 'algorithm', 'digital', 'online']","['robot', 'machine', 'leg', 'video', 'body', 'intelligence', 'computer', 'animal', 'artificial', 'build']","['government', 'war', 'democracy', 'political', 'society', 'american', 'election', 'conflict', 'group', 'social']","['patient', 'cancer', 'disease', 'health', 'doctor', 'drug', 'medical', 'care', 'treatment', 'hospital']","['universe', 'planet', 'earth', 'star', 'space', 'light', 'sun', 'solar', 'science', 'billion']","['music', 'sound', 'play', 'song', 'hear', 'instrument', 'playing', 'video', 'piece', 'game']","['kid', 'teacher', 'student', 'education', 'learning', 'classroom', 'class', 'teach', 'teaching', 'learn']","['city', 'building', 'street', 'community', 'neighborhood', 'urban', 'space', 'map', 'york', 'park']","['africa', 'african', 'aid', 'continent', 'south', 'poverty', 'india', 'family', 'china', 'leader']","['plant', 'food', 'forest', 'specie', 'animal', 'tree', 'nature', 'eat', 'gene', 'farmer']","['water', 'energy', 'oil', 'climate', 'fuel', 'material', 'ice', 'river', 'waste', 'solar']","['art', 'image', 'artist', 'painting', 'book', 'film', 'museum', 'object', 'project', 'camera']","['car', 'ca', 'vehicle', 'mile', 'driver', 'driving', 'road', 
'drive', 'traffic', 'yeah']"
0,0.043847,0.041454,0.0,0.005237,0.0,0.0,0.019192,0.0,0.0,0.0,0.0,0.018818,0.030956,0.117025,0.0,0.001936,0.009568,8.8e-05,0.048016,0.0
1,0.02645,0.0,0.0,0.062064,0.005586,0.023594,0.0,0.0,0.0,0.035505,0.0,0.0,0.004789,0.0,0.015882,0.009697,0.015714,0.051441,0.006068,0.084008
2,0.044188,0.0,0.010444,0.050816,0.0,0.06126,0.0,0.055219,0.000235,0.0,0.0,0.0,0.018117,0.0,0.0,0.0,0.0,0.0,0.004695,0.025782
3,0.009847,0.010055,0.0,0.04819,0.0,0.015242,0.0,0.0,0.0,0.029085,0.009656,0.0,0.0,0.018456,0.153865,0.030069,0.056835,0.051259,0.001957,0.016891
4,0.0,0.009682,0.0,0.034406,0.0,0.0,0.0,0.108806,0.0,0.000855,0.019971,0.0,0.0,0.016529,0.000631,0.26977,0.0,0.0,0.0,0.0
5,0.068339,0.021258,0.001942,0.047076,0.0,0.002904,0.024677,0.017975,0.000798,0.011896,0.002759,0.0,0.012128,0.004368,0.001384,0.002244,0.014573,0.0,0.019245,0.002724
6,0.124439,0.054101,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.022187,0.0,0.0
7,0.0,0.0,0.0,0.017774,0.0,0.147667,0.0,0.0,0.0,0.009163,0.0,0.007257,0.0,0.005149,0.049743,0.0,0.0,0.002065,0.102495,0.0
8,0.046722,0.0,0.003199,0.0,0.0,0.064384,0.037011,0.0,0.0,0.03746,0.0,0.038178,0.005892,0.0,0.0,0.0,0.022598,0.0,0.010445,0.0
9,0.073558,0.0,0.0,0.045047,0.0,0.0,0.000326,0.0,0.0,0.007961,0.00195,0.011264,0.0,0.014879,0.0,0.008434,0.001864,0.0,0.027345,0.0


In [22]:
# For every document (row), take the label of the column holding the largest
# topic weight.
best_topics = df_topics.idxmax(axis="columns")
best_topics

0       ['kid', 'teacher', 'student', 'education', 'le...
1       ['car', 'ca', 'vehicle', 'mile', 'driver', 'dr...
2       ['design', 'building', 'designer', 'architectu...
3       ['city', 'building', 'street', 'community', 'n...
4       ['africa', 'african', 'aid', 'continent', 'sou...
5       ['god', 'man', 'family', 'father', 'mother', '...
6       ['god', 'man', 'family', 'father', 'mother', '...
7       ['design', 'building', 'designer', 'architectu...
8       ['design', 'building', 'designer', 'architectu...
9       ['god', 'man', 'family', 'father', 'mother', '...
10      ['design', 'building', 'designer', 'architectu...
11      ['art', 'image', 'artist', 'painting', 'book',...
12      ['patient', 'cancer', 'disease', 'health', 'do...
13      ['data', 'information', 'computer', 'machine',...
14      ['kid', 'teacher', 'student', 'education', 'le...
15      ['design', 'building', 'designer', 'architectu...
16      ['music', 'sound', 'play', 'song', 'hear', 'in...
17      ['wate

In [23]:
# Wrap the winning topic labels in a DataFrame (the saved output below shows a
# single column named 0).
best_topics = pd.DataFrame(data=best_topics)

In [24]:
# Preview the first three rows.
best_topics.iloc[:3]

Unnamed: 0,0
0,"['kid', 'teacher', 'student', 'education', 'le..."
1,"['car', 'ca', 'vehicle', 'mile', 'driver', 'dr..."
2,"['design', 'building', 'designer', 'architectu..."


In [26]:
# best_topics[0] = best_topics[0].map(punc_lower).map(alphanumeric)

In [None]:
# best_topics[0] = best_topics[0].apply(lambda x: x.split(' '))

In [27]:
# best_topics.head(5)

In [28]:
# best_topics[0] = best_topics[0].apply(lambda x: [s.strip() for s in x])

In [29]:
# model[best_topics[0][0]]

In [None]:
# Positions 0..2466 used to walk over the documents.
# NOTE(review): 2467 is hard-coded; presumably the number of rows in
# `best_topics` — confirm, or derive it from the data.
l = [*range(2467)]

In [None]:
import ast  # cell-local import: parses the stringified topic labels below

# NOTE(review): `model` is not defined in any visible cell; presumably it is the
# gensim KeyedVectors built from the GloVe file — make sure it is loaded first.

# Each entry of best_topics[0] is the *string* form of a topic's top-word list
# (e.g. "['kid', 'teacher', ...]"). Indexing the embedding model with that whole
# string would be treated as one unknown key, so recover the word list first.
topic_vectors = []
for label in best_topics[0]:
    words = ast.literal_eval(label) if isinstance(label, str) else list(label)
    # Average the word vectors: one embedding per document's best topic.
    topic_vectors.append(np.mean(model[words], axis=0))

# Resulting shape is (number of documents, embedding size); both are taken from
# the data instead of being hard-coded.
hello = np.array(topic_vectors)

In [None]:
# Convert the stacked topic embeddings to a DataFrame (one row per document).
hello = pd.DataFrame(data=hello)

In [None]:
# First document's embedding as a 1-D NumPy array. A Series is already 1-D, so
# `to_numpy()` replaces the deprecated `Series.ravel()`.
hello.iloc[0].to_numpy()

In [None]:
# Look up the entries of `model` closest to the first document's topic
# embedding. `to_numpy()` replaces the deprecated `Series.ravel()`.
# NOTE(review): `model` is not defined in the visible cells — presumably the
# gensim KeyedVectors; confirm it is loaded before this cell runs.
model.similar_by_vector(hello.iloc[0].to_numpy())

In [None]:
# Display the `vectors` attribute of `model` (presumably the embedding matrix of
# a gensim KeyedVectors; `model` is not defined in the visible cells — confirm).
model.vectors

In [None]:
# The pipeline's vectorizer works on one string per document, whereas
# `tokenized_text` holds token lists, so join the tokens back together first.
tokenized_documents = df_merged['tokenized_text'].apply(' '.join)
df_topics_token = pipe.transform(tokenized_documents)
# The transformed matrix has one column per NMF component, so every one of the
# `number_topics` topics needs a label.
topic_labels = [str(t_words[i]) for i in range(number_topics)]
df_topics_token = pd.DataFrame(df_topics_token, columns=topic_labels)
df_topics_token.head()