In [53]:
import re
import pandas as pd
import numpy as np
import nltk
from time import time
import seaborn as sns

In [2]:
df_all = pd.read_pickle('./data/df_all_lemma.pkl')
print(len(df_all))
df_all.head(1)

2524


Unnamed: 0,comments,description,duration,event,film_date,languages,link,main_speaker,name,num_speaker,published_date,ratings,related_talks,speaker_occupation,tags,title,transcript,url,views
0,4553.0,Sir Ken Robinson makes an entertaining and pro...,1164.0,TED2006,1140826000.0,60.0,,Ken Robinson,Ken Robinson: Do schools kill creativity?,1.0,1151367000.0,"[{'id': 7, 'name': 'Funny', 'count': 19645}, {...","[{'id': 865, 'hero': 'https://pe.tedcdn.com/im...",Author/educator,"['children', 'creativity', 'culture', 'dance',...",Do schools kill creativity?,it been great hasnt it. ive been blown away by...,https://www.ted.com/talks/ken_robinson_says_sc...,47227110.0


In [55]:
docs_all = list(df_all['transcript'])

from nltk.corpus import stopwords

stops_standard = stopwords.words('english')
stops_custom = ['shes','youll','ill','yeah','th','yes','oh',
                'ok','okay','might','ha','mr','bg','ms',
                'mrs','ca','em','da','ted','pm','hey','al']+[re.sub('[^A-Za-z ]+', '', w) for w in stops_standard]
stop_list = list(set(stops_standard + stops_custom))

# print(stop_list)

In [4]:
documents = docs_all

# The type of input that Word2Vec is looking for.. 
# is a list of sentences
# and, each sentence is a list of words
sentences = []
for document in documents:
    texts = [[word for word in sent.split() if word not in stop_list]
         for sent in document.split('.')]
    sentences += texts

In [5]:
sentences[:6]

[['great'],
 ['ive', 'blown', 'away', 'whole', 'thing'],
 ['fact', 'im', 'leaving'],
 ['three', 'theme', 'running', 'conference', 'relevant', 'want', 'talk'],
 ['one',
  'extraordinary',
  'evidence',
  'human',
  'creativity',
  'presentation',
  'weve',
  'people'],
 ['variety', 'range']]

In [6]:
import gensim  # using skip-gram
t0 = time()
model_TED = gensim.models.Word2Vec(sentences, size=50, window=5, min_count=1, workers=2,sg=1)
print("done in %0.3fs." % (time() - t0))

done in 29.905s.


In [9]:
model = model_TED

In [56]:
list(model.wv.vocab.items())[:5]

[('great', <gensim.models.keyedvectors.Vocab at 0x119038390>),
 ('ive', <gensim.models.keyedvectors.Vocab at 0x1190385c0>),
 ('blown', <gensim.models.keyedvectors.Vocab at 0x119abcf60>),
 ('away', <gensim.models.keyedvectors.Vocab at 0x119bb8f28>),
 ('whole', <gensim.models.keyedvectors.Vocab at 0x119c06630>)]

In [54]:
model.wv.most_similar('music', topn=10)

[('musical', 0.8442516326904297),
 ('violin', 0.8293101787567139),
 ('classical', 0.8263627886772156),
 ('beethoven', 0.8191320896148682),
 ('poetry', 0.8143652081489563),
 ('musician', 0.814338207244873),
 ('jazz', 0.8098499774932861),
 ('symphony', 0.8035442233085632),
 ('rap', 0.8035263419151306),
 ('improvise', 0.8016418814659119)]

In [13]:
model.wv.similarity('cancer','patient')

0.7551180308241303

### Define coherence score

In [None]:
from sklearn.metrics.pairwise import cosine_similarity


In [33]:
def coherence_score(words, model):
    X = []
    for word in words:
        X.append(model.wv.__getitem__(word))
    score = cosine_similarity(X, Y=None, dense_output=True)
    return score.mean()

In [40]:
def add_coherence_score(df, model):
    scores = []
    for i in range(len(df)):
        words = list(df.iloc[i,:])
        scores.append(coherence_score(words, model))
    df['score'] = scores
    return df

### Load top words in LSA

In [79]:
df = pd.read_pickle('./data/LSA_top_words.pkl')
df = add_coherence_score(df, model)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,score
0,city,brain,data,water,community,design,book,company,space,computer,0.526897
1,brain,cell,animal,planet,water,cancer,earth,robot,universe,light,0.566468
2,city,water,planet,energy,ocean,earth,climate,space,oil,sea,0.616479
3,cancer,cell,patient,disease,health,drug,government,africa,dollar,data,0.578982
4,data,computer,robot,information,machine,company,design,internet,government,phone,0.63456


In [80]:
df.score.mean()

0.5125746852159501

In [82]:
print(df[df.score==df.score.sort_values().iloc[0]])
print(df[df.score==df.score.sort_values().iloc[42]])
print(df[df.score==df.score.sort_values().iloc[49]])

        0        1        2    3     4       5      6         7      8    9  \
48  space  refugee  project  oil  song  choice  water  election  voice  fly   

       score  
48  0.455973  
      0       1       2        3     4    5             6       7  \
5  city  design  cancer  patient  cell  car  architecture  street   

              8      9     score  
5  neighborhood  space  0.565399  
      0         1      2            3        4        5       6         7  \
4  data  computer  robot  information  machine  company  design  internet   

            8      9    score  
4  government  phone  0.63456  


### Load top words in LDA

In [69]:
df = pd.read_pickle('./data/LDA_top_words.pkl')
df = add_coherence_score(df, model)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,score
0,limb,animal,force,muscle,spring,frame,per,video,movement,surface,0.527451
1,energy,car,oil,climate,product,water,fuel,carbon,city,per,0.591174
2,south,community,city,environmental,development,waste,economic,common,park,dog,0.544578
3,baby,mother,pregnant,mom,birth,milk,awesome,born,amazing,box,0.619164
4,united,men,paper,black,data,per,statistic,everybody,india,nice,0.515619


In [70]:
df.score.mean()

0.6049833065271377

In [71]:
print(df[df.score==df.score.sort_values().iloc[0]])
print(df[df.score==df.score.sort_values().iloc[25]])
print(df[df.score==df.score.sort_values().iloc[49]])

         0          1     2      3      4         5        6     7    8     9  \
44  design  happiness  gene  force  happy  designer  project  code  war  york   

       score  
44  0.484899  
           0      1       2           3           4      5        6       7  \
37  disorder  brain  device  disability  electrical  light  circuit  neuron   

       8       9     score  
37  gold  signal  0.598363  
        0        1        2         3        4        5           6    7  \
25  virus  disease  vaccine  mosquito  microbe  malaria  antibiotic  hiv   

            8         9     score  
25  infection  organism  0.778051  


### Load top words in NMF

In [75]:
df = pd.read_pickle('./data/NMF_top_words.pkl')
df = add_coherence_score(df, model)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,score
0,book,felt,learned,night,read,somebody,everybody,age,feeling,name,0.559617
1,dna,gene,genome,genetic,bacteria,virus,specie,sequence,molecule,organism,0.766341
2,energy,nuclear,climate,fuel,solar,carbon,electricity,coal,emission,co,0.744357
3,drug,disease,vaccine,virus,hiv,treatment,epidemic,trial,flu,health,0.738638
4,computer,software,code,algorithm,device,digital,program,learning,interface,mit,0.721945


In [76]:
df.score.mean()

0.6792611867189408

In [78]:
print(df[df.score==df.score.sort_values().iloc[0]])
print(df[df.score==df.score.sort_values().iloc[25]])
print(df[df.score==df.score.sort_values().iloc[49]])

       0      1    2        3    4     5         6      7        8        9  \
48  song  cause  eye  nervous  dog  lady  feedback  stage  freedom  brother   

       score  
48  0.497786  
     0        1         2         3            4       5         6      7  \
9  war  soldier  violence  conflict  afghanistan  weapon  military  peace   

      8       9     score  
9  iraq  killed  0.685777  
       0    1       2         3       4       5       6        7        8  \
12  girl  boy  father  daughter  parent  sister  mother  village  brother   

      9     score  
12  mom  0.802257  


* How to define the distances between documents based on the distance in Word2Vec?

In [None]:
# Compute distance of documents based on Word2Vec distance

In [85]:
print (model.wv.__getitem__('climate'))

[-0.08427628  0.41684142  0.1274766   0.7023802  -0.24573441 -0.98860013
 -0.19132812 -1.340887    0.7664083  -0.2512282  -0.43190917 -0.34973925
  0.60535854  0.5943859   0.08217657  1.4236348   0.32969564 -0.33111274
  0.6941277  -0.36479068 -0.679587    0.03300359 -1.0456446  -0.039244
  0.01228642 -0.01916472 -0.02262634  0.2942768   1.0078477  -0.03259582
 -0.20192716 -0.226965    0.5375818  -0.2982899   0.5816015   0.61463493
 -0.62001    -0.27787384 -0.19369034  0.3244977   0.64259297 -0.32401383
  0.32126945  0.04915848  0.62995416 -0.3642654   0.4645366   0.01334787
  0.44673425 -0.36006624]


In [93]:
model.most_similar('climate', topn=30)

  """Entry point for launching an IPython kernel.


[('warming', 0.8560107350349426),
 ('catastrophe', 0.824532151222229),
 ('desertification', 0.8010308742523193),
 ('overfishing', 0.8002601861953735),
 ('disruption', 0.7989152669906616),
 ('mitigate', 0.7943692803382874),
 ('twodegree', 0.7937451601028442),
 ('intergovernmental', 0.7863591313362122),
 ('deforestation', 0.7849549651145935),
 ('urbanization', 0.7847277522087097),
 ('acidification', 0.7834381461143494),
 ('environmental', 0.7807230949401855),
 ('polarization', 0.7777093052864075),
 ('unsustainable', 0.7770941257476807),
 ('inequity', 0.7729911804199219),
 ('catastrophic', 0.7727354764938354),
 ('gradual', 0.7710027694702148),
 ('drought', 0.7710023522377014),
 ('global', 0.7686222791671753),
 ('gridlock', 0.7665075063705444),
 ('degradation', 0.7637777924537659),
 ('crisis', 0.762445867061615),
 ('governance', 0.7578604221343994),
 ('lowcarbon', 0.7562432289123535),
 ('hunger', 0.7543408274650574),
 ('brink', 0.7516579627990723),
 ('systemic', 0.7506305575370789),
 ('dra

In [87]:
tf_vectorizer.get_feature_names()

['absolute',
 'abstract',
 'abuse',
 'academic',
 'accept',
 'accident',
 'according',
 'account',
 'accurate',
 'achieve',
 'acting',
 'active',
 'activist',
 'activity',
 'actor',
 'actual',
 'ad',
 'adam',
 'adapt',
 'add',
 'added',
 'addition',
 'address',
 'admit',
 'adult',
 'advance',
 'advanced',
 'advantage',
 'advice',
 'affect',
 'affected',
 'afford',
 'afghanistan',
 'afraid',
 'africa',
 'african',
 'afternoon',
 'agency',
 'agent',
 'aging',
 'agree',
 'agreed',
 'agriculture',
 'ah',
 'ahead',
 'aid',
 'aim',
 'airplane',
 'airport',
 'al',
 'algorithm',
 'alien',
 'alive',
 'allowed',
 'allowing',
 'allows',
 'alternative',
 'although',
 'alzheimers',
 'amazon',
 'among',
 'analysis',
 'ancestor',
 'ancient',
 'anderson',
 'angeles',
 'anger',
 'angle',
 'angry',
 'ant',
 'antibiotic',
 'anybody',
 'anymore',
 'anyway',
 'anywhere',
 'apart',
 'apartment',
 'app',
 'apparently',
 'appear',
 'apple',
 'application',
 'applied',
 'apply',
 'appreciate',
 'approach',
 'a

In [94]:
len(X)

2467

In [100]:
# a function that turns a document (in vectorized format) into the vectors of its words
def doc2words(document_sparse, feature_names, word2vec_model):
    X = document_sparse.toarray()
    features = vectorizer.get_feature_names
    for i in range(len(X)):
        
    model.wv.__getitem__('climate')

IndentationError: expected an indented block (<ipython-input-100-69e35ba8d15b>, line 7)

In [106]:
document_sparse = tf # tf or tfidf
word2vec_model = model_TED # model_TED or model_google_100 etc.
feature_names = tf_vectorizer.get_feature_names()
# doc2words(document_sparse, feature_names, word2vec_model)

In [108]:
doc_vec = tf.todense()[1]

In [113]:
feature_vec = model.wv.__getitem__(feature_names)
feature_vec.shape

(2000, 50)

In [109]:
new_vec = doc_vec.dot(feature_vec)

In [110]:
new_vec.shape

(1, 50)

In [111]:
new_vec

matrix([[ -17.39162118,  -75.65193581,   63.26797668,   81.26093022,
          -14.32507966, -166.41762158,  -52.56966448, -239.29081626,
           91.46040914,  -66.4453373 ,  -80.00524382,   46.70433163,
           82.05821881,   -3.50390634,  -13.78361464,  180.81476691,
           25.97947541,    6.34493195,  157.68742285,   59.36388117,
          -45.33857208,  112.53168071,  -40.97641475,   13.97636647,
           90.35267034,  -13.9588852 ,   86.00223244,  -24.00281094,
           90.28083895,   15.90018967,  -63.94696609,   42.33036307,
          -32.48594836,    6.64506606,   19.80055169,  -75.42120159,
          -79.27954104,  -80.04876735,  -79.94421258,   42.64000593,
          116.24749941,  -29.15994209,   27.51159701,  -47.38383486,
           81.54624244,  -28.61150743,   -2.49436879,   -9.64589779,
           81.32097096,  -31.64717222]])

In [None]:
N = 1000

document_sparse = tfidf
word2vec_model = model_TED # model_TED or model_google_100 etc.
feature_names = tf_vectorizer.get_feature_names()

doc_vec = tf.todense()[N]
feature_vec = word2vec_model.wv.__getitem__(feature_names)



np.savetxt('./data/lsa_data.tsv', X, delimiter='\t',newline='\n')

Xlabel = list(df_all.title)
with open('./data/lsa_meta.tsv', 'w') as file:
    for label in Xlabel:
        file.write(label+'\n')

In [119]:
print(tfidf[0,:])

  (0, 1024)	0.07741893711812595
  (0, 1802)	0.04418942619454977
  (0, 352)	0.07598581169337414
  (0, 1475)	0.04471890751802793
  (0, 665)	0.14569010010696387
  (0, 628)	0.06805210986559387
  (0, 410)	0.20827308839668318
  (0, 1366)	0.09112643658288003
  (0, 1905)	0.04179295610060574
  (0, 1439)	0.036596253619872995
  (0, 944)	0.13073235562824667
  (0, 496)	0.07929216052914853
  (0, 1273)	0.07094541468937776
  (0, 760)	0.136110057595616
  (0, 1705)	0.03604121251267758
  (0, 169)	0.03481904146544406
  (0, 452)	0.03176483861689968
  (0, 1476)	0.04118345584770355
  (0, 1106)	0.06586559769276992
  (0, 478)	0.03587581807373999
  (0, 41)	0.045563218291440014
  (0, 226)	0.2310001089933845
  (0, 925)	0.03612483431716769
  (0, 270)	0.04079479481235949
  (0, 1768)	0.13727942104348764
  :	:
  (0, 1937)	0.04023596750144571
  (0, 1614)	0.0390311730414011
  (0, 1930)	0.03591693837602586
  (0, 1627)	0.03299610562380522
  (0, 1771)	0.04770404376566258
  (0, 1144)	0.03277607923159967
  (0, 232)	0.035472