***

# Word embedding in Python Gensim

#### based on: https://machinelearningmastery.com/develop-word-embeddings-python-gensim/

***

In [1]:
import os
path = 'C:/Users/renswilderom/Documents/Machine learning'
os.chdir(path)

In [2]:
import os 
import pandas as pd

df = pd.read_excel("C:/Users/renswilderom/Documents/Dan Silver Projects/history of sociology/stm_prep.xlsx") 

In [3]:
df.loc[:, 'TEXT'] = df.loc[:, 'TEXT'].astype(str)

In [4]:
# Split documents into sentences
import nltk
tokenizer = nltk.tokenize.punkt.PunktSentenceTokenizer()
df['sentences'] = df.apply(lambda row: tokenizer.tokenize(row['TEXT']), axis=1)

In [5]:
df.loc[:, 'sentences'] = df.loc[:, 'sentences'].astype(str)

In [6]:
# print(df.loc[1, 'sentences'])

In [7]:
# Tokenize sentences
df['tokenized_sentences'] = df.apply(lambda row: nltk.word_tokenize(row['sentences']), axis=1)

In [8]:
# print(df.loc[1, 'tokenized_sentences'])

***

## Slice data frame in periods

***

In [9]:
df_temp = df

In [10]:
df1 = df_temp[df_temp.YEAR < 1926] # 1895-1925

In [11]:
df1.shape

(977, 6)

In [12]:
df2 = df_temp[(df_temp.YEAR > 1925) & (df_temp.YEAR < 1956)] # 1926-1955

In [13]:
df2.shape

(2708, 6)

In [14]:
df3 = df_temp[(df_temp.YEAR > 1955) & (df_temp.YEAR < 1986)] # 1956-1985

In [15]:
df3.shape

(3934, 6)

In [16]:
df4 = df_temp[(df_temp.YEAR > 1985)] # 1986-2015

In [17]:
df4.shape

(6119, 6)

***

## Train models

***

In [18]:
from gensim.models import Word2Vec
# train model
model1 = Word2Vec(df1.tokenized_sentences, min_count=1) # 1895-1925



In [19]:
model2 = Word2Vec(df2.tokenized_sentences, min_count=1) # 1926-1955

In [20]:
model3 = Word2Vec(df3.tokenized_sentences, min_count=1) # 1956-1985

In [21]:
model4 = Word2Vec(df4.tokenized_sentences, min_count=1) # 1986-2015

In [22]:
model = model4

***

## 10 most similar words vis-a-vis some words of interest

***

**relation**

In [23]:
model1.wv.most_similar('relation') # 1895-1925

[('relationship', 0.8659447431564331),
 ('relations', 0.7774339914321899),
 ('response', 0.7536336183547974),
 ('function', 0.741924524307251),
 ('opposition', 0.722886323928833),
 ('adjustment', 0.7080439925193787),
 ('reference', 0.7015237808227539),
 ('distinction', 0.6922434568405151),
 ('status', 0.6879868507385254),
 ('tendency', 0.685497522354126)]

In [24]:
model2.wv.most_similar('relation') # 1926-1955

[('relationship', 0.8475290536880493),
 ('resemblance', 0.7025606632232666),
 ('relations', 0.681898832321167),
 ('relationships', 0.6548746824264526),
 ('association', 0.6409438252449036),
 ('similarity', 0.6373202204704285),
 ('interrelations', 0.6372702717781067),
 ('nexus', 0.6235227584838867),
 ('respect', 0.6112664937973022),
 ('contrast', 0.6106635332107544)]

In [25]:
model3.wv.most_similar('relation') # 1956-1985

[('relationship', 0.8870164155960083),
 ('link', 0.748445987701416),
 ('relationships', 0.7303850650787354),
 ('connection', 0.718569278717041),
 ('association', 0.7144267559051514),
 ('linkage', 0.712634801864624),
 ('tie', 0.7036232948303223),
 ('relations', 0.6887329816818237),
 ('nexus', 0.6794794201850891),
 ('links', 0.6668810248374939)]

In [26]:
model4.wv.most_similar('relation') # 1986-2015

[('relationship', 0.8372057676315308),
 ('connection', 0.7463864088058472),
 ('link', 0.6992675065994263),
 ('linkage', 0.6870735883712769),
 ('interplay', 0.676097571849823),
 ('relations', 0.6587059497833252),
 ('interrelations', 0.6364309787750244),
 ('relationships', 0.6203354597091675),
 ('polarity', 0.6110794544219971),
 ('difference', 0.6108248829841614)]

**cultural**

In [27]:
model1.wv.most_similar('cultural')

[('racial', 0.839154839515686),
 ('psychical', 0.8305606245994568),
 ('functional', 0.8304929733276367),
 ('psychic', 0.8282445669174194),
 ('chemical', 0.8225065469741821),
 ('geographical', 0.8072457909584045),
 ('secondary', 0.8065112829208374),
 ('societary', 0.8044930696487427),
 ('physiological', 0.7992188930511475),
 ('internal', 0.7977006435394287)]

In [28]:
model2.wv.most_similar('cultural')

[('culture', 0.8169986009597778),
 ('structural', 0.7952417135238647),
 ('biological', 0.7710198760032654),
 ('societal', 0.7623051404953003),
 ('functional', 0.7543084621429443),
 ('institutional', 0.7470966577529907),
 ('social', 0.7324671149253845),
 ('psychological', 0.7257303595542908),
 ('psychic', 0.717496395111084),
 ('dynamic', 0.7122875452041626)]

In [29]:
model3.wv.most_similar('cultural')

[('linguistic', 0.7234350442886353),
 ('ideological', 0.7230170965194702),
 ('cognitive', 0.7189174890518188),
 ('institutional', 0.7158393859863281),
 ('secular', 0.7142602205276489),
 ('subcultural', 0.7104274034500122),
 ('distinctive', 0.7072334289550781),
 ('religious', 0.7016386985778809),
 ('behavioral', 0.688870370388031),
 ('normative', 0.6853549480438232)]

In [30]:
model4.wv.most_similar('cultural')

[('symbolic', 0.7510474920272827),
 ('linguistic', 0.6993869543075562),
 ('discursive', 0.6973252296447754),
 ('artistic', 0.6774075627326965),
 ('institutional', 0.6765850186347961),
 ('ideological', 0.6753431558609009),
 ('sociocultural', 0.6697101593017578),
 ('political', 0.664158821105957),
 ('ideational', 0.6546635627746582),
 ('aesthetic', 0.6495617032051086)]

## For Dan: 
#### ‘objective’, ‘objectivity’, ‘subjective’, ‘subjectivity’, ‘schema’, ‘Simmel’

**objective**

In [31]:
model1.wv.most_similar('objective')

[('psychological', 0.8524646162986755),
 ('abstract', 0.8492943048477173),
 ('subjective', 0.829910159111023),
 ('positive', 0.8279579877853394),
 ('logical', 0.8264224529266357),
 ('ultimate', 0.823868989944458),
 ('organic', 0.8155571222305298),
 ('biological', 0.8127901554107666),
 ('psychic', 0.803335964679718),
 ('concrete', 0.8004611134529114)]

In [32]:
model2.wv.most_similar('objective')

[('operational', 0.7728350758552551),
 ('meaningful', 0.7698444128036499),
 ('abstract', 0.7688069939613342),
 ('subjective', 0.7643225789070129),
 ('empirical', 0.7606490850448608),
 ('quantitative', 0.7583636045455933),
 ('measurement', 0.7483901977539062),
 ('abstraction', 0.7343795299530029),
 ('analytical', 0.7333526611328125),
 ('evaluation', 0.7240404486656189)]

In [33]:
model3.wv.most_similar('objective')

[('object', 0.67438805103302),
 ('operational', 0.6578528881072998),
 ('essential', 0.654657244682312),
 ('cognitive', 0.6544842720031738),
 ('subjective', 0.6526729464530945),
 ('aggregation', 0.6505426168441772),
 ('motive', 0.6469446420669556),
 ('underlying', 0.6412361264228821),
 ('meaning', 0.6386818885803223),
 ('intrinsic', 0.6365792751312256)]

In [34]:
model4.wv.most_similar('objective')

[('intrinsic', 0.7122223377227783),
 ('immanent', 0.6869300603866577),
 ('subjective', 0.6860836744308472),
 ('ontological', 0.6432620286941528),
 ('underlying', 0.6426305174827576),
 ('ultimate', 0.6318342685699463),
 ('meaning', 0.6300592422485352),
 ('actual', 0.6297328472137451),
 ('existential', 0.6245933771133423),
 ('objectivity', 0.6208010315895081)]

**objectivity**

In [35]:
model1.wv.most_similar('objectivity')

[('coherence', 0.785571277141571),
 ('generality', 0.7423686981201172),
 ('contradiction', 0.7417454123497009),
 ('methodology', 0.7412897348403931),
 ('retreat', 0.7376754879951477),
 ('design', 0.7308334112167358),
 ('substratum', 0.7266759872436523),
 ('background', 0.722693145275116),
 ('humanitarianism', 0.7202526330947876),
 ('novelty', 0.7150938510894775)]

In [36]:
model2.wv.most_similar('objectivity')

[('clarity', 0.77120041847229),
 ('comprehension', 0.7646514177322388),
 ('motivation', 0.7588655352592468),
 ('speculation', 0.7501348853111267),
 ('clarification', 0.7429815530776978),
 ('truth', 0.7427546977996826),
 ('logic', 0.7378693222999573),
 ('perception', 0.7306792736053467),
 ('imagination', 0.727269172668457),
 ('reasoning', 0.7232944965362549)]

In [37]:
model3.wv.most_similar('objectivity')

[('universality', 0.7333980202674866),
 ('truth', 0.7082447409629822),
 ('relativism', 0.7047163844108582),
 ('ethical', 0.704587459564209),
 ('clarity', 0.7023354172706604),
 ('canons', 0.6802035570144653),
 ('sophistication', 0.675750195980072),
 ('formalism', 0.6751270294189453),
 ('neutrality', 0.6708697080612183),
 ('rationality', 0.6708270907402039)]

In [38]:
model4.wv.most_similar('objectivity')

[('truth', 0.7801140546798706),
 ('universality', 0.7782642841339111),
 ('neutrality', 0.7599254250526428),
 ('irrationality', 0.7353039979934692),
 ('judgement', 0.7230526804924011),
 ('originality', 0.7225257754325867),
 ('transcendental', 0.7116068601608276),
 ('rationality', 0.7036386728286743),
 ('authenticity', 0.6976580023765564),
 ('metaphysical', 0.6971846222877502)]

**subjective**

In [39]:
model1.wv.most_similar('subjective')

[('psychic', 0.8744606971740723),
 ('psychical', 0.8654098510742188),
 ('biological', 0.8560588359832764),
 ('physiological', 0.8432759046554565),
 ('objective', 0.829910159111023),
 ('psychological', 0.821489155292511),
 ('functional', 0.8200857639312744),
 ('instinctive', 0.8038197159767151),
 ('genetic', 0.795669674873352),
 ('negative', 0.7941077351570129)]

In [40]:
model2.wv.most_similar('subjective')

[('verbal', 0.7737252712249756),
 ('conventional', 0.7685337066650391),
 ('objective', 0.7643226385116577),
 ('symbolic', 0.7632737159729004),
 ('evaluative', 0.7547131776809692),
 ('behavioral', 0.7420533299446106),
 ('particularistic', 0.7353549599647522),
 ('psychological', 0.7318572998046875),
 ('implicit', 0.7298088073730469),
 ('situational', 0.7247463464736938)]

In [41]:
model3.wv.most_similar('subjective')

[('cognitive', 0.7446319460868835),
 ('intrinsic', 0.6979210376739502),
 ('affective', 0.6902922987937927),
 ('verbal', 0.6819781064987183),
 ('behavioral', 0.6807016134262085),
 ('personality', 0.6753392219543457),
 ('psychological', 0.6708508729934692),
 ('objective', 0.6526729464530945),
 ('normative', 0.6408082246780396),
 ('symbolic', 0.6322594881057739)]

In [42]:
model4.wv.most_similar('subjective')

[('objective', 0.6860836744308472),
 ('situational', 0.6760436296463013),
 ('corporeal', 0.6721557974815369),
 ('affective', 0.671944797039032),
 ('dispositional', 0.6658008694648743),
 ('physiological', 0.6584948301315308),
 ('expressive', 0.6532297134399414),
 ('behavioral', 0.6517412066459656),
 ('cognitive', 0.6468936204910278),
 ('intentional', 0.6468185186386108)]

**subjectivity**

In [43]:
model1.wv.most_similar('subjectivity')
# the model picks up numbers, suggesting that subjectivity was not really used frequently in this early period

[("'Relief", 0.7211368083953857),
 ("'CHRISTIANITY", 0.7149657011032104),
 ('suddenness', 0.7115455269813538),
 ('assertive', 0.7082674503326416),
 ('Sébastien', 0.7077919840812683),
 ('Faure', 0.7071225643157959),
 ('modesty', 0.705947995185852),
 ('waterworks.', 0.7038592100143433),
 ('™Ibid', 0.7032727599143982),
 ('machineries', 0.7029246091842651)]

In [44]:
model2.wv.most_similar('subjectivity')

[('correctness', 0.7589266300201416),
 ('fruitfulness', 0.7455395460128784),
 ('verbalization', 0.7403524518013),
 ('universality', 0.7308475375175476),
 ('soundness', 0.7305565476417542),
 ('specificity', 0.7224956750869751),
 ('uniqueness', 0.7215976715087891),
 ('rationality', 0.7176700830459595),
 ('work.4', 0.7143925428390503),
 ('constancy', 0.711570143699646)]

In [45]:
model3.wv.most_similar('subjectivity')

[('sacredness', 0.7629703879356384),
 ('intersubjectivity', 0.7543718218803406),
 ('reification', 0.7342047691345215),
 ('sociality', 0.734102725982666),
 ('praxis', 0.7116429805755615),
 ('superego', 0.7090240716934204),
 ('irrationality', 0.7051604986190796),
 ('supernatural', 0.6933643817901611),
 ('immortality', 0.6916900277137756),
 ('hedonism', 0.6879349946975708)]

In [46]:
model4.wv.most_similar('subjectivity')

[('consciousness', 0.7812148332595825),
 ('self', 0.7622339725494385),
 ('intentionality', 0.7554757595062256),
 ('selfhood', 0.7524404525756836),
 ('sociality', 0.7517572641372681),
 ('reflexivity', 0.7450586557388306),
 ('individuality', 0.7324085235595703),
 ('agency', 0.727793276309967),
 ('cognition', 0.7256108522415161),
 ('intersubjectivity', 0.7250075936317444)]

In [47]:
model1.wv.most_similar('schema')

[('outcome', 0.7596882581710815),
 ('crisis', 0.7538934946060181),
 ('thesis', 0.7532879114151001),
 ('appearance', 0.749157190322876),
 ('mood', 0.7476211786270142),
 ('keynote', 0.7425433397293091),
 ('hypothesis', 0.738142728805542),
 ('transformation', 0.7305818200111389),
 ('defect', 0.7286218404769897),
 ('creed', 0.7250339984893799)]

**schema**

In [48]:
model2.wv.most_similar('schema')

[('means-end', 0.7441396713256836),
 ('typology', 0.734366774559021),
 ('causation', 0.7301516532897949),
 ('formulation', 0.7151468992233276),
 ('conceptualization', 0.7118353247642517),
 ('paradigm', 0.7072738409042358),
 ('substantive', 0.7031725645065308),
 ('systematization', 0.6971014738082886),
 ('causality', 0.693217933177948),
 ('clarification', 0.6926217675209045)]

In [49]:
model3.wv.most_similar('schema')

[('scheme', 0.7933559417724609),
 ('formulation', 0.7889211773872375),
 ('paradigm', 0.7764719128608704),
 ('typology', 0.7751098871231079),
 ('conceptualization', 0.7513638734817505),
 ('characterization', 0.7298194766044617),
 ('reformulation', 0.7205566763877869),
 ('epistemology', 0.7142667770385742),
 ('logic', 0.70758455991745),
 ('framework', 0.7057795524597168)]

In [50]:
model4.wv.most_similar('schema')

[('scheme', 0.8517293930053711),
 ('typology', 0.7872000932693481),
 ('framework', 0.7809886336326599),
 ('model', 0.7502861022949219),
 ('conceptualization', 0.7485644817352295),
 ('conception', 0.72993004322052),
 ('concept', 0.7214322090148926),
 ('formulation', 0.7199011445045471),
 ('definition', 0.7133165001869202),
 ('logic', 0.7117828130722046)]

**Simmel**

In [51]:
model1.wv.most_similar('Simmel')

[('Small', 0.8588085174560547),
 ('Durkheim', 0.839136004447937),
 ('Wundt', 0.83685302734375),
 ('Sumner', 0.8282672166824341),
 ('Ross', 0.8259565830230713),
 ('Frazer', 0.8251640796661377),
 ('Tarde', 0.8244202136993408),
 ('Ellwood', 0.8210508823394775),
 ('Loria', 0.8199872970581055),
 ('Schmoller', 0.8184661865234375)]

In [52]:
model2.wv.most_similar('Simmel')

[('Durkheim', 0.87535160779953),
 ('Karl', 0.868350088596344),
 ('Herbert', 0.8593208193778992),
 ('Sumner', 0.8556383848190308),
 ('Mead', 0.8555867671966553),
 ('Becker', 0.8554558753967285),
 ('Spencer', 0.8531584143638611),
 ('Marx', 0.8514550924301147),
 ('Lundberg', 0.8499455451965332),
 ('Pareto', 0.847312331199646)]

In [53]:
model3.wv.most_similar('Simmel')

[('Spencer', 0.8490694761276245),
 ('Mannheim', 0.8403051495552063),
 ('Freud', 0.8245453834533691),
 ('Mead', 0.8197968602180481),
 ('Malinowski', 0.818902850151062),
 ('Comte', 0.8144658803939819),
 ('Hegel', 0.8132918477058411),
 ('Pareto', 0.8084408640861511),
 ('Weber', 0.8053643703460693),
 ('Durkheim', 0.791892409324646)]

In [54]:
model4.wv.most_similar('Simmel')

[('Durkheim', 0.8430885076522827),
 ('Weber', 0.8373894691467285),
 ('Elias', 0.807611346244812),
 ('Castoriadis', 0.7955990433692932),
 ('Parsons', 0.7842784523963928),
 ('Whitehead', 0.7732088565826416),
 ('Mannheim', 0.7559753060340881),
 ('Nietzsche', 0.7544924020767212),
 ('Bergson', 0.7537936568260193),
 ('Mead', 0.7514632940292358)]

***

## End of script

***

***

## notes and draft

***

* TSNE vis https://www.kaggle.com/jeffd23/visualizing-word-vectors-with-t-sne
* TSNE vis http://lvdmaaten.github.io/tsne/
* Second Tensorboard visualization demo: https://towardsdatascience.com/training-and-visualising-word-vectors-2f946c6430f8
* Word to Vec explained: http://mccormickml.com/2016/04/19/word2vec-tutorial-the-skip-gram-model/
* Using .loc: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
* Gensim Word2Vec: https://radimrehurek.com/gensim/models/word2vec.html
* SNA in Jupyter Notebook: http://bl.ocks.org/brinrosenthal/raw/fd7d7277ce74c2b762d3a4d66326215c/
* Same but than the blog version: http://compbio.ucsd.edu/bringing-interactivity-network-visualization-jupyter-notebooks-visjs2jupyter/
* Nice word embedding blog
* Presentation Manning: https://nlp.stanford.edu/manning/talks/Simons-Institute-Manning-2017.pdf
* what about stemming?

In [None]:
# After the model is trained, it is accessible via the “wv” attribute. This is the actual word vector model in which queries can be made.

# For example, you can print the learned vocabulary of tokens (words) as follows:

words = list(model_test.wv.vocab)
print(words)

In [None]:
# You can review the embedded vector for a specific token as follows:

print(model_test['system'])

In [None]:
model_test.wv.most_similar(positive=['system', 'social'], negative=['socialism'])

In [None]:
model_test.wv.doesnt_match("system connected developed science".split())

In [None]:
model_test.wv.similarity('causal', 'relation')

In [None]:
# When getting started, you can save the learned model in ASCII format and review the contents.

# You can do this by setting binary=False when calling the save_word2vec_format() function, for example:

# model_test.wv.save_word2vec_format('model_test.bin')

In [None]:
# The saved model can then be loaded again by calling the Word2Vec.load() function. For example:
# model = Word2Vec.load('model_test.bin')

***

## Example codes from tutorial
#### https://machinelearningmastery.com/develop-word-embeddings-python-gensim/

***

In [None]:
from gensim.models import Word2Vec
from sklearn.decomposition import PCA
from matplotlib import pyplot
# define training data
sentences = [['this', 'is', 'the', 'first', 'sentence', 'for', 'word2vec'],
			['this', 'is', 'the', 'second', 'sentence'],
			['yet', 'another', 'sentence'],
			['one', 'more', 'sentence'],
			['and', 'the', 'final', 'sentence']]
# train model
model = Word2Vec(sentences, min_count=1)
# fit a 2d PCA model to the vectors
X = model[model.wv.vocab]
pca = PCA(n_components=2)
result = pca.fit_transform(X)
# create a scatter plot of the projection
pyplot.scatter(result[:, 0], result[:, 1])
words = list(model.wv.vocab)
for i, word in enumerate(words):
	pyplot.annotate(word, xy=(result[i, 0], result[i, 1]))
pyplot.show()

In [None]:
import os

# specify the main corpus path. This will be used throughout the script
MACHINE_LEARNING = "C:/Users/renswilderom/Documents/Machine learning"

#Specify working directory
os.chdir(MACHINE_LEARNING)

## Stanford GloVe embedding

In [None]:
from gensim.scripts.glove2word2vec import glove2word2vec
glove_input_file = 'glove.6B.100d.txt'
word2vec_output_file = 'glove.6B.100d.txt.word2vec'
glove2word2vec(glove_input_file, word2vec_output_file)

In [None]:
from gensim.models import KeyedVectors
# load the Stanford GloVe model
filename = 'glove.6B.100d.txt.word2vec'
model = KeyedVectors.load_word2vec_format(filename, binary=False)
# calculate: (king - man) + woman = ?
result = model.most_similar(positive=['normal', 'child'], negative=['school'], topn=1)
print(result)