***

# Word embedding in Python Gensim

#### based on: https://machinelearningmastery.com/develop-word-embeddings-python-gensim/

***

In [1]:
import os
path = 'C:/Users/renswilderom/Documents/Machine learning'
os.chdir(path)

In [2]:
import os 
import pandas as pd

df = pd.read_excel("C:/Users/renswilderom/Documents/Dan Silver Projects/history of sociology/stm_prep.xlsx") 

In [3]:
df.loc[:, 'TEXT'] = df.loc[:, 'TEXT'].astype(str)

In [4]:
# Split documents into sentences
import nltk
tokenizer = nltk.tokenize.punkt.PunktSentenceTokenizer()
df['sentences'] = df.apply(lambda row: tokenizer.tokenize(row['TEXT']), axis=1)

In [5]:
df.loc[:, 'sentences'] = df.loc[:, 'sentences'].astype(str)

In [6]:
# print(df.loc[1, 'sentences'])

In [7]:
# Tokenize sentences
df['tokenized_sentences'] = df.apply(lambda row: nltk.word_tokenize(row['sentences']), axis=1)

In [8]:
# print(df.loc[1, 'tokenized_sentences'])

***

## Slice data frame in periods

***

In [9]:
df_temp = df

In [10]:
df1 = df_temp[df_temp.YEAR < 1926] # 1895-1925

In [11]:
df1.shape

(977, 6)

In [12]:
df2 = df_temp[(df_temp.YEAR > 1925) & (df_temp.YEAR < 1956)] # 1926-1955

In [13]:
df2.shape

(2708, 6)

In [14]:
df3 = df_temp[(df_temp.YEAR > 1955) & (df_temp.YEAR < 1986)] # 1956-1985

In [15]:
df3.shape

(3934, 6)

In [16]:
df4 = df_temp[(df_temp.YEAR > 1985)] # 1986-2015

In [17]:
df4.shape

(6119, 6)

***

## Train models

***

In [18]:
from gensim.models import Word2Vec
# train model
model1 = Word2Vec(df1.tokenized_sentences, min_count=1) # 1895-1925



In [19]:
model2 = Word2Vec(df2.tokenized_sentences, min_count=1) # 1926-1955

In [20]:
model3 = Word2Vec(df3.tokenized_sentences, min_count=1) # 1956-1985

In [21]:
model4 = Word2Vec(df4.tokenized_sentences, min_count=1) # 1986-2015

***

## 10 most similar words vis-a-vis some words of interest

***

**relation**

In [22]:
model1.wv.most_similar('relation') # 1895-1925

[('relationship', 0.8441445231437683),
 ('relations', 0.7866001129150391),
 ('opposition', 0.7335350513458252),
 ('function', 0.7322505712509155),
 ('response', 0.7263873219490051),
 ('antagonism', 0.7014156579971313),
 ('reference', 0.6968504190444946),
 ('distinction', 0.688345730304718),
 ('structure', 0.6882886290550232),
 ('conflict', 0.6839989423751831)]

In [23]:
model2.wv.most_similar('relation') # 1926-1955

[('relationship', 0.8417072296142578),
 ('similarity', 0.6826012134552002),
 ('resemblance', 0.6686841249465942),
 ('association', 0.6534318923950195),
 ('relations', 0.650056004524231),
 ('link', 0.6373822689056396),
 ('contrast', 0.6302788853645325),
 ('interrelationship', 0.6288437843322754),
 ('interrelation', 0.6260203123092651),
 ('interrelationships', 0.6189782619476318)]

In [24]:
model3.wv.most_similar('relation') # 1956-1985

[('relationship', 0.8869860172271729),
 ('link', 0.7533186674118042),
 ('relationships', 0.7214893102645874),
 ('connection', 0.7156462073326111),
 ('association', 0.7111459970474243),
 ('linkage', 0.7057282328605652),
 ('tie', 0.6991400718688965),
 ('relations', 0.6889328956604004),
 ('nexus', 0.688151478767395),
 ('socially-defined', 0.6826616525650024)]

In [25]:
model4.wv.most_similar('relation') # 1986-2015

[('relationship', 0.852817952632904),
 ('connection', 0.7491021156311035),
 ('link', 0.6950960159301758),
 ('linkage', 0.6906623840332031),
 ('interplay', 0.6799286603927612),
 ('relations', 0.6672086119651794),
 ('Netherlands160', 0.6413803696632385),
 ('links', 0.637802243232727),
 ('relationships', 0.6342426538467407),
 ('interrelationship', 0.6274843215942383)]

**cultural**

In [26]:
model1.wv.most_similar('cultural') 

[('racial', 0.8538060188293457),
 ('functional', 0.823914110660553),
 ('psychic', 0.8227604031562805),
 ('psychical', 0.810051679611206),
 ('genetic', 0.8015466332435608),
 ('chemical', 0.8008708953857422),
 ('geographical', 0.7990705966949463),
 ('physiological', 0.7939574718475342),
 ('organic', 0.7920008301734924),
 ('internal', 0.7879409790039062)]

In [27]:
model2.wv.most_similar('cultural')

[('culture', 0.8146034479141235),
 ('structural', 0.791475772857666),
 ('societal', 0.7879284024238586),
 ('institutional', 0.767919659614563),
 ('functional', 0.7537322044372559),
 ('biological', 0.7528942823410034),
 ('social', 0.7403719425201416),
 ('dynamic', 0.7374778389930725),
 ('racial', 0.7342513799667358),
 ('linguistic', 0.7221019268035889)]

In [28]:
model3.wv.most_similar('cultural')

[('ideological', 0.7420690059661865),
 ('linguistic', 0.7322743535041809),
 ('distinctive', 0.7210464477539062),
 ('cognitive', 0.7178052067756653),
 ('subcultural', 0.7093836069107056),
 ('institutional', 0.7090656757354736),
 ('religious', 0.6968305110931396),
 ('culture', 0.694295346736908),
 ('secular', 0.6935019493103027),
 ('sociocultural', 0.6796746253967285)]

In [29]:
model4.wv.most_similar('cultural')

[('symbolic', 0.7528668642044067),
 ('discursive', 0.7232345342636108),
 ('institutional', 0.6804810762405396),
 ('linguistic', 0.6664934158325195),
 ('artistic', 0.655575156211853),
 ('ideological', 0.6555702686309814),
 ('culture', 0.6552258729934692),
 ('political', 0.6521793603897095),
 ('structural', 0.6398521065711975),
 ('religious', 0.6380373239517212)]

## For Dan: 
#### 'causal', 'causality', ‘objective’, ‘objectivity’, ‘subjective’, ‘subjectivity’, ‘schema’, ‘Simmel’

**causal**

In [30]:
model1.wv.most_similar('causal')

[('conditioning', 0.7963470220565796),
 ('functional', 0.7872750759124756),
 ('reciprocal', 0.7838644981384277),
 ('physiological', 0.7837982773780823),
 ('psychic', 0.7819134593009949),
 ('societary', 0.7752439975738525),
 ('quantitative', 0.774134635925293),
 ('external', 0.7653709650039673),
 ('environmental', 0.7648234367370605),
 ('cultural', 0.7598253488540649)]

In [31]:
model2.wv.most_similar('causal')

[('functional', 0.7950720191001892),
 ('situational', 0.7687092423439026),
 ('qualitative', 0.737224817276001),
 ('structural', 0.7348059415817261),
 ('genetic', 0.7337712049484253),
 ('logical', 0.7255640625953674),
 ('behavioral', 0.7108860611915588),
 ('temporal', 0.7089613080024719),
 ('interpersonal', 0.706243097782135),
 ('societal', 0.7052133083343506)]

In [32]:
model3.wv.most_similar('causal')

[('reciprocal', 0.7385190725326538),
 ('logical', 0.7241897583007812),
 ('structural', 0.7147865295410156),
 ('simultaneous', 0.6898951530456543),
 ('linear', 0.6840468049049377),
 ('sequential', 0.682436466217041),
 ('causality', 0.674077570438385),
 ('temporal', 0.6668140888214111),
 ('developmental', 0.6650990843772888),
 ('contextual', 0.6519103050231934)]

In [33]:
model4.wv.most_similar('causal')

[('generative', 0.7024340033531189),
 ('causality', 0.7003788948059082),
 ('contextual', 0.69304358959198),
 ('motivational', 0.6824032664299011),
 ('logical', 0.6732432842254639),
 ('structural', 0.6722752451896667),
 ('underlying', 0.6615617275238037),
 ('situational', 0.661487877368927),
 ('functional', 0.6401398181915283),
 ('systemic', 0.6374400854110718)]

**causality**

In [34]:
model1.wv.most_similar('causality')

[('conditions—with', 0.6643602252006531),
 ('wh', 0.6520610451698303),
 ('acids', 0.6250447034835815),
 ('™Ibid', 0.6136768460273743),
 ('optimism', 0.6132846474647522),
 ('tension', 0.613148033618927),
 ('mand', 0.6128140687942505),
 ('differences—which', 0.6061928272247314),
 ('Grave', 0.5998084545135498),
 ('suffixes', 0.5987680554389954)]

In [35]:
model2.wv.most_similar('causality')

[('dialectic', 0.7443912029266357),
 ('relativity', 0.739704966545105),
 ('gestalt', 0.733802855014801),
 ('schema', 0.7336994409561157),
 ('immanent', 0.7328548431396484),
 ('mechanistic', 0.7053650617599487),
 ('particularistic', 0.7034801244735718),
 ('role-taking', 0.6971359252929688),
 ('transcendental', 0.6966195106506348),
 ('means-end', 0.6915581226348877)]

In [36]:
model3.wv.most_similar('causality')

[('causation', 0.7593010663986206),
 ('causal', 0.6740775108337402),
 ('reciprocity', 0.6695437431335449),
 ('linkage', 0.6636263132095337),
 ('symmetry', 0.661018967628479),
 ('teleological', 0.6564818620681763),
 ('consistency', 0.6491689682006836),
 ('dissonance', 0.6416134834289551),
 ('contagion', 0.6347087621688843),
 ('closure', 0.6303196549415588)]

In [37]:
model4.wv.most_similar('causality')

[('causation', 0.8278628587722778),
 ('contingency', 0.7452189922332764),
 ('reification', 0.7374314665794373),
 ('causal', 0.7003788948059082),
 ('distortion', 0.6949384212493896),
 ('inference', 0.6869561076164246),
 ('generalization', 0.6716681122779846),
 ('ontology', 0.6602618098258972),
 ('teleological', 0.6595219969749451),
 ('reflexivity', 0.6540914177894592)]

**objective**

In [38]:
model1.wv.most_similar('objective')

[('psychological', 0.8542784452438354),
 ('biological', 0.8342928886413574),
 ('abstract', 0.8336150050163269),
 ('psychic', 0.8332881927490234),
 ('organic', 0.8310183882713318),
 ('subjective', 0.8243480324745178),
 ('concrete', 0.8170121312141418),
 ('logical', 0.8091793060302734),
 ('external', 0.8055845499038696),
 ('positive', 0.8022143244743347)]

In [39]:
model2.wv.most_similar('objective')

[('empirical', 0.7851575613021851),
 ('subjective', 0.7695214152336121),
 ('meaningful', 0.7631130814552307),
 ('quantitative', 0.7607792615890503),
 ('analytical', 0.7562923431396484),
 ('abstract', 0.7516562342643738),
 ('operational', 0.7502760887145996),
 ('measurement', 0.7403283715248108),
 ('abstraction', 0.7243492603302002),
 ('evaluation', 0.7231173515319824)]

In [40]:
model3.wv.most_similar('objective')

[('subjective', 0.6697182059288025),
 ('essential', 0.6574784517288208),
 ('ontological', 0.6436105966567993),
 ('abstract', 0.6423295736312866),
 ('motive', 0.640049397945404),
 ('object', 0.6359859108924866),
 ('cognitive', 0.6349653005599976),
 ('underlying', 0.6340020895004272),
 ('intrinsic', 0.6331543922424316),
 ('ultimate', 0.6249767541885376)]

In [41]:
model4.wv.most_similar('objective')

[('immanent', 0.7301717400550842),
 ('intrinsic', 0.7074230909347534),
 ('subjective', 0.6827552318572998),
 ('ontological', 0.677768349647522),
 ('underlying', 0.6422537565231323),
 ('existential', 0.6377660632133484),
 ('irreducible', 0.6234538555145264),
 ('illusory', 0.6212280988693237),
 ('unobservable', 0.6198253631591797),
 ('meaning', 0.6139387488365173)]

**objectivity**

In [42]:
model1.wv.most_similar('objectivity')

[('potency', 0.7457101941108704),
 ('group-life', 0.7335221171379089),
 ('expansive', 0.730195164680481),
 ('coherence', 0.7163501977920532),
 ('speculation', 0.7050509452819824),
 ('enlightenment', 0.7042633295059204),
 ('altruism', 0.7038308382034302),
 ('multiplicity', 0.7023306488990784),
 ('simplicity', 0.6998115181922913),
 ('protoplasm', 0.6968473196029663)]

In [43]:
model2.wv.most_similar('objectivity')

[('precision', 0.7891201972961426),
 ('clarification', 0.759067177772522),
 ('clarity', 0.7479539513587952),
 ('logic', 0.7403373718261719),
 ('verification', 0.7258188724517822),
 ('rationality', 0.722417950630188),
 ('motivation', 0.7221505641937256),
 ('insight', 0.7175171375274658),
 ('curiosity', 0.7163212299346924),
 ('speculation', 0.7132241129875183)]

In [44]:
model3.wv.most_similar('objectivity')

[('truth', 0.7581232190132141),
 ('canons', 0.7061920762062073),
 ('intuitive', 0.6949276924133301),
 ('universality', 0.6877827048301697),
 ('clarity', 0.6828929781913757),
 ('generality', 0.6786223649978638),
 ('coherence', 0.6779695749282837),
 ('ignorance', 0.6775922775268555),
 ('judgment', 0.677091658115387),
 ('reductionism', 0.6743451356887817)]

In [45]:
model4.wv.most_similar('objectivity')

[('universality', 0.7826473712921143),
 ('truth', 0.7760058641433716),
 ('transcendental', 0.7242398262023926),
 ('metaphysical', 0.7142008543014526),
 ('realism', 0.7102657556533813),
 ('irrationality', 0.7091241478919983),
 ('neutrality', 0.7074251174926758),
 ('disinterestedness', 0.7066991329193115),
 ('authenticity', 0.7052381038665771),
 ('relativism', 0.7046592235565186)]

**subjective**

In [46]:
model1.wv.most_similar('subjective')

[('psychic', 0.8662709593772888),
 ('biological', 0.8498037457466125),
 ('psychical', 0.8344156742095947),
 ('functional', 0.8304904699325562),
 ('objective', 0.8243480324745178),
 ('physiological', 0.808037281036377),
 ('psychological', 0.8028580546379089),
 ('genetic', 0.7957460284233093),
 ('negative', 0.7794422507286072),
 ('static', 0.7757461071014404)]

In [47]:
model2.wv.most_similar('subjective')

[('symbolic', 0.7775448560714722),
 ('objective', 0.7695214152336121),
 ('verbal', 0.752875804901123),
 ('particularistic', 0.7525017261505127),
 ('psychological', 0.7500637173652649),
 ('situational', 0.7421325445175171),
 ('rational', 0.7366851568222046),
 ('biological', 0.7233442664146423),
 ('intrinsic', 0.7225503325462341),
 ('psychic', 0.7221026420593262)]

In [48]:
model3.wv.most_similar('subjective')

[('cognitive', 0.7481263875961304),
 ('intrinsic', 0.7241697311401367),
 ('behavioral', 0.7099941372871399),
 ('affective', 0.7076915502548218),
 ('verbal', 0.6921893954277039),
 ('normative', 0.6806974411010742),
 ('fairness', 0.6782318353652954),
 ('psychological', 0.6723742485046387),
 ('objective', 0.6697181463241577),
 ('situational', 0.6695312857627869)]

In [49]:
model4.wv.most_similar('subjective')

[('situational', 0.6981050968170166),
 ('affective', 0.6927733421325684),
 ('cognitive', 0.6842795610427856),
 ('objective', 0.6827552318572998),
 ('unconscious', 0.6807843446731567),
 ('performative', 0.6802817583084106),
 ('intersubjective', 0.6655670404434204),
 ('normative', 0.660075843334198),
 ('non-rational', 0.658427357673645),
 ('expressive', 0.6557738184928894)]

**subjectivity**

In [50]:
model1.wv.most_similar('subjectivity')
# the model picks up numbers, suggesting that subjectivity was not really used frequently in this early period

[('modesty', 0.8255383372306824),
 ('Iowa.1', 0.8158161640167236),
 ('Measurement', 0.8146536946296692),
 ('Countess', 0.8115413188934326),
 ('jingoism', 0.8095638751983643),
 ('institutionalism', 0.8089417815208435),
 ('serviceability', 0.8073192834854126),
 ('uajpqqQ', 0.8045661449432373),
 ('sealed', 0.8032026290893555),
 ('Orestes', 0.802842378616333)]

In [51]:
model2.wv.most_similar('subjectivity')

[('fallibility', 0.7131853699684143),
 ('correctness', 0.7116696834564209),
 ('specificity', 0.7049423456192017),
 ('generality', 0.6749253273010254),
 ('inapplicability', 0.6672554612159729),
 ('psychopathy', 0.6617553234100342),
 ('vagueness', 0.6614216566085815),
 ('soundness', 0.6539335250854492),
 ('distortion', 0.6529852747917175),
 ('likeness', 0.6483901739120483)]

In [52]:
model3.wv.most_similar('subjectivity')

[('reification', 0.7334219217300415),
 ('intersubjectivity', 0.7204539179801941),
 ('superego', 0.6862199306488037),
 ('sociality', 0.6741387844085693),
 ('contemplation', 0.6740444302558899),
 ('irrationality', 0.6727896332740784),
 ('transcendent', 0.6678208112716675),
 ('femininity', 0.6636308431625366),
 ('sacredness', 0.6585168838500977),
 ('intersubjective', 0.658326268196106)]

In [53]:
model4.wv.most_similar('subjectivity')

[('sociality', 0.789771556854248),
 ('selfhood', 0.7740150690078735),
 ('consciousness', 0.7684578895568848),
 ('individuality', 0.7628229856491089),
 ('self', 0.7525850534439087),
 ('reflexivity', 0.7482286691665649),
 ('historicity', 0.7472134828567505),
 ('intersubjectivity', 0.740371823310852),
 ('praxis', 0.7358789443969727),
 ('embodiment', 0.7291556596755981)]

**schema**

In [54]:
model1.wv.most_similar('schema')

[('terminology', 0.7195020914077759),
 ('keynote', 0.712675154209137),
 ('mood', 0.703575611114502),
 ('scene', 0.7011400461196899),
 ('background', 0.6937474012374878),
 ('translation', 0.6920735836029053),
 ('exposition', 0.6904094219207764),
 ('formula', 0.6882789134979248),
 ('creed', 0.6820672154426575),
 ('nationalism', 0.678968071937561)]

In [55]:
model2.wv.most_similar('schema')

[('restatement', 0.7641053795814514),
 ('sequence-pattern', 0.747427225112915),
 ('means-end', 0.7419079542160034),
 ('synthesis', 0.7343186140060425),
 ('causality', 0.7336994409561157),
 ('paradigm', 0.7165526151657104),
 ('relativity', 0.710946798324585),
 ('typology', 0.6994298696517944),
 ('formulation', 0.68601393699646),
 ('scaling', 0.6852098703384399)]

In [56]:
model3.wv.most_similar('schema')

[('typology', 0.7986904382705688),
 ('scheme', 0.7977442145347595),
 ('formulation', 0.7934887409210205),
 ('epistemology', 0.7823007106781006),
 ('conceptualization', 0.7634795308113098),
 ('paradigm', 0.756039023399353),
 ('logic', 0.7507690191268921),
 ('approach', 0.749588131904602),
 ('formulations', 0.7438548803329468),
 ('methodology', 0.7434911727905273)]

In [57]:
model4.wv.most_similar('schema')

[('scheme', 0.8380883932113647),
 ('framework', 0.7660828828811646),
 ('typology', 0.7586386203765869),
 ('model', 0.7298818826675415),
 ('conceptualization', 0.7246353626251221),
 ('formulation', 0.7202170491218567),
 ('grammar', 0.7196730375289917),
 ('concept', 0.7125412821769714),
 ('definition', 0.7045414447784424),
 ('conception', 0.7020230293273926)]

**Simmel**

In [58]:
model1.wv.most_similar('Simmel')

[('Small', 0.8609300851821899),
 ('Durkheim', 0.8544952273368835),
 ('Wundt', 0.8492485284805298),
 ('Ellwood', 0.8444584012031555),
 ('Ross', 0.8423856496810913),
 ('Baldwin', 0.8358311653137207),
 ('Sumner', 0.8232913017272949),
 ('Dewey', 0.8184419870376587),
 ('Morgan', 0.8159871101379395),
 ('McDougall', 0.8030745983123779)]

In [59]:
model2.wv.most_similar('Simmel')

[('Durkheim', 0.8674623370170593),
 ('Merton', 0.8641049861907959),
 ('Karl', 0.8611164093017578),
 ('Becker', 0.8603239059448242),
 ('Malinowski', 0.8490426540374756),
 ('Herbert', 0.8468589186668396),
 ('Dewey', 0.8459092378616333),
 ('William', 0.8436660766601562),
 ('Sumner', 0.8427450060844421),
 ('Charles', 0.8427026271820068)]

In [60]:
model3.wv.most_similar('Simmel')

[('Spencer', 0.865730881690979),
 ('Weber', 0.8276360034942627),
 ('Mannheim', 0.8234241008758545),
 ('Mead', 0.8222999572753906),
 ('Comte', 0.8141987919807434),
 ('Freud', 0.7995558977127075),
 ('Malinowski', 0.7956452369689941),
 ('Durkheim', 0.792294979095459),
 ('Saint-Simon', 0.7857382297515869),
 ('Pareto', 0.7819395065307617)]

In [61]:
model4.wv.most_similar('Simmel')

[('Durkheim', 0.8434534668922424),
 ('Weber', 0.8308014869689941),
 ('Elias', 0.8087160587310791),
 ('Castoriadis', 0.7933200597763062),
 ('Hegel', 0.7855691909790039),
 ('Mannheim', 0.78470778465271),
 ('Whitehead', 0.7726262211799622),
 ('Bergson', 0.7698306441307068),
 ('Heidegger', 0.7675184607505798),
 ('Freud', 0.7644785642623901)]

***

## End of script

***

***

## notes and draft

***

* TSNE vis https://www.kaggle.com/jeffd23/visualizing-word-vectors-with-t-sne
* TSNE vis http://lvdmaaten.github.io/tsne/
* Second Tensorboard visualization demo: https://towardsdatascience.com/training-and-visualising-word-vectors-2f946c6430f8
* Word to Vec explained: http://mccormickml.com/2016/04/19/word2vec-tutorial-the-skip-gram-model/
* Using .loc: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
* Gensim Word2Vec: https://radimrehurek.com/gensim/models/word2vec.html
* SNA in Jupyter Notebook: http://bl.ocks.org/brinrosenthal/raw/fd7d7277ce74c2b762d3a4d66326215c/
* Same but than the blog version: http://compbio.ucsd.edu/bringing-interactivity-network-visualization-jupyter-notebooks-visjs2jupyter/
* Nice word embedding blog
* Presentation Manning: https://nlp.stanford.edu/manning/talks/Simons-Institute-Manning-2017.pdf
* what about stemming?

In [None]:
# After the model is trained, it is accessible via the “wv” attribute. This is the actual word vector model in which queries can be made.

# For example, you can print the learned vocabulary of tokens (words) as follows:

words = list(model_test.wv.vocab)
print(words)

In [None]:
# You can review the embedded vector for a specific token as follows:

print(model_test['system'])

In [None]:
model_test.wv.most_similar(positive=['system', 'social'], negative=['socialism'])

In [None]:
model_test.wv.doesnt_match("system connected developed science".split())

In [None]:
model_test.wv.similarity('causal', 'relation')

In [None]:
# When getting started, you can save the learned model in ASCII format and review the contents.

# You can do this by setting binary=False when calling the save_word2vec_format() function, for example:

# model_test.wv.save_word2vec_format('model_test.bin')

In [None]:
# The saved model can then be loaded again by calling the Word2Vec.load() function. For example:
# model = Word2Vec.load('model_test.bin')

***

## Example codes from tutorial
#### https://machinelearningmastery.com/develop-word-embeddings-python-gensim/

***

In [None]:
from gensim.models import Word2Vec
from sklearn.decomposition import PCA
from matplotlib import pyplot
# define training data
sentences = [['this', 'is', 'the', 'first', 'sentence', 'for', 'word2vec'],
			['this', 'is', 'the', 'second', 'sentence'],
			['yet', 'another', 'sentence'],
			['one', 'more', 'sentence'],
			['and', 'the', 'final', 'sentence']]
# train model
model = Word2Vec(sentences, min_count=1)
# fit a 2d PCA model to the vectors
X = model[model.wv.vocab]
pca = PCA(n_components=2)
result = pca.fit_transform(X)
# create a scatter plot of the projection
pyplot.scatter(result[:, 0], result[:, 1])
words = list(model.wv.vocab)
for i, word in enumerate(words):
	pyplot.annotate(word, xy=(result[i, 0], result[i, 1]))
pyplot.show()

In [None]:
import os

# specify the main corpus path. This will be used throughout the script
MACHINE_LEARNING = "C:/Users/renswilderom/Documents/Machine learning"

#Specify working directory
os.chdir(MACHINE_LEARNING)

## Stanford GloVe embedding

In [None]:
from gensim.scripts.glove2word2vec import glove2word2vec
glove_input_file = 'glove.6B.100d.txt'
word2vec_output_file = 'glove.6B.100d.txt.word2vec'
glove2word2vec(glove_input_file, word2vec_output_file)

In [None]:
from gensim.models import KeyedVectors
# load the Stanford GloVe model
filename = 'glove.6B.100d.txt.word2vec'
model = KeyedVectors.load_word2vec_format(filename, binary=False)
# calculate: (king - man) + woman = ?
result = model.most_similar(positive=['normal', 'child'], negative=['school'], topn=1)
print(result)