# Metadata

```yaml
Topic:     Word2Vec
Author:    Qing Liu
Date:      04/30/2023
```

**Purpose:** We create word embeddings from transcript data using word2vec and visualize the results with t-SNE.

# Set Up

In [81]:
# Project locations. Only `project_dir` needs editing to run the notebook
# from a different checkout or machine; the three paths below derive from it.
project_dir = '/Users/poppy/Documents/TeachSim Project/Robertson_Semantic'
data_in = f'{project_dir}/Data/output'   # where CORPUS.csv and VOCAB.csv are read from
data_out = f'{project_dir}/Data/output'  # where W2V.csv and GENSIM_DOCS.csv are written
local_lib = f'{project_dir}/lib'

In [82]:
# Index levels of the token table (applied with set_index when CORPUS.csv is loaded).
OHCO = ['document_id', 'paragraph_id', 'sentence_id', 'token_id']
PARA = OHCO[:2]  # Paragraphs: (document_id, paragraph_id)
SENT = OHCO[:3]  # Sentences: (document_id, paragraph_id, sentence_id)
BAG = PARA       # Grouping level used to build the token lists passed to word2vec

In [90]:
# Keyword arguments forwarded to the Word2Vec constructor
w2v_params = {
    'window': 5,
    'vector_size': 200,
    'min_count': 30,
    'workers': 4,
}

In [91]:
import pandas as pd
import numpy as np
from gensim.models import word2vec
from sklearn.manifold import TSNE
import plotly_express as px

# Import `TOKENS` and convert to `DOCS` for Gensim

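We import the token table (`CORPUS.csv`) and the vocabulary table (`VOCAB.csv`) of the transcripts corpus. Note that the filter applied below drops tokens whose POS tag is exactly `NN`, `NNS`, `VB`, or `VBS`; it does not exclude proper nouns.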

In [92]:
# Read the token table and index it by the OHCO levels.
tokens_path = f'{data_in}/CORPUS.csv'
TOKENS = pd.read_csv(tokens_path).set_index(OHCO)

In [93]:
# Read the vocabulary table and index it by term string.
vocab_path = f'{data_in}/VOCAB.csv'
VOCAB = pd.read_csv(vocab_path).set_index('term_str')

In [94]:
# Preview the first rows of the token table.
TOKENS.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,term_str,POS,speaker,coach_flag,stopword_flag,unimportant_flag
document_id,paragraph_id,sentence_id,token_id,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
02_004_22c_100,1,1,3,tell,VB,Kristyn,1,0,0
02_004_22c_100,1,1,8,feel,VB,Kristyn,1,0,0
02_004_22c_100,1,2,2,give,VB,Kristyn,1,0,0
02_004_22c_100,1,2,5,thoughts,NNS,Kristyn,1,0,0
02_004_22c_100,1,3,3,vent,VB,Kristyn,1,0,0


In [95]:
# Keep only the tokens that carry a POS tag
TOKENS = TOKENS[TOKENS['POS'].notna()]

In [96]:
# Flag tokens whose POS tag is exactly NN, NNS, VB, or VBS; these are dropped.
tagged_nn_or_vb = TOKENS.POS.str.match(r'^(NN|VB)S?$')

# Collect the remaining terms into one list per bag (see BAG).
terms_per_bag = TOKENS[~tagged_nn_or_vb].groupby(BAG).term_str.apply(list)

# Lose single word docs (and empty ones): keep lists with more than one term.
DOCS = [terms for terms in terms_per_bag.reset_index()['term_str'].tolist() if len(terms) > 1]

In [97]:
# Display the per-bag term lists.
DOCS

[['right', 'real', 'quick', 'lucas', 'start', 'five'],
 ['well',
  'black',
  'like',
  'saw',
  'thought',
  'interrupting',
  'like',
  '30',
  'like',
  'well',
  'hopefully',
  'said',
  'hopefully',
  'sees',
  'raising',
  'yes',
  'saw',
  'got',
  'went'],
 ['oh',
  'suppose',
  'second',
  'able',
  'really',
  'sure',
  'shifting',
  'going',
  'next',
  'makes',
  'gave',
  'asked',
  'want',
  'like',
  'one',
  'sure',
  'thinking',
  'like',
  'critical',
  'next',
  'smoothly'],
 ['thought',
  'totally',
  'fine',
  'like',
  'asked',
  'one',
  'thought',
  'nicely',
  'think',
  'went',
  'next',
  'like',
  'well',
  'know',
  'got',
  'thinks',
  'us',
  'two',
  'actually',
  'thought',
  'really',
  'great'],
 ['one', 'think', 'identified', 'need', 'correct', 'might', 'right'],
 ['oh',
  'lisa',
  'really',
  'one',
  'said',
  'well',
  'clear',
  'also',
  'stated',
  'one',
  'cannot',
  'exact',
  'one',
  'covered',
  'could',
  'blown'],
 ['paragraph', 'four'

# Generate word embeddings with Gensim's library

In [98]:
# Train word2vec on the per-bag term lists using the parameters set in w2v_params.
model = word2vec.Word2Vec(sentences=DOCS, **w2v_params)

# Visualize with tSNE

## Get model coordinates to plot

In [99]:
# One row per vocabulary term, holding that term's embedding vector.
vocab_terms = list(model.wv.key_to_index)
coords = pd.DataFrame(
    {'vector': [model.wv.get_vector(term) for term in vocab_terms]},
    index=pd.Index(vocab_terms, name='term_str'),
)

In [100]:
# coords

In [101]:
# TFM = coords.apply(lambda x: pd.Series(x.vector), 1)

## Use ScikitLearn's TSNE library

In [102]:
# Project the embedding vectors to two dimensions; random_state is fixed.
# NOTE(review): newer scikit-learn releases rename `n_iter` — confirm against the installed version.
tsne_engine = TSNE(
    n_components=2,
    perplexity=20,
    init='pca',
    n_iter=2500,
    random_state=23,
)
term_vectors = coords.vector.to_list()
tsne_model = tsne_engine.fit_transform(term_vectors)


The default learning rate in TSNE will change from 200.0 to 'auto' in 1.2.


The PCA initialization in TSNE will change to have the standard deviation of PC1 equal to 1e-4 in 1.2. This will ensure better convergence.



In [103]:
# Attach the two t-SNE output columns as plotting coordinates.
coords = coords.assign(x=tsne_model[:, 0], y=tsne_model[:, 1])

## Add some vocab features 

In [104]:
# Merge the VOCAB features onto the coordinates exactly once.
# The guard checks for VOCAB columns that coords does not have yet, instead of
# relying on coords having exactly three columns, so the cell stays safe to
# re-run and keeps working if another column is added to coords beforehand.
missing_vocab_cols = VOCAB.columns.difference(coords.columns)
if len(missing_vocab_cols) > 0:
    coords = coords.merge(VOCAB.reset_index(), on='term_str')
    coords = coords.set_index('term_str')

In [105]:
# Keep only the rows whose `stop` value equals 0.
coords = coords.loc[coords['stop'].eq(0)]

In [106]:
# Display the coordinate table.
coords

Unnamed: 0_level_0,vector,x,y,n,p,i,n_chars,max_pos,n_pos,cat_pos,...,stem_porter,stem_snowball,stem_lancaster,df,idf,tfidf_mean,tfidf_sum,tfidf_median,tfidf_max,dfidf
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
like,"[0.11971838, -0.07008712, -0.05955531, 0.22899...",20.752319,1.197536,3702,0.043972,4.507255,4,IN,8,"{'VBN', 'VBZ', 'IN', 'VBP', 'NN', 'RB', 'JJ', ...",...,like,like,lik,1338,1.554451,0.399627,1570.532532,0.0,1.554451,2079.855701
going,"[0.12923764, -0.026212031, -0.043982964, 0.137...",6.025471,-9.262965,1298,0.015418,6.019269,5,VBG,1,{'VBG'},...,go,go,going,724,2.440468,0.269174,1057.854836,0.0,2.440468,1766.898622
think,"[0.11531352, -0.046753038, -0.059350356, 0.176...",-0.752450,1.500181,1856,0.022046,5.503363,5,VBP,3,"{'VB', 'VBP', 'NN'}",...,think,think,think,1114,1.818780,0.324497,1275.274595,0.0,1.818780,2026.121009
really,"[0.11381642, -0.054221667, -0.053556327, 0.179...",19.863630,-0.115884,1081,0.012840,6.283193,6,RB,1,{'RB'},...,realli,realli,real,598,2.716312,0.215798,848.086246,0.0,2.716312,1624.354530
want,"[0.08789829, -0.031320177, -0.033304468, 0.148...",19.251364,-1.619846,1294,0.015370,6.023722,4,VBP,3,"{'VB', 'VBP', 'NN'}",...,want,want,want,720,2.448461,0.237532,933.499400,0.0,2.448461,1762.891561
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
interrupting,"[0.09201156, -0.03897764, -0.03003447, 0.13068...",-21.039642,20.524584,33,0.000392,11.316950,12,VBG,3,"{'VBG', 'NN', 'JJ'}",...,interrupt,interrupt,interrupt,28,7.132959,0.024497,96.275128,0.0,7.132959,199.722843
something,"[0.08326203, -0.029924268, -0.035703037, 0.128...",-20.421303,20.029104,505,0.005998,7.381205,9,NN,3,"{'VBG', 'NN', 'JJ'}",...,someth,someth,someth,381,3.366666,0.127329,500.402947,0.0,3.366666,1282.699902
focused,"[0.10309715, -0.040864296, -0.041540153, 0.149...",-10.998887,12.908990,30,0.000356,11.454454,7,VBN,3,"{'VBN', 'VBD', 'JJ'}",...,focus,focus,focus,27,7.185426,0.017265,67.851211,0.0,7.185426,194.006505
found,"[0.0853597, -0.027728783, -0.041656997, 0.1294...",-20.366478,21.274229,31,0.000368,11.407148,5,VBD,4,"{'VBN', 'VBP', 'NN', 'VBD'}",...,found,found,found,30,7.033423,0.021760,85.517720,0.0,7.033423,211.002690


## Plot the coordinates

### Interpretation:
The bottom cluster in the plot, which is centered around the word "Lisa", indicates the presence of discussions in a text-based scenario. In this particular scenario, Lisa is a character who serves as a point of reference for the student avatars, and the preservice teacher is responsible for providing constructive feedback to the students based on their perspectives of Lisa. An example of such a perspective is given in the original text, where Lisa is portrayed as being excited about her new job. The student avatars in the discussion are named Ava and Jasmine, and they are expressing their thoughts and opinions about Lisa based on the information provided in the text.

In contrast, the cluster on the right, centered around the word "Ethan", indicates off-task behaviors from Ethan, such as "Ethan is drumming", "Ethan is whistling", and "Ethan plays Darth Vader".

In [116]:
# Scatter plot of the t-SNE coordinates: one labelled marker per term,
# coloured by `max_pos` and sized by `tfidf_max`.
fig = px.scatter(
    coords.reset_index(),
    x='x',
    y='y',
    text='term_str',
    color='max_pos',
    hover_name='term_str',
    title='Figure 7. Word cluster',
    size='tfidf_max',
    height=600,
)
label_font = dict(color='black', size=14, family='Arial')
fig.update_traces(
    mode='markers+text',
    textfont=label_font,
    textposition='top center',
)
fig

# Semantic Algebra

## Analogies

$A : B :: C : D? \rightarrow B - A + C = D$


In [109]:
def complete_analogy(A, B, C, n=2):
    """Solve the analogy ``A : B :: C : ?`` with the trained word2vec model.

    The query is ``B - A + C``: ``B`` and ``C`` are passed to
    ``model.wv.most_similar`` as positive terms and ``A`` as the negative term.

    Parameters
    ----------
    A, B, C : str
        The three known terms of the analogy.
    n : int, default 2
        Number of candidate answers to return.

    Returns
    -------
    pandas.DataFrame or None
        A frame with columns ``term`` and ``sim``, one row per candidate.
        ``None`` is returned (after printing the error) when the lookup raises
        ``KeyError``, e.g. for a term that is not in the model's vocabulary.
    """
    cols = ['term', 'sim']
    # Request at least n neighbours. Slicing gensim's default top-10 list
    # silently capped the result at 10 rows whenever n > 10.
    topn = 10 if n is None else max(10, n)
    try:
        candidates = model.wv.most_similar(positive=[B, C], negative=[A], topn=topn)
    except KeyError as e:
        print('Error:', e)
        return None
    return pd.DataFrame(candidates[0:n], columns=cols)
    
def get_most_similar(positive, negative=None, n=10):
    """Return the terms most similar to a query as a two-column table.

    Parameters
    ----------
    positive : str or list
        Term(s) passed to ``model.wv.most_similar`` as ``positive``.
    negative : str or list, optional
        Term(s) passed to ``model.wv.most_similar`` as ``negative``.
    n : int, default 10
        Number of neighbours to request (``topn``); the default matches
        gensim's own default, so existing calls are unaffected.

    Returns
    -------
    pandas.DataFrame
        Columns ``term`` and ``sim``, one row per neighbour.
    """
    neighbours = model.wv.most_similar(positive=positive, negative=negative, topn=n)
    return pd.DataFrame(neighbours, columns=['term', 'sim'])

In [110]:
# Analogy query ethan : drumming :: lisa : ? — request 5 candidates.
complete_analogy('ethan', 'drumming', 'lisa', 5)

Unnamed: 0,term,sim
0,feeling,0.985872
1,says,0.982758
2,lie,0.981539
3,pismo,0.981268
4,nervous,0.980813


In [111]:
# Analogy query stop : redirecting :: text : ? — request 5 candidates.
complete_analogy('stop', 'redirecting', 'text', 5)

Unnamed: 0,term,sim
0,using,0.996231
1,probing,0.995938
2,thank,0.995526
3,thought,0.99536
4,sharing,0.995341


## Similarities

In [120]:
# Print the table caption, then show the terms most similar to 'ethan'.
print("Table 2a. Similar words for 'ethan'")
get_most_similar('ethan')

Table 2a. Similar words for 'ethan'


Unnamed: 0,term,sim
0,making,0.998582
1,whistling,0.998301
2,talking,0.997219
3,quiet,0.996824
4,stop,0.996718
5,specific,0.996617
6,instead,0.996586
7,need,0.996495
8,darth,0.996325
9,drumming,0.996221


In [121]:
# Print the table caption, then show the terms most similar to 'lisa'.
print("Table 2b. Similar words for 'lisa'")
get_most_similar('lisa')

Table 2b. Similar words for 'lisa'


Unnamed: 0,term,sim
0,feeling,0.991205
1,lie,0.989978
2,pizmo,0.987431
3,pismo,0.987161
4,brings,0.986692
5,nervous,0.986241
6,says,0.985507
7,relaxed,0.985117
8,22,0.984527
9,likely,0.983045


In [122]:
# Terms most similar to 'redirecting'.
get_most_similar('redirecting')

Unnamed: 0,term,sim
0,immediately,0.999183
1,necessarily,0.99917
2,around,0.99913
3,timely,0.999125
4,specifically,0.999112
5,use,0.999103
6,learning,0.999099
7,working,0.999092
8,real,0.999084
9,less,0.999076


In [123]:
# Terms most similar to 'textual'.
get_most_similar('textual')

Unnamed: 0,term,sim
0,using,0.997023
1,wrong,0.995692
2,probing,0.99565
3,text,0.99479
4,thank,0.994137
5,sharing,0.994004
6,jasmine,0.993293
7,incorrect,0.993209
8,correct,0.992814
9,get,0.992496


# Save

In [124]:
# Write the normalized word vectors, one row per vocabulary term.
W2V = pd.DataFrame(model.wv.get_normed_vectors(), index=model.wv.key_to_index)
W2V.to_csv(f'{data_out}/W2V.csv')

# Write the per-bag term lists that were fed to gensim (no index, no header).
docs_series = pd.Series(DOCS)
docs_series.to_csv(f'{data_out}/GENSIM_DOCS.csv', index=False, header=False)