# Training a Word Embedding model.

In [1]:
import gensim
import pandas as pd
import re

## Load word corpus from twitter

In [3]:
df1 = pd.read_csv('../dataset/preprocessed/mbti_no_urls.csv')
df1

Unnamed: 0,type,posts
0,INFJ,' and intj moments sportscenter not top t...
1,ENTP,'I'm finding the lack of me in these posts ver...
2,INTP,"'Good one _____ course, to which I say I ..."
3,INTJ,"'Dear INTP, I enjoyed our conversation the o..."
4,ENTJ,'You're fired.|||That's another silly misconce...
...,...,...
8670,ISFP,' just because I always think of cats as Fi d...
8671,ENFP,'So...if this thread already exists someplace ...
8672,INTP,'So many questions when i do these things. I ...
8673,INFP,'I am very conflicted right now when it comes ...


In [4]:
df2 = pd.read_csv('../dataset/twitter_dataset.csv')
df2

Unnamed: 0,Tweet_ID,Username,Text,Retweets,Likes,Timestamp
0,1,julie81,Party least receive say or single. Prevent pre...,2,25,2023-01-30 11:00:51
1,2,richardhester,Hotel still Congress may member staff. Media d...,35,29,2023-01-02 22:45:58
2,3,williamsjoseph,Nice be her debate industry that year. Film wh...,51,25,2023-01-18 11:25:19
3,4,danielsmary,Laugh explain situation career occur serious. ...,37,18,2023-04-10 22:06:29
4,5,carlwarren,Involve sense former often approach government...,27,80,2023-01-24 07:12:21
...,...,...,...,...,...,...
9995,9996,ntate,Agree reflect military box ability ever hold. ...,81,86,2023-01-15 11:46:20
9996,9997,garrisonjoshua,Born which push still. Degree sometimes contro...,73,100,2023-05-06 00:46:54
9997,9998,adriennejackson,You day agent likely region. Teacher data mess...,10,62,2023-02-27 14:55:08
9998,9999,kcarlson,Guess without successful save. Particular natu...,21,60,2023-01-09 16:09:35


## rename columns and merge to get one dataset

In [5]:
df2_renamed = df2.rename(columns={"Text":"posts"})
df2_renamed


Unnamed: 0,Tweet_ID,Username,posts,Retweets,Likes,Timestamp
0,1,julie81,Party least receive say or single. Prevent pre...,2,25,2023-01-30 11:00:51
1,2,richardhester,Hotel still Congress may member staff. Media d...,35,29,2023-01-02 22:45:58
2,3,williamsjoseph,Nice be her debate industry that year. Film wh...,51,25,2023-01-18 11:25:19
3,4,danielsmary,Laugh explain situation career occur serious. ...,37,18,2023-04-10 22:06:29
4,5,carlwarren,Involve sense former often approach government...,27,80,2023-01-24 07:12:21
...,...,...,...,...,...,...
9995,9996,ntate,Agree reflect military box ability ever hold. ...,81,86,2023-01-15 11:46:20
9996,9997,garrisonjoshua,Born which push still. Degree sometimes contro...,73,100,2023-05-06 00:46:54
9997,9998,adriennejackson,You day agent likely region. Teacher data mess...,10,62,2023-02-27 14:55:08
9998,9999,kcarlson,Guess without successful save. Particular natu...,21,60,2023-01-09 16:09:35


In [6]:
df_corpous = pd.concat([df1['posts'], df2_renamed['posts']]).to_frame(name='posts')
df_corpous

Unnamed: 0,posts
0,' and intj moments sportscenter not top t...
1,'I'm finding the lack of me in these posts ver...
2,"'Good one _____ course, to which I say I ..."
3,"'Dear INTP, I enjoyed our conversation the o..."
4,'You're fired.|||That's another silly misconce...
...,...
9995,Agree reflect military box ability ever hold. ...
9996,Born which push still. Degree sometimes contro...
9997,You day agent likely region. Teacher data mess...
9998,Guess without successful save. Particular natu...


## Remove pipe character from the dataset

In [7]:
df_corpous['posts'] = df_corpous['posts'].apply(lambda x: re.sub("\|", "", x))
df_corpous

Unnamed: 0,posts
0,' and intj moments sportscenter not top t...
1,'I'm finding the lack of me in these posts ver...
2,"'Good one _____ course, to which I say I ..."
3,"'Dear INTP, I enjoyed our conversation the o..."
4,'You're fired.That's another silly misconcepti...
...,...
9995,Agree reflect military box ability ever hold. ...
9996,Born which push still. Degree sometimes contro...
9997,You day agent likely region. Teacher data mess...
9998,Guess without successful save. Particular natu...


## Preprocess the corpus, remove stop words and punctuations

In [8]:
corpus = df_corpous['posts'].apply(gensim.utils.simple_preprocess)
corpus

0       [and, intj, moments, sportscenter, not, top, t...
1       [finding, the, lack, of, me, in, these, posts,...
2       [good, one, course, to, which, say, know, that...
3       [dear, intp, enjoyed, our, conversation, the, ...
4       [you, re, fired, that, another, silly, misconc...
                              ...                        
9995    [agree, reflect, military, box, ability, ever,...
9996    [born, which, push, still, degree, sometimes, ...
9997    [you, day, agent, likely, region, teacher, dat...
9998    [guess, without, successful, save, particular,...
9999    [body, onto, understand, team, about, product,...
Name: posts, Length: 18675, dtype: object

## Model Definition

In [10]:
model = gensim.models.Word2Vec(
    window=10,
    min_count=2,
    workers=4
)

In [11]:
model.build_vocab(corpus, progress_per=1000)

In [12]:
model.corpus_count

18675

In [14]:
model.epochs

5

## Training

In [15]:
model.train(corpus, total_examples=model.corpus_count, epochs=model.epochs)

(40266488, 52050170)

In [17]:
model.save("../models/gensim_twitter_word2vec.model")

## Testing

In [29]:
model.wv.most_similar("sweet")

[('cute', 0.7679351568222046),
 ('adorable', 0.764094889163971),
 ('awww', 0.7470967769622803),
 ('gorgeous', 0.7405201196670532),
 ('sexy', 0.7027244567871094),
 ('aww', 0.687999427318573),
 ('cuddly', 0.6878390908241272),
 ('fluffy', 0.6822195649147034),
 ('lovely', 0.6744773387908936),
 ('aw', 0.6539250016212463)]

In [36]:
similar_vec = model.wv['king'] - model.wv['man'] + model.wv['woman']

print("The result of: king - man + woman")
for result in model.wv.most_similar(positive=[similar_vec]):
    print(result[0])


The result of: king - man + woman
anne
gables
actress
anneli
elizabeth
emily
jane
princess
woman
bennet


# Train using wikipedia

In [2]:
sentences = []

with open('../dataset/wikipedia.txt', 'r') as file:
    sentences = [line.strip() for line in file]
    print(sentences[0:5])

['', 'April', '', 'April (Apr.) is the fourth month of the year in the Julian and Gregorian calendars, and comes between March and May. It is one of four months to have 30 days.', '']


In [3]:
tokenized_corpus = [gensim.utils.simple_preprocess(sentence) for sentence in sentences]
print(tokenized_corpus[0])

[]


In [4]:
model2 = gensim.models.Word2Vec(
    window=10,
    min_count=2,
    workers=4
)

In [6]:
model2.build_vocab(tokenized_corpus, progress_per=1000)

In [7]:
model2.corpus_count

2052699

In [8]:
model2.epochs

5

In [10]:
model2.train(tokenized_corpus, total_examples=model2.corpus_count, epochs=model2.epochs)

(107527243, 139687095)

In [11]:
similar_vec = model2.wv['king'] - model2.wv['man'] + model2.wv['woman']

print("The result of: king - man + woman")
for result in model2.wv.most_similar(positive=[similar_vec]):
    print(result[0])


The result of: king - man + woman
queen
king
princess
throne
consort
prince
monarch
empress
crown
heir


In [13]:
model2.wv.most_similar("sweet")

[('lemon', 0.8211219310760498),
 ('juice', 0.8011748790740967),
 ('sour', 0.7980005145072937),
 ('pineapple', 0.7959660291671753),
 ('potato', 0.7805861234664917),
 ('cake', 0.7689308524131775),
 ('spicy', 0.7681053876876831),
 ('anise', 0.7667413949966431),
 ('sweets', 0.7644729614257812),
 ('jelly', 0.763335645198822)]

In [15]:
model2.save("../models/gensim_wikipedia_word2vec.model")