In [1]:
import pandas as pd
import nltk
import string
import matplotlib.pyplot as plt
import numpy as np

from nltk.corpus import stopwords
from nltk import word_tokenize
nltk.download('punkt')
nltk.download('stopwords')
from gensim.models import Word2Vec as w2v
from sklearn.model_selection import KFold

PATH = 'input_text.txt'
sw = stopwords.words('english')

lines = []
with open(PATH, 'r') as f:
    for l in f:
        lines.append(l)

print(lines[1])

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!






In [2]:
# remove new lines
lines = [line.rstrip() for line in lines]

# make all characters lower
lines = [line.lower() for line in lines]

# remove punctuations from each line
lines = [line.translate(str.maketrans('', '', string.punctuation)) for line in lines]

# tokenize
lines = [word_tokenize(line) for line in lines]

def remove_stopwords(lines, sw = sw):
    res = []
    for line in lines:
        original = line
        line = [w for w in line if w not in sw]
        if len(line) < 1:
            line = original
        res.append(line)
    return res

filtered_lines = remove_stopwords(lines = lines, sw = sw)

In [3]:
n_folds = 10

# Initialize KFold
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

# Initialize list to store results
results = []

# Loop over folds
for train_index, test_index in kf.split(filtered_lines):
    # Split data into train and test sets
    train_data = [filtered_lines[i] for i in train_index]
    test_data = [filtered_lines[i] for i in test_index]

    # Train model on train data
    w1 = w2v(
        train_data,
        min_count=3,
        sg = 1,
        window=7
    )

    # Evaluate model on test data
    fold_results_1 = [r[0] for r in w1.wv.most_similar('moon')]
    print(f"Results for Train Set: {fold_results_1}")
    results.append(fold_results_1)


Results for Train Set: ['variation', 'leads', 'interactions', 'eyes', 'unique', 'reactions', 'embryos', 'core', 'components', 'pure']
Results for Train Set: ['skull', 'reactions', 'leads', 'components', 'skin', 'cure', 'mature', 'eyes', 'suggesting', 'chains']
Results for Train Set: ['method', 'eyes', 'leads', 'object', 'distinguish', 'substrate', 'symbolic', 'variable', 'embryos', 'interpreted']
Results for Train Set: ['leads', 'hidden', 'eyes', 'detail', 'skin', 'usually', 'interactions', 'attendants', 'variation', 'means']
Results for Train Set: ['leads', 'eyes', 'substrate', 'useful', 'attendants', 'cure', 'interactions', 'skin', 'components', 'breast']
Results for Train Set: ['variation', 'eyes', 'flesh', 'skin', 'leads', 'sequence', 'taste', 'object', 'pocket', 'mature']
Results for Train Set: ['supernatural', 'detail', 'gives', 'therapy', 'difference', 'breaks', 'object', 'allows', 'looks', 'rather']
Results for Train Set: ['variation', 'leads', 'hidden', 'eyes', 'attendants', '

In [4]:
w2 = w2v(
        filtered_lines,
        min_count=3,
        sg = 1,
        window=7
    )

# Evaluate model on test data
fold_results_2 = [r[0] for r in w2.wv.most_similar('moon')]
print(f"The Actual Fold_results are {fold_results_2}")
error_sum = 0
for i in results:
  err = len(list(set(i) & set(fold_results_2)))/10
  print(f"Error for this set is {err}")
  error_sum += err


The Actual Fold_results are ['leads', 'variation', 'attendants', 'detail', 'variable', 'might', 'riding', 'method', 'skull', 'object']
Error for this set is 0.2
Error for this set is 0.2
Error for this set is 0.4
Error for this set is 0.4
Error for this set is 0.2
Error for this set is 0.3
Error for this set is 0.2
Error for this set is 0.3
Error for this set is 0.4
Error for this set is 0.4


In [5]:

# Print average result over all folds
print(f"The average error over the training sets : {error_sum/n_folds}")

emb_df = (
    pd.DataFrame(
        [w2.wv.get_vector(str(n)) for n in w2.wv.key_to_index],
        index = w2.wv.key_to_index
    )
)
print(emb_df.shape)

The average error over the training sets : 0.3
(16323, 100)
