## Importing libraries

In [2]:
import numpy as np
import pandas as pd
import nltk
import time
import pickle

In [3]:
# Import gensim and libraries

import re
import gensim
from gensim.models import Word2Vec, KeyedVectors, FastText
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.test.utils import datapath

In [None]:
import nltk, re, spacy
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.corpus import wordnet as wn
# Load the small English pipeline with the NER and parser components switched
# off. `disable` expects one component name per list element; the previous
# single string 'ner,parser' named no real component, so both were loaded in
# full and only dropped afterwards through explicit remove_pipe() calls.
nlp = spacy.load('en_core_web_sm', disable=['ner', 'parser'])

In [None]:
pd.set_option('display.max_colwidth', None)

# Word Embedding Tasks

## Task 1: Create your own word embeddings

In [None]:
fairy_data = pd.read_csv("fairy_tale.csv", low_memory=False)
fairy_data.head(1)

In [None]:
fairy_data.columns

### Number of instances in the dataset, including rows with blank sentences

In [None]:
len(fairy_data)

In [None]:
# Keep only the rows whose "sentences" value is present. The fairy-tale frame
# is filtered directly: the previous code indexed `chatgpt_train`, a name that
# is not defined anywhere in the visible notebook, so it raised NameError on a
# fresh kernel. dropna() is idempotent, so re-running this cell is harmless.
fairy_data = fairy_data.dropna(subset=["sentences"])

### Number of instances in the dataset after dropping rows with blank sentences

In [None]:
len(fairy_data)

In [None]:
fairy_data['sentences']

In [None]:
sentences = []

# file from which to generate word embeddings
# NOTE(review): this cell trains on event_female.csv rather than on the
# fairy-tale frame, and `sentences` is reassigned in the skip-gram section
# below - confirm that this is intended.
filename = "event_female.csv"
with open(filename, 'rb') as file:
    for line in file:
        # Decode BEFORE lower-casing: bytes.lower() only touches ASCII letters,
        # so non-ASCII capitals would otherwise survive in the vocabulary.
        text = line.decode('utf-8').lower()
        # this file is already tokenized, so we can split on whitespace.
        # str.split() with no argument collapses any run of whitespace
        # (space, tab, newline, etc.) and never produces empty-string tokens.
        words = text.split()
        if words:  # skip blank lines instead of adding an empty sentence
            sentences.append(words)

model_wiki = Word2Vec(
        sentences,
        vector_size=100,
        window=5,
        min_count=2,
        workers=10)

my_trained_vectors = model_wiki.wv

# save vectors to file if you want to use them later
#my_trained_vectors.save_word2vec_format('Datasets/wordembeddings/embeddings.txt', binary=False)

### Skip-gram word embeddings

In [None]:
# Tokenise every entry of the "sentences" column on whitespace, giving one
# list of tokens per row.
sentences = list(map(str.split, fairy_data['sentences']))

In [None]:
# Train Word2Vec on the tokenised sentences with sg=1.
skipgram_model = Word2Vec(
    sentences=sentences,
    sg=1,
    vector_size=100,
    window=5,
    min_count=2,
    workers=10,
)

In [None]:
trained_wv_skipgrams = skipgram_model.wv

### CBOW word embeddings

In [None]:
# Train Word2Vec on the tokenised sentences with sg=0.
cbow_model = Word2Vec(
    sentences=sentences,
    sg=0,
    vector_size=100,
    window=5,
    min_count=2,
    workers=10,
)

In [None]:
trained_wv_cbow = cbow_model.wv

### FastText word embeddings

In [None]:
# Train FastText on the same tokenised sentences and hyper-parameters.
fasttext_model = FastText(
    sentences=sentences,
    vector_size=100,
    window=5,
    min_count=2,
    workers=10,
)

In [None]:
trained_wv_fasttext = fasttext_model.wv

In [None]:
def find_similar_words(model, word, topn=20):
    """Return the `topn` nearest neighbours of `word`, or [] if the lookup fails.

    Parameters
    ----------
    model : a trained model that exposes its vectors as `model.wv`, or the
        vectors object itself (anything providing `most_similar`).
    word : the query token.
    topn : number of neighbours to request (default 20).

    Returns
    -------
    Whatever `most_similar(word, topn=topn)` returns, or an empty list when
    that call raises KeyError.
    """
    # Accept both a full model (vectors under `.wv`) and a bare vectors
    # object, so one helper serves trained and pretrained embeddings.
    vectors = getattr(model, "wv", model)
    try:
        return vectors.most_similar(word, topn=topn)
    except KeyError:
        return []

In [None]:
word_list = ["man", "woman", "king", "queen"]

### Top 20 similar words using Skip-gram word embeddings

In [None]:
# For each query word: header line, neighbour list from the skip-gram model,
# then a separator.
for word in word_list:
    print(f"Similar words for '{word}' using Skip-gram: \n")
    neighbours = find_similar_words(skipgram_model, word)
    print(neighbours)
    print("----------------------------------------------\n")

### Top 20 similar words using CBOW word embeddings

In [None]:
# For each query word: header line, neighbour list from the CBOW model,
# then a separator.
for word in word_list:
    print(f"Similar words for '{word}' using CBOW: \n")
    neighbours = find_similar_words(cbow_model, word)
    print(neighbours)
    print("----------------------------------------------\n")

### Top 20 similar words using FastText word embeddings

In [None]:
# For each query word: header line, neighbour list from the FastText model,
# then a separator.
for word in word_list:
    print(f"Similar words for '{word}' using Fast Text: \n")
    neighbours = find_similar_words(fasttext_model, word)
    print(neighbours)
    print("----------------------------------------------\n")

## Task 2: Use pretrained word embeddings

In [None]:
# Convert the GloVe vectors into word2vec text format. The conversion writes
# a new file; its return value is not needed, so it is bound to `_`.

glove_in_w2v_format = "Datasets/wordembeddings/glove.6B.100d.100K.w2v.txt"
glove_file = "Datasets/wordembeddings/glove.6B.100d.100K.txt"
_ = glove2word2vec(glove_file, glove_in_w2v_format)

In [None]:
glove_model = KeyedVectors.load_word2vec_format("Datasets/wordembeddings/glove.6B.100d.100K.w2v.txt", binary=False)

In [None]:
def find_similar_words_glove(model, word, topn=20):
    """Look up the `topn` nearest neighbours of `word` directly on `model`.

    `model` must provide `most_similar(word, topn=...)`. An empty list is
    returned when that call raises KeyError.
    """
    neighbours = []
    try:
        neighbours = model.most_similar(word, topn=topn)
    except KeyError:
        pass
    return neighbours

In [None]:
# Query the GloVe vectors for the neighbours of "man", "woman", "king" and "queen".
word_list = ["man", "woman", "king", "queen"]

for word in word_list:
    print(f"Similar words for '{word}' using GloVe: \n")
    neighbours = find_similar_words_glove(glove_model, word)
    print(neighbours)
    print("\n")
    print("----------------------------------------------\n")

# Topic Modeling Tasks

## Task 1: Data preprocessing

In [None]:
fairy_data = pd.read_csv("fairy_tale.csv", low_memory=False)
fairy_data.head(1)

In [None]:
# Keep only the rows whose "sentences" value is present. The freshly loaded
# fairy-tale frame is filtered directly: the previous code indexed
# `chatgpt_train`, a name that is not defined anywhere in the visible
# notebook, so it raised NameError on a fresh kernel.
fairy_data = fairy_data.dropna(subset=["sentences"])

In [None]:
len(fairy_data)

In [None]:
# Rows whose "gender" column equals 'male'; show the first one as a sanity check.
is_male = fairy_data['gender'].eq('male')
male_chars = fairy_data.loc[is_male]
male_chars.head(1)

### Number of male characters in the dataset

In [None]:
len(male_chars)

In [None]:
# Rows whose "gender" column equals 'female'; show the first one as a sanity check.
is_female = fairy_data['gender'].eq('female')
female_chars = fairy_data.loc[is_female]
female_chars.head(1)

### Number of female characters in the dataset

In [None]:
len(female_chars)

## Task 2: Topic modeling for male characters.