In [None]:
import spacy
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Read the comma-separated Pride and Prejudice (1995) dataset into a DataFrame.
df = pd.read_csv('../input/pride-prejudice-clean-dataset/pride_prejudice(1995).csv', delimiter = ',')
# Bare expression so the notebook renders the frame as the cell output.
df

In [None]:
def cleaning(doc):
    """Reduce one parsed document to a space-joined string of lowercase lemmas.

    Tokens flagged as stop words and tokens that are a single character long
    (a cheap way of discarding most punctuation) are skipped. Every remaining
    token contributes its lemma, lowercased.

    Args:
        doc: An iterable of tokens exposing ``is_stop``, ``lemma_`` and
            ``len()`` (the notebook passes spaCy documents).

    Returns:
        The kept lemmas joined by single spaces, or ``None`` when two or
        fewer lemmas survive the filtering. Note that the threshold is
        "more than two", i.e. at least three lemmas are required.
    """
    lemmas = []
    for token in doc:
        # Skip stop words and one-character tokens.
        if token.is_stop or len(token) <= 1:
            continue
        lemmas.append(token.lemma_.lower())

    # Very short sentences carry too little co-occurrence information for word2vec.
    if len(lemmas) <= 2:
        return None
    return " ".join(lemmas)

def process(df, column="Pride and Prejudice (1995)"):
    """Turn the text column of ``df`` into tokenised sentences for Gensim.

    Each non-missing cell of ``df[column]`` is treated as one document. The
    documents are streamed through the module-level ``nlp`` pipeline, reduced
    with ``cleaning`` and split back into lists of words.

    Args:
        df: DataFrame holding the texts.
        column: Name of the column that contains the texts. Defaults to the
            column used by this notebook's dataset.

    Returns:
        A list of sentences, each sentence being a list of word strings.
        Documents for which ``cleaning`` returns ``None`` are left out.

    Raises:
        KeyError: If ``column`` is not present in ``df`` -- check the
            DataFrame's column names.
    """
    # Missing cells arrive from pandas as NaN (a float), which the pipeline
    # cannot parse as text, so drop them and make sure everything is a string.
    docs = df[column].dropna().astype(str).tolist()

    # nlp.pipe processes the documents in batches; n_process=-1 is passed through
    # unchanged from the original notebook.
    cleaned = (cleaning(doc) for doc in nlp.pipe(docs, batch_size=500, n_process=-1))

    # Gensim expects each sentence as a list of tokens.
    return [text.split() for text in cleaned if text is not None]

In [None]:
# Load the "en_core_web_sm" spaCy pipeline with the "ner" and "parser" components disabled.
# `process` reads this module-level `nlp`, so this assignment must run before `process(df)`.
nlp = spacy.load("en_core_web_sm", disable = ["ner","parser"])

## now process our data: `sentences` is a list of token lists, ready for Gensim
sentences = process(df)

In [None]:
## first we initialize the model (no training happens yet)
from gensim.models import Word2Vec

model = Word2Vec(
    min_count = 1, ## minimum number of occurrences a word needs to be kept; 1 keeps every word
    window = 4, ## context window: words up to 4 positions away on either side count as co-occurring
    workers = 4 ## number of worker threads used for training
)

## build the vocabulary from the cleaned sentences; progress_per sets how often progress is reported
model.build_vocab(sentences, progress_per = 50) # `sentences` is what process(df) returned after spaCy did its work

In [None]:
## now we train the model on our data for 30 epochs;
## total_examples reuses the sentence count recorded on the model (model.corpus_count)

model.train(sentences, total_examples = model.corpus_count, epochs = 30)

## This might take quite some time again

In [None]:
# first, save the model -- note that only the word vectors (model.wv) are written to disk

model_savename = "model95.w2v"
model.wv.save(model_savename)

In [None]:
## ok now the vectors are saved, so let's reload them
## we need to import KeyedVectors from Gensim to reload what model.wv.save wrote
## NOTE: this rebinds `model` from the trained Word2Vec object to the loaded KeyedVectors;
## every later cell uses this reloaded object

from gensim.models import KeyedVectors

model = KeyedVectors.load("model95.w2v")

In [None]:
# Show the vocabulary of the loaded vectors. Gensim 4 removed `KeyedVectors.vocab`
# (accessing it raises AttributeError) in favour of `key_to_index`, so prefer the
# new attribute and fall back to `.vocab` on Gensim 3.x.
model.key_to_index if hasattr(model, "key_to_index") else model.vocab

In [None]:
## let's make a table of our vocabulary with frequency

## collect one (word, corpus frequency) pair per vocabulary entry.
## Gensim 4 replaced `model.vocab[word].count` with `model.get_vecattr(word, "count")`
## (the old attribute now raises AttributeError), so support both APIs.
if hasattr(model, "key_to_index"):
    records = [(word, model.get_vecattr(word, "count")) for word in model.key_to_index]
else:
    records = [(word, entry.count) for word, entry in model.vocab.items()]

## make a table with some pandas magic, most frequent lemmas first
model_vocab_df = pd.DataFrame.from_records(records, columns = ["lemma","frequency"])
model_vocab_df.sort_values("frequency", ascending=False)

In [None]:
def naive_projection(x_axis, y_axis, test_words, model, plot_size=10):
    """Scatter-plot words on two axes, each spanned by one pair of antonyms.

    For every word the coordinate on an axis is
    ``model.distance(axis[0], word) - model.distance(axis[1], word)``, so a
    positive value means the word is closer to ``axis[1]`` than to ``axis[0]``.
    Each point is annotated with its word and the axes are labelled
    ``"<axis[0]> --- <axis[1]>"``.

    Args:
        x_axis: Sequence of exactly two words spanning the horizontal axis.
        y_axis: Sequence of exactly two words spanning the vertical axis.
        test_words: Words to place on the plot.
        model: Object exposing ``distance(word_a, word_b)`` (the notebook
            passes the reloaded Gensim KeyedVectors).
        plot_size: Width and height of the square figure.

    Returns:
        None. The figure is shown with ``plt.show()``.

    Raises:
        ValueError: If ``x_axis`` or ``y_axis`` does not hold exactly two
            words. Previously this case only printed a message and then
            failed later with an unrelated UnboundLocalError.
        Any exception raised by ``model.distance`` propagates unchanged.
    """
    # Fail fast and explicitly on malformed axes instead of printing and carrying on.
    if len(x_axis) != 2:
        raise ValueError("You must only have two antonyms in your x-axis")
    if len(y_axis) != 2:
        raise ValueError("You must only have two antonyms in your y-axis")

    x = list()
    y = list()
    for word in test_words:
        # Difference of distances to the two poles of each axis.
        x.append(model.distance(x_axis[0], word) - model.distance(x_axis[1], word))
        y.append(model.distance(y_axis[0], word) - model.distance(y_axis[1], word))

    fig, ax = plt.subplots(figsize=(plot_size, plot_size))

    # One scatter call per word so each word gets its own point and label.
    for word, x_val, y_val in zip(test_words, x, y):
        ax.scatter(x_val, y_val)
        ax.annotate(word, (x_val, y_val))

    ax.set_xlabel(x_axis[0] + " --- " + x_axis[1])
    ax.set_ylabel(y_axis[0] + " --- " + y_axis[1])

    plt.show()

In [None]:
# Horizontal axis: "mr" vs "miss"; vertical axis: "rich" vs "poor".
x_axis = ["mr", "miss"]
y_axis = ["rich", "poor"]

# Words to place on the plot.
test_words = [
    "elizabeth", "bennet", "fitzwilliam", "darcy",
    "bourgh", "jane", "bingley", "caroline",
    "wickham", "lydia", "gardiner", "william",
    "charlotte", "georgiana", "mary", "catherine",
]

naive_projection(x_axis=x_axis, y_axis=y_axis, test_words=test_words, model=model)

In [None]:
# Horizontal axis: "man" vs "woman"; vertical axis: "fortune" vs "debt".
x_axis = ["man", "woman"]
y_axis = ["fortune", "debt"]

# Words to place on the plot.
test_words = [
    "elizabeth", "bennet", "fitzwilliam", "darcy",
    "bourgh", "jane", "bingley", "caroline",
    "wickham", "lydia", "gardiner", "william",
    "charlotte", "georgiana", "mary", "catherine",
]

naive_projection(x_axis=x_axis, y_axis=y_axis, test_words=test_words, model=model)

In [None]:
# Horizontal axis: "gentleman" vs "lady"; vertical axis: "lord" vs "servant".
x_axis = ["gentleman", "lady"]
y_axis = ["lord", "servant"]

# Words to place on the plot.
test_words = [
    "elizabeth", "bennet", "fitzwilliam", "darcy",
    "jane", "bingley", "caroline", "wickham",
    "lydia", "gardiner", "william", "charlotte",
    "georgiana", "mary", "bourgh", "catherine",
]

naive_projection(x_axis=x_axis, y_axis=y_axis, test_words=test_words, model=model)

In [None]:
# Horizontal axis: "sister" vs "brother"; vertical axis: "honour" vs "disgrace".
x_axis = ["sister", "brother"]
y_axis = ["honour", "disgrace"]

# Words to place on the plot.
test_words = [
    "elizabeth", "bennet", "fitzwilliam", "darcy",
    "bourgh", "jane", "bingley", "caroline",
    "wickham", "lydia", "gardiner", "william",
    "charlotte", "georgiana", "mary", "catherine",
]

naive_projection(x_axis=x_axis, y_axis=y_axis, test_words=test_words, model=model)

In [None]:
def advanced_projection(x_dimensions, y_dimensions, test_words, model, plot_size=10, xlab="label", ylab = "label"):
    """Scatter-plot words on two axes, each averaged over several word pairs.

    For every word and every pair ``[first, second]`` in a dimension list the
    value ``model.distance(first, word) - model.distance(second, word)`` is
    computed; the word's coordinate on that axis is the mean of those values
    over all pairs of the axis. Each point is annotated with its word.

    Args:
        x_dimensions: Non-empty sequence of two-word pairs for the horizontal axis.
        y_dimensions: Non-empty sequence of two-word pairs for the vertical axis.
        test_words: Words to place on the plot.
        model: Object exposing ``distance(word_a, word_b)`` (the notebook
            passes the reloaded Gensim KeyedVectors).
        plot_size: Width and height of the square figure.
        xlab: Label for the horizontal axis.
        ylab: Label for the vertical axis.

    Returns:
        None. The figure is shown with ``plt.show()``.

    Raises:
        ValueError: If a dimension list is empty or contains an entry that is
            not exactly two words long.
        Any exception raised by ``model.distance`` propagates unchanged.
    """
    # Imported here so the function does not depend on a later notebook cell
    # having already executed `import statistics`.
    import statistics

    # Validate up front so a malformed pair fails with a clear message instead
    # of an IndexError (too short) or being silently truncated (too long).
    for axis_name, dimensions in (("x", x_dimensions), ("y", y_dimensions)):
        if len(dimensions) == 0:
            raise ValueError("You must give at least one word pair for the " + axis_name + "-axis")
        for dim in dimensions:
            if len(dim) != 2:
                raise ValueError("Every " + axis_name + "-axis dimension must contain exactly two words")

    x = list()
    y = list()

    for word in test_words:
        x_vals = [model.distance(dim[0], word) - model.distance(dim[1], word) for dim in x_dimensions]
        y_vals = [model.distance(dim[0], word) - model.distance(dim[1], word) for dim in y_dimensions]

        # The word's position on each axis is the average over that axis' pairs.
        x.append(statistics.mean(x_vals))
        y.append(statistics.mean(y_vals))

    fig, ax = plt.subplots(figsize=(plot_size, plot_size))

    # One scatter call per word so each word gets its own point and label.
    for word, x_val, y_val in zip(test_words, x, y):
        ax.scatter(x_val, y_val)
        ax.annotate(word, (x_val, y_val))

    ax.set_xlabel(xlab)
    ax.set_ylabel(ylab)
    plt.show()

In [None]:
import statistics

# Word pairs averaged into the horizontal axis.
x_dimensions = [
    ["husband", "wife"],
    ["man", "woman"],
    ["father", "mother"],
    ["gentleman", "lady"],
    ["mr", "miss"],
    ["mr", "mrs"],
    ["brother", "sister"],
    ["sir", "madam"],
]

# Word pairs averaged into the vertical axis.
y_dimensions = [
    ["rich", "poor"],
    ["master", "servant"],
    ["fortune", "misfortune"],
]

# Words to place on the plot.
test_words = [
    "elizabeth", "bennet", "fitzwilliam", "darcy",
    "jane", "bingley", "wickham", "lydia",
    "gardiner", "william", "charlotte", "georgiana",
    "mary", "caroline", "bourgh", "catherine",
]
xlab = "maleness -- femaleness"
ylab = "richness -- poorness"

advanced_projection(x_dimensions, y_dimensions, test_words, model, plot_size=10, xlab=xlab, ylab=ylab)

In [None]:
# Word pairs averaged into the horizontal axis.
x_dimensions = [
    ["husband", "wife"],
    ["man", "woman"],
    ["father", "mother"],
    ["gentleman", "lady"],
    ["mr", "miss"],
    ["mr", "mrs"],
    ["brother", "sister"],
    ["sir", "madam"],
]

# Word pairs averaged into the vertical axis.
y_dimensions = [
    ["rich", "poor"],
    ["master", "servant"],
    ["fortune", "misfortune"],
]

# Words to place on the plot; this run leaves out "bourgh" and "catherine".
test_words = [
    "elizabeth", "bennet", "fitzwilliam", "darcy",
    "jane", "bingley", "wickham", "lydia",
    "gardiner", "william", "charlotte", "georgiana",
    "mary", "caroline",
]
xlab = "maleness -- femaleness"
ylab = "richness -- poorness"

advanced_projection(x_dimensions, y_dimensions, test_words, model, plot_size=10, xlab=xlab, ylab=ylab)