In [None]:
import pandas as pd
import spacy
import gensim
import gensim.downloader as api
import json
import matplotlib.pyplot as plt

In [None]:
# Read the Pride and Prejudice CSV into a DataFrame; row 0 of the file supplies
# the column names (header=0).
inputf = "../input/pride-prejudice-clean-dataset/pride_prejudice.csv"
df = pd.read_csv(inputf, delimiter=",", header=0)
# Preview the first rows to confirm the load.
df.head()

In [None]:
# Pull every entry of the 2005 film's column into a plain Python list,
# preserving row order.
list_movie_2005 = list(df["Pride and Prejudice (2005)"])

list_movie_2005

In [None]:
# Sanity check: how many entries were taken from the 2005 film column.
print(len(list_movie_2005 ))

In [None]:
def remove_punc2(text):
    """Return ``text`` with all listed punctuation characters removed.

    Every character of ``text`` is checked against a fixed set of ASCII
    punctuation marks; characters not in that set are kept in their
    original order. Letters, digits, whitespace and any symbol not listed
    (for example the backslash) pass through unchanged.
    """
    punctuation = '!@#$%^&*()_-+={}[]:;"\'|<>,.?/~`'
    kept = []
    for character in text:
        if character in punctuation:
            continue
        kept.append(character)
    return ''.join(kept)

In [None]:
# Coerce each entry to text (non-string cells included) and strip punctuation.
strings_list = [remove_punc2(str(sentence)) for sentence in list_movie_2005]

In [None]:
# Inspect the punctuation-free lines.
strings_list

In [None]:
# One cleaned string is produced per original entry, so this should match
# the length of list_movie_2005.
print(len(strings_list))

In [None]:
def cleaning(doc):
    """Reduce a parsed document to a space-joined string of lower-cased lemmas.

    Tokens flagged as stop words and tokens of length 1 or less are dropped.
    If fewer than three lemmas remain the document is considered too short
    and ``None`` is returned instead of a string.

    Parameters
    ----------
    doc : iterable of tokens
        Each token must expose ``is_stop``, ``lemma_`` and support ``len()``.

    Returns
    -------
    str or None
        The kept lemmas joined by single spaces, or ``None`` for short documents.
    """
    lemmas = []
    for token in doc:
        # Skip stop words first; length is only checked for non-stop tokens.
        if token.is_stop or len(token) <= 1:
            continue
        lemmas.append(token.lemma_.lower())

    if len(lemmas) <= 2:
        return None
    return " ".join(lemmas)

In [None]:
def process(df):
    """Lemmatise a collection of text lines and split them into token lists.

    Each text is run through the module-level spaCy pipeline ``nlp``, reduced
    to a lemma string by ``cleaning``, and split on whitespace. Texts for
    which ``cleaning`` returns ``None`` are dropped.

    Parameters
    ----------
    df : iterable of str
        The texts to process. Despite the name this is consumed as a plain
        iterable of strings (the notebook passes ``strings_list``); iterating
        a pandas DataFrame would yield its column labels, not its rows.

    Returns
    -------
    list of list of str
        One list of lemma tokens per retained text, in input order.
    """
    # Use the argument that was passed in rather than a notebook global, so
    # the function works on whatever collection the caller supplies.
    docs = list(df)
    cleaned = [cleaning(doc) for doc in nlp.pipe(docs, batch_size=500, n_process=-1)]
    return [text.split() for text in cleaned if text is not None]
    
        

In [None]:
# Load spaCy's small English pipeline with the "ner" and "parser" components
# disabled; the cleaning step only reads token lemmas and stop-word flags.
nlp = spacy.load("en_core_web_sm", disable = ["ner","parser"])

In [None]:
# Turn the punctuation-free lines into lists of lemma tokens, one list per kept line.
sentences = process(strings_list)

In [None]:
# Inspect the tokenised sentences.
sentences

In [None]:
# Number of sentences that survived cleaning (lines with fewer than three
# kept lemmas are dropped, so this can be smaller than len(strings_list)).
len(sentences)

In [None]:
from gensim.models import Word2Vec

# Create an untrained Word2Vec model; the corpus is supplied separately below.
model = Word2Vec(
    min_count = 1,  # gensim drops words rarer than this; 1 keeps every word
    window = 3, # context window size used around each target word
    workers = 4  # number of worker threads used for training
)

# Scan the tokenised sentences once to build the vocabulary;
# progress_per controls how often gensim logs progress during the scan.
model.build_vocab(sentences, progress_per = 50)

In [None]:
# Train for 30 epochs over the same sentences; total_examples reuses the
# sentence count the model recorded as corpus_count.
model.train(sentences, total_examples = model.corpus_count, epochs = 30)



In [None]:
# Persist only the word vectors (model.wv), not the full trainable model.
model_savename = "movie_2005.w2v"
model.wv.save(model_savename)

In [None]:
from gensim.models import KeyedVectors

# Reload the saved vectors from disk.
# NOTE: this rebinds `model` from the Word2Vec model to a KeyedVectors object;
# every later cell operates on the word vectors only.
model = KeyedVectors.load("movie_2005.w2v")

In [None]:
# Show the vocabulary of the loaded vectors.
# gensim 4 removed `KeyedVectors.vocab` (accessing it raises AttributeError)
# in favour of `key_to_index`; fall back to `.vocab` so this also runs on gensim 3.x.
model.key_to_index if hasattr(model, "key_to_index") else model.vocab

In [None]:
# Create a dataframe of vocabulary frequencies: one (lemma, frequency) row per word.
# gensim 4 exposes per-word counts through `get_vecattr(word, "count")`;
# gensim 3.x keeps them on `model.vocab[word].count`. Support both.
if hasattr(model, "get_vecattr"):
    records = [(word, model.get_vecattr(word, "count")) for word in model.index_to_key]
else:
    records = [(word, model.vocab[word].count) for word in model.vocab]

model_vocab_df = pd.DataFrame.from_records(records, columns = ["lemma","frequency"])
# Display the table sorted by frequency; model_vocab_df itself is left in vocabulary order.
model_vocab_df.sort_values("frequency", ascending=False)

In [None]:
# Print every lemma in the vocabulary, one per line, in model_vocab_df's
# (unsorted) row order.
for word in model_vocab_df['lemma']:
    print(word)

In [None]:
def naive_projection(x_axis, y_axis, test_words, model, plot_size=10):
    """Scatter-plot words on two axes, each defined by one pair of antonyms.

    For every word in ``test_words`` the coordinate on an axis is
    ``model.distance(axis[0], word) - model.distance(axis[1], word)``.
    Assuming a smaller distance means "more similar", negative values place
    the word nearer the first antonym and positive values nearer the second.

    Parameters
    ----------
    x_axis, y_axis : sequence of str
        Exactly two antonyms each, e.g. ``["husband", "wife"]``.
    test_words : sequence of str
        Words to place on the plot.
    model : object
        Anything exposing ``distance(word_a, word_b)`` (the notebook passes
        gensim word vectors).
    plot_size : int, optional
        Width and height of the square figure.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``. If either axis does not
        hold exactly two words, a message is printed and nothing is plotted.
    """
    # Validate first and stop here on bad input: previously the message was
    # printed but execution carried on and crashed on the undefined x / y lists.
    if len(x_axis) != 2:
        print("You must only have two antonyms in your x-axis")
        return
    if len(y_axis) != 2:
        print("You must only have two antonyms in your y-axis")
        return

    x = []
    y = []
    for word in test_words:
        x.append(model.distance(x_axis[0], word) - model.distance(x_axis[1], word))
        y.append(model.distance(y_axis[0], word) - model.distance(y_axis[1], word))

    fig, ax = plt.subplots(figsize=(plot_size, plot_size))

    # One scatter call per word so each point can carry its own label.
    for word, x_val, y_val in zip(test_words, x, y):
        ax.scatter(x_val, y_val)
        ax.annotate(word, (x_val, y_val))

    plt.xlabel(x_axis[0] + " --- " + x_axis[1])
    plt.ylabel(y_axis[0] + " --- " + y_axis[1])

    plt.show()

In [None]:
# Plot the selected names on a husband/wife (x) versus rich/poor (y) plane.
# All words are lower-case because the corpus was lower-cased during cleaning.
x_axis = ["husband", "wife"]
y_axis = ["rich","poor"]
test_words = ["elizabeth", "bennet","darcy", "bourgh", "jane", "bingley", "caroline", "wickham", "lydia", "charlotte", "georgiana", "mary", "catherine"] 

naive_projection(x_axis, y_axis, test_words, model)

In [None]:
# Same names on a man/woman (x) versus excellence/inferiority (y) plane.
x_axis = ["man","woman"]
y_axis = ["excellence", "inferiority"]
test_words = ["elizabeth", "bennet", "darcy", "bourgh", "jane", "bingley", "caroline", "wickham", "lydia", "charlotte", "georgiana", "mary", "catherine"] 

naive_projection(x_axis, y_axis, test_words, model)

In [None]:
# Same names on a mr/madam (x) versus superior/inferior (y) plane.
# NOTE(review): this pairs "mr" with "madam", whereas the multi-pair cell at the
# end of the notebook uses sir/madam and mr/mrs — confirm the pairing is intended.
x_axis = ["mr","madam"]
y_axis = ["superior","inferior"]
test_words = ["elizabeth", "bennet", "darcy", "bourgh", "jane", "bingley", "caroline", "wickham", "lydia", "charlotte", "georgiana", "mary", "catherine"] 

naive_projection(x_axis, y_axis, test_words, model)

In [None]:
# Same names on a brother/sister (x) versus master/servant (y) plane.
x_axis = ["brother","sister"]
y_axis = ["master","servant"]
test_words = ["elizabeth", "bennet", "darcy", "bourgh", "jane", "bingley", "caroline", "wickham", "lydia", "charlotte", "georgiana", "mary", "catherine"] 

naive_projection(x_axis, y_axis, test_words, model)

In [None]:
def advanced_projection(x_dimensions, y_dimensions, test_words, model, plot_size=8, xlab="label", ylab="label"):
    """Scatter-plot words on two axes, each averaged over several antonym pairs.

    For each word and each ``[first, second]`` pair in a dimension list, the
    offset ``model.distance(first, word) - model.distance(second, word)`` is
    computed; the word's coordinate on that axis is the arithmetic mean of
    those offsets.

    Parameters
    ----------
    x_dimensions, y_dimensions : sequence of [str, str]
        Antonym pairs that together define the x and y axes.
    test_words : sequence of str
        Words to place on the plot.
    model : object
        Anything exposing ``distance(word_a, word_b)`` (the notebook passes
        gensim word vectors).
    plot_size : int, optional
        Width and height of the square figure.
    xlab, ylab : str, optional
        Axis labels.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.

    Raises
    ------
    statistics.StatisticsError
        If a dimension list is empty while ``test_words`` is not.
    """
    x = []
    y = []
    for word in test_words:
        x.append(_mean_pair_offset(x_dimensions, word, model))
        y.append(_mean_pair_offset(y_dimensions, word, model))

    fig, ax = plt.subplots(figsize=(plot_size, plot_size))

    # One scatter call per word so each point can carry its own label.
    for word, x_val, y_val in zip(test_words, x, y):
        ax.scatter(x_val, y_val)
        ax.annotate(word, (x_val, y_val))

    plt.xlabel(xlab)
    plt.ylabel(ylab)
    plt.show()


def _mean_pair_offset(dimensions, word, model):
    """Average ``distance(first, word) - distance(second, word)`` over all pairs."""
    # Imported here so the projection works even if the notebook cell that
    # imports `statistics` has not been run yet.
    import statistics

    offsets = [
        model.distance(dim[0], word) - model.distance(dim[1], word)
        for dim in dimensions
    ]
    return statistics.mean(offsets)

In [None]:
import statistics

# Antonym pairs averaged together to form the x axis
# (labelled "maleness -- femaleness" in the call below).
x_dimensions =  [
                ["husband", "wife"],
                ["man","woman"],
                ["sir","madam"],
                ["brother", "sister"],
                ["mr", "miss"],
                ["mr", "mrs"],
                ["gentleman", "lady"],
                ["father", "mother"]
]


# Antonym pairs averaged together to form the y axis
# (labelled "richness -- poorness" in the call below).
y_dimensions = [
                ["rich","poor"],
                ["master", "servant"],
                ["fortune", "misfortune"]
]


# Words to place on the plot; same names as the earlier projections plus "gardiner".
test_word = ["elizabeth","bennet", "darcy", "bourgh", "jane", "bingley", "caroline", "wickham", "lydia", "gardiner", "charlotte", "georgiana", "mary", "catherine"] 


# Draw the averaged projection on a 10x10 figure.
advanced_projection(
    x_dimensions,
    y_dimensions,
    test_word,
    model,
    plot_size=10,
    xlab = "maleness -- femaleness",
    ylab = "richness -- poorness"
)