# Word2Vec Analysis

This notebook focuses on analysing the word2vec model we are using. This will mostly involve testing the functions given by gensim.

We first import the libraries we will need throughout the project.

In [1]:
#Import graphing utilities
%matplotlib inline
import matplotlib.pyplot as plt

# Import useful mathematical libraries
import numpy as np
import pandas as pd

# Import useful Machine learning libraries
import gensim

# Import utility files
from utils import save_object,load_object

### Set model name

Before beginning the rest of this project, we select a name for our model. This name will be used to save and load the files for this model.

In [2]:
# Identifier used below to build the path of the saved model file.
model_name = 'model6'

### Examine word similarities

We first examine word similarities

In [4]:
# Load the trained Word2Vec model stored at models/<model_name>.model.
model_path = 'models/{}.model'.format(model_name)
model = gensim.models.Word2Vec.load(model_path)

In [5]:
# Nearest neighbours of "heartbreak" in the embedding space.
# Queried via `model.wv`: the model-level `most_similar` is deprecated and removed in gensim 4.0.
model.wv.most_similar(positive=["heartbreak"])

[('betrayal', 0.5829812288284302),
 ('loneliness', 0.5747349262237549),
 ('abandonment', 0.5463388562202454),
 ('heartache', 0.5316531658172607),
 ('sadness', 0.5248433947563171),
 ('grief', 0.4989320933818817),
 ('rejection', 0.4708385467529297),
 ('betrayals', 0.46409881114959717),
 ('hopelessness', 0.4623437523841858),
 ('anguish', 0.45902955532073975)]

In [7]:
# Nearest neighbours of "pills" in the embedding space.
# Queried via `model.wv`: the model-level `most_similar` is deprecated and removed in gensim 4.0.
model.wv.most_similar(positive=["pills"])

[('sleeping_pills', 0.801128625869751),
 ('painkillers', 0.7319083213806152),
 ('muscle_relaxers', 0.7053226232528687),
 ('tylenol', 0.6959131956100464),
 ('pain_killers', 0.6918990612030029),
 ('ibuprofen', 0.6794541478157043),
 ('ambien', 0.6674091815948486),
 ('valium', 0.6650989055633545),
 ('xanax', 0.6614318490028381),
 ('advil', 0.6609443426132202)]

In [8]:
# Nearest neighbours of "knife" in the embedding space.
# Queried via `model.wv`: the model-level `most_similar` is deprecated and removed in gensim 4.0.
model.wv.most_similar(positive=["knife"])

[('blade', 0.7702487111091614),
 ('kitchen_knife', 0.7561583518981934),
 ('razor', 0.7034561634063721),
 ('razor_blade', 0.6944936513900757),
 ('steak_knife', 0.6316778063774109),
 ('wrist', 0.6295967102050781),
 ('scissors', 0.6221545934677124),
 ('knives', 0.6062738299369812),
 ('butcher_knife', 0.6028643250465393),
 ('an_exacto_knife', 0.6015197038650513)]

In [9]:
# Nearest neighbours of "kitten" in the embedding space.
# Queried via `model.wv`: the model-level `most_similar` is deprecated and removed in gensim 4.0.
model.wv.most_similar(positive=["kitten"])

[('cat', 0.4867851734161377),
 ('kitty', 0.4587540030479431),
 ('dog', 0.43468546867370605),
 ('baby', 0.42002180218696594),
 ('pet', 0.41659820079803467),
 ('chihuahua', 0.4150254726409912),
 ('puppy', 0.4139459729194641),
 ('stuffed_animal', 0.40899857878685),
 ('german_shepherd', 0.3876553177833557),
 ('bunny', 0.38353854417800903)]

In [11]:
# Nearest neighbours of "puppy" in the embedding space.
# Queried via `model.wv`: the model-level `most_similar` is deprecated and removed in gensim 4.0.
model.wv.most_similar(positive=["puppy"])

[('dog', 0.6382265090942383),
 ('cat', 0.5851335525512695),
 ('dogs', 0.516486644744873),
 ('pet', 0.5114070773124695),
 ('baby', 0.4772936999797821),
 ('cats', 0.4633815884590149),
 ('chihuahua', 0.45847272872924805),
 ('pets', 0.43821343779563904),
 ('kitty', 0.43616247177124023),
 ('kitten', 0.4139459729194641)]

### Examine word relationships

We now examine the information contained in the word vectors' relative locations.

In [17]:
# Vector arithmetic: "abusive" + "words" - "physical".
# Queried via `model.wv`: the model-level `most_similar` is deprecated and removed in gensim 4.0.
model.wv.most_similar(positive=["abusive", "words"], negative=["physical"])

[('emotionally_abusive', 0.4020494818687439),
 ('insensitive', 0.37914222478866577),
 ('manipulative', 0.3714224100112915),
 ('manipulative_bitch', 0.3363695442676544),
 ('both_alcoholics', 0.3348906338214874),
 ('arrogant', 0.3333490192890167),
 ('inconsiderate', 0.3329807221889496),
 ('cunt', 0.33248594403266907),
 ('hurtful', 0.33186444640159607),
 ('arrogant_asshole', 0.33185720443725586)]

In [18]:
# Words closest to the combination of "suicide" and "self".
# Queried via `model.wv`: the model-level `most_similar` is deprecated and removed in gensim 4.0.
model.wv.most_similar(positive=["suicide", "self"])

[('killing_myself', 0.5683881044387817),
 ('committing_suicide', 0.5266705751419067),
 ('killing_yourself', 0.44861310720443726),
 ('sucide', 0.44581669569015503),
 ('death', 0.4447208046913147),
 ('sucidial', 0.44398051500320435),
 ('ending', 0.4431094825267792),
 ('suicidal_thoughts', 0.43945056200027466),
 ('harming_myself', 0.4386097192764282),
 ('prevailing', 0.43555474281311035)]

In [19]:
# Vector arithmetic: "family" + "obligation" - "love".
# Queried via `model.wv`: the model-level `most_similar` is deprecated and removed in gensim 4.0.
model.wv.most_similar(positive=["family", "obligation"], negative=["love"])

[('relatives', 0.4069085717201233),
 ('family_members', 0.40498000383377075),
 ('extended_family', 0.3911120891571045),
 ('parents', 0.3735034167766571),
 ('familly', 0.3418022692203522),
 ('siblings', 0.3276708722114563),
 ('blood_relatives', 0.3163728713989258),
 ('familial', 0.3114888370037079),
 ('immediate_family', 0.3109869360923767),
 ('families', 0.30771082639694214)]

In [20]:
# Vector arithmetic: "father" + "woman" - "man".
# Queried via `model.wv`: the model-level `most_similar` is deprecated and removed in gensim 4.0.
model.wv.most_similar(positive=["father", "woman"], negative=["man"])

[('mother', 0.7025207281112671),
 ('grandmother', 0.618820071220398),
 ('step_father', 0.6129697561264038),
 ('mom', 0.5624592304229736),
 ('dad', 0.56070876121521),
 ('sister', 0.5603752136230469),
 ('stepfather', 0.5593303442001343),
 ('step_dad', 0.5468809604644775),
 ('grandfather', 0.5455955266952515),
 ('wife', 0.5387123227119446)]

In [6]:
# Vector arithmetic: "kitten" + "dog" - "cat".
# Queried via `model.wv`: the model-level `most_similar` is deprecated and removed in gensim 4.0.
model.wv.most_similar(positive=["kitten", "dog"], negative=["cat"])

[('puppy', 0.40814000368118286),
 ('stuffed_animal', 0.3996387720108032),
 ('chihuahua', 0.3905201256275177),
 ('kitty', 0.3862040042877197),
 ('dogs', 0.3770129680633545),
 ('grandchild', 0.3749309182167053),
 ('baby', 0.3676145672798157),
 ('german_shepherd', 0.36399930715560913),
 ('pet', 0.3624024987220764),
 ('princess', 0.33664271235466003)]

In [12]:
# Nearest neighbours of the token "i" in the embedding space.
# Queried via `model.wv`: the model-level `most_similar` is deprecated and removed in gensim 4.0.
model.wv.most_similar(positive=["i"])

[('honestly', 0.4869132936000824),
 ('and', 0.4807285666465759),
 ('they', 0.46837007999420166),
 ('succeded', 0.46467646956443787),
 ('really', 0.4528850317001343),
 ('but', 0.4521094560623169),
 ('id', 0.44709664583206177),
 ('becaus', 0.44090181589126587),
 ('do', 0.4340188503265381),
 ('just', 0.4320968985557556)]