In [2]:
import multiprocessing
import os
import random
import sys

from gensim.models import Word2Vec

In [12]:
# Characters that tokenize() replaces with a space. Although the literal is
# shaped like a regex character class, it is consumed one character at a time,
# so the surrounding "[", "]" and "+" are themselves treated as separators.
# The backslashes before "[" and "]" are written as "\\" because "\[" / "\]"
# are invalid escape sequences in a normal string literal; the resulting value
# is unchanged.
special_characters = '[\'\",.-?!"#$%&(*)+/:;<=>@\\[\\]\\\\^`{|}~\t\n]+'

def tokenize(text, characters):
    """Turn ``text`` into a list of lower-case, whitespace-delimited tokens.

    Parameters
    ----------
    text : str
        The raw string to tokenize.
    characters : iterable of str
        Substrings to strip out. Each item produced by iterating over
        ``characters`` (single characters when a plain string is passed) is
        replaced by one space before splitting.

    Returns
    -------
    list of str
        The tokens, lower-cased; empty when nothing but separators remains.
    """
    cleaned = text
    for separator in characters:
        # Substitute a space rather than deleting, so adjacent words stay apart.
        cleaned = cleaned.replace(separator, " ")
    # Case is normalized only after the replacements, then split() breaks the
    # string on any run of whitespace.
    return cleaned.lower().split()

In [13]:
def load_data(file_path):
    """Load, tokenize and shuffle the positive and negative corpora.

    Reads ``pos.txt`` followed by ``neg.txt`` from the directory ``file_path``,
    tokenizes every line with ``tokenize`` using the module-level
    ``special_characters``, and returns the token lists in random order.

    Parameters
    ----------
    file_path : str
        Directory that contains ``pos.txt`` and ``neg.txt``.

    Returns
    -------
    list of list of str
        One token list per input line, positive and negative lines mixed
        together. The order comes from ``random.shuffle`` and therefore
        differs between runs unless the caller seeds ``random``.

    Raises
    ------
    OSError
        If either file cannot be opened.
    """
    lines = []
    for name in ("pos.txt", "neg.txt"):
        # os.path.join uses the platform's separator, so the same call works on
        # Windows (where it still yields "<dir>\\pos.txt") and on POSIX systems.
        # The files are opened with the platform default encoding, as before.
        with open(os.path.join(file_path, name)) as f:
            lines.extend(f.readlines())
    data = [tokenize(line, special_characters) for line in lines]
    random.shuffle(data)
    return data

In [14]:
# CPU count as reported by multiprocessing; the Word2Vec training cell below
# derives its number of worker threads from this value.
cores = multiprocessing.cpu_count()

In [15]:
# Directory holding pos.txt and neg.txt. Defaults to the original location; set
# the MSCI_DATA_DIR environment variable to point the notebook at a copy of the
# corpus on another machine without editing this cell.
DATA_DIR = os.environ.get("MSCI_DATA_DIR", "C:\\Users\\andre\\msci-text-analytics-s20")
data = load_data(DATA_DIR)

In [16]:
# Train Word2Vec embeddings on the tokenized corpus: 100-dimensional vectors,
# a context window of 5 tokens, and min_count=1 so no word is dropped for being
# rare. The worker count is one less than the CPU count (presumably to leave a
# core free), clamped with max(1, ...) because on a single-core machine
# `cores - 1` would be 0 and gensim would be given no training threads.
# NOTE(review): `size` is the gensim 3.x keyword (renamed `vector_size` in
# gensim 4) -- confirm the installed gensim version before upgrading.
w2v_model = Word2Vec(data, size=100, window=5, min_count=1, workers=max(1, cores - 1))

In [20]:
# Print the 20 vocabulary entries most similar to "good", each paired with its
# similarity score.
print(
    'Good: {}'.format(
        w2v_model.wv.most_similar(positive=["good"], topn=20)
    )
)

Good: [('great', 0.8456581234931946), ('decent', 0.8125674724578857), ('nice', 0.7424853444099426), ('fantastic', 0.7375355362892151), ('terrific', 0.7183125615119934), ('wonderful', 0.7039445638656616), ('superb', 0.6815990805625916), ('bad', 0.6760596632957458), ('fabulous', 0.6742970943450928), ('okay', 0.6480566263198853), ('ok', 0.6248569488525391), ('awesome', 0.6114126443862915), ('amazing', 0.6109828948974609), ('impressive', 0.6010892987251282), ('perfect', 0.5950475335121155), ('excellent', 0.5876733660697937), ('brilliant', 0.5839992761611938), ('alright', 0.5836555361747742), ('reasonable', 0.5831148624420166), ('phenomenal', 0.579258143901825)]


In [21]:
# Print the 20 vocabulary entries most similar to "bad", each paired with its
# similarity score.
print(
    'Bad: {}'.format(
        w2v_model.wv.most_similar(positive=["bad"], topn=20)
    )
)

Bad: [('horrible', 0.7059832215309143), ('good', 0.6760596632957458), ('terrible', 0.6743590235710144), ('poor', 0.640200138092041), ('awful', 0.6115643382072449), ('funny', 0.6091506481170654), ('strange', 0.6001135110855103), ('scary', 0.5823907852172852), ('pathetic', 0.5668982267379761), ('weird', 0.5657867193222046), ('horrendous', 0.5528634786605835), ('trivial', 0.5425273180007935), ('stupid', 0.5415877103805542), ('weak', 0.5332534313201904), ('spectacular', 0.5322089195251465), ('okay', 0.5309174060821533), ('horrid', 0.5286571383476257), ('fake', 0.527480959892273), ('stellar', 0.5270899534225464), ('alright', 0.5250728130340576)]


In [26]:
# Execute the companion `main` script through IPython's %run, passing the
# corpus directory as its only command-line argument. The captured output below
# shows it printing neighbour lists labelled "Good similar" and "Bad similar".
# NOTE(review): the argument is an absolute, machine-specific Windows path --
# adjust it before running on another machine.
%run main "C:\\Users\\andre\\msci-text-analytics-s20"

Good similar:[('decent', 0.821360170841217), ('great', 0.796782374382019), ('nice', 0.741490364074707), ('fantastic', 0.7374138832092285), ('terrific', 0.7125282287597656), ('wonderful', 0.706699550151825), ('superb', 0.7065791487693787), ('bad', 0.6604238152503967), ('reasonable', 0.6395441889762878), ('fabulous', 0.6274336576461792), ('amazing', 0.613029420375824), ('impressive', 0.5980493426322937), ('excellent', 0.5959839820861816), ('awesome', 0.5862232446670532), ('poor', 0.5851634740829468), ('cool', 0.5850679874420166), ('terrible', 0.5801501274108887), ('lovely', 0.5793958902359009), ('perfect', 0.5776446461677551), ('ok', 0.5750154256820679)]
Bad similar:[('horrible', 0.6963096857070923), ('terrible', 0.6767602562904358), ('good', 0.6604238152503967), ('funny', 0.6327540874481201), ('awful', 0.625789225101471), ('poor', 0.592717707157135), ('weird', 0.5863698124885559), ('weak', 0.5855468511581421), ('strange', 0.5774862170219421), ('stupid', 0.5742812752723694), ('fake', 0.5

In [45]:
# Execute the companion `inference` script through IPython's %run, passing the
# path of a test.txt file as its only command-line argument. The captured
# output below shows it printing one neighbour list per word (e.g. "decent",
# "great").
# NOTE(review): the argument is an absolute, machine-specific Windows path --
# adjust it before running on another machine.
%run inference "C:\\Users\\andre\\msci-text-analytics-s20\\Assignment 3\\test.txt"

decent: [('good', 0.821360170841217), ('superb', 0.6956076622009277), ('reasonable', 0.6672420501708984), ('nice', 0.6601412892341614), ('great', 0.6571164131164551), ('terrific', 0.6545218229293823), ('fantastic', 0.642350971698761), ('adequate', 0.5844165086746216), ('wonderful', 0.582452654838562), ('poor', 0.5629048943519592), ('modest', 0.5598803162574768), ('consistent', 0.5598394274711609), ('excellent', 0.5545246601104736), ('impressive', 0.5496906042098999), ('exceptional', 0.5464913845062256), ('fair', 0.5392312407493591), ('muddy', 0.5391784906387329), ('strange', 0.5262154340744019), ('terrible', 0.5248317718505859), ('flawless', 0.5212446451187134)]
great: [('fantastic', 0.906447172164917), ('wonderful', 0.8605281710624695), ('terrific', 0.8287990689277649), ('good', 0.7967823147773743), ('nice', 0.7626060247421265), ('perfect', 0.7577698826789856), ('fabulous', 0.7502351999282837), ('awesome', 0.7160434722900391), ('excellent', 0.708631157875061), ('superb', 0.70300406217

  similar = w2v_model.similar_by_word(w, 20)
