# Libraries

In [481]:
import re
import numpy as np
# you have to download this using
# import nltk
# nltk.download('wordnet')
from nltk.corpus import wordnet
from sklearn.preprocessing import OneHotEncoder
from sklearn.neural_network import MLPClassifier

import gensim

# Globals

In [99]:
glove = loadGloveModel("glove.840B.300d.txt")

Loading Model
Done. 2195884  words loaded!


In [387]:
model = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

# Helper Functions

In [557]:
# helper funstion that takes in the words and gives them back with the label, also removes the special characters
def labeling(word):
    one = "*"
    two = "!"
    
    # remove special characters
    ret = re.sub('[^A-Za-z0-9]+', '',word)
    
    if one in word:
        return ret, 1
    elif two in word:
        return ret, 2
    else:
        return ret, 3
    
# this function uses the wordnet and expands the words using synonyms 
# this exapnsion is capped at the given iteration n
# takes in a set words
def expand(words, n=5):
    expanded = set()
    
    for x in range(n):
        if len(words) == 0:
            break
        w = words.pop()
        expanded.add(w)
    
        for syn in wordnet.synsets(w): 
            for l in syn.lemmas(): 
                words.add(l.name()) 

    return list(expanded|words)

def getSimilar(words, emb, n=15):
    return [x[0] for x in emb.most_similar(positive=words, topn=n)]


def loadGloveModel(gloveFile):
    print("Loading Model")
    model = {}
    with open(gloveFile,'r') as f:
        for line in f:
            splitLine = line.split()
            
            word = splitLine[0]
            # needed for special cases such as ". . ." which we will not have in this
            try:
                float(splitLine[1])
            except:
                continue
            
            embedding = np.array([float(val) for val in splitLine[1:]])
            model[word] = embedding
    print("Done.",len(model)," words loaded!")
    return model

# get embedded representations of the words
def getInputData(words, emb):
    
    out = []
    for word in words:
        # if embedding not found
        if word not in emb:
            if word.lower() not in emb:
                # unknown word given the token "UNK"
                out.append(emb['UNK'])
            else:
                out.append(emb[word.lower()])
        else:
            out.append(emb[word])
        
    return np.array(out)

# get one hot encoding of labels
def getOneHot(inlabel):
    enc = OneHotEncoder()
    out = enc.fit_transform(inlabel.reshape(-1,1)).toarray()
    return out

# print the asner from the Instalearn
def answer(pred, orig, indata):
    red = ' \033[0;31;48m'
    blue = ' \033[0;34;48m'
    norm = ' \033[0;39;48m'
    x = 0
    ans = ''
    for w in orig:
        if re.sub('[^A-Za-z0-9]+', '',w) in indata:
            if pred[x] == 3:
                ans = ans + norm + w
            elif pred[x] == 1:
                ans = ans + red + indata[x]
            else:
                ans = ans + blue + indata[x]
            x+=1
    ans = ans + norm
    return ans[1:]


def shuffle(words, labels):
    # getting curent state so shuffle is the same for both arrays
    curState = np.random.get_state()
    np.random.shuffle(words)
    
    # setting the state
    np.random.set_state(curState)
    np.random.shuffle(labels)
    
    return words, labels

# function for computing basic metric, takes in actual words and predicted words as input
def metric(w, p, mode):
    tp = 0
    for word in p:
        if word in w:
            tp+=1
    precision = tp/len(w)
    recall = tp/len(p)
    print("\n"+mode)
    print('Precision: ', precision)
    print('Recall: ', recall)
    print('F1-Score: ', (2*precision*recall)/(precision+recall))
    
    
    
# fucntion for evaluating labeled input
def evaluate(sentence, model, emb):
    raw = sentence.split(" ")
    etr = []
    elb = []
    for words in raw:
        t , l = labeling(words)
        etr.append(t)
        elb.append(l)
    etr = np.array(etr)
    
    exp1 = list(etr[[i == 1 for i in elb]])
    exp2 = list(etr[[i == 2 for i in elb]])
    
    ans = model.predict(getInputData(etr, emb))
    
    print(answer(ans, etr, etr))
    
    print("\n Evaluation Metrics")
    w1 = list(etr[[i == 1 for i in (ans)]])
    w2 = list(etr[[i == 2 for i in (ans)]])
    # print('Total Words: ', len(trainX))
    print('Total *Words: ', len(w1), w1)
    print('Actual *Words: ', exp1)
    print('Total !Words: ', len(w2), w2)
    print('Actual !Words: ', exp2)

    metric(exp1+(exp2), w1+(w2), "Overall")
    metric(exp2, w2, "!")
    metric(exp1, w1, "*")

# Getting the Input data

In [1]:
rawText = input()

I lived in *Munich last summer. *Germany has a relaxing, slow summer lifestyle. One night, I got food poisoning and couldn't find !Tylenol to make the pain go away, they insisted I take !aspirin instead.


In [538]:
rawText = input()

I used to drive around with my !dogs in my *Ferrari. But then I got a !cat and bought a *Porsche for her.


In [539]:
tk = rawText.split(" ")

In [540]:
tr = []
lb = []

# getting the words and the labels
for words in tk:
    t , l = labeling(words)
    tr.append(t)
    lb.append(l)

In [541]:
tr = np.array(tr)

In [542]:
# getting the important words to artifically augment data size
exp1 = list(tr[[i == 1 for i in lb]])
exp2 = list(tr[[i == 2 for i in lb]])


In [543]:
data = list(tr[[i == 3 for i in lb]])
labels = [3]*len(data)

# assert len(data) == len(labels)

In [547]:
# expanded from wordnet
exp = expand(set(exp1), 10)
data += exp
labels += [1]*len(exp)

# expanded from Word2Vec
exp = getSimilar((exp1), model, 15)
data += exp
labels += [1]*len(exp)

In [548]:
# expanded from wordnet
exp = expand(set(exp2), 10)
data += exp
labels += [2]*len(exp)

# expanded from Word2Vec\
exp = getSimilar((exp2), model, 15)
data += exp
labels += [2]*len(exp)

In [549]:
train, label = shuffle(data, labels)

# Preparing the training data

In [550]:
ohlabel = getOneHot(np.array(label))
trainX = getInputData(train, glove)
label = np.array(label)

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [551]:
print(trainX.shape,label.shape)

(250, 300) (250,)


In [552]:
# Model Train Data
print("Original Data")
print('Total Words: ', len(tr))
print('Total *Words: ', len(exp1))
print('Total !Words: ', len(exp2))

Original Data
Total Words:  23
Total *Words:  2
Total !Words:  2


In [553]:
print("Augmented Data")
print('Total Words: ', len(trainX))
print('Total *Words: ', len(trainX[[i == 1 for i in (label)]]))
print('Total !Words: ', len(trainX[[i == 2 for i in (label)]]))

Augmented Data
Total Words:  250
Total *Words:  17
Total !Words:  214


# Training

In [554]:
# using simple perceptron because Occams Razor
mlf = MLPClassifier(hidden_layer_sizes=(250,500,150, 100, 30), activation='tanh',
                    max_iter=1500, alpha=0.0001,learning_rate="adaptive",
                    learning_rate_init=0.0001,solver='sgd', verbose=False,  
                    random_state=110,tol=0.000000000001, nesterovs_momentum=True, warm_start=True)

mlf.fit(trainX, label)



MLPClassifier(activation='tanh', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(250, 500, 150, 100, 30),
       learning_rate='adaptive', learning_rate_init=0.0001, max_iter=1500,
       momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
       power_t=0.5, random_state=110, shuffle=True, solver='sgd',
       tol=1e-12, validation_fraction=0.1, verbose=False, warm_start=True)

# Evaluating

In [558]:
# Self Evaluation
evaluate(rawText, mlf, glove)

[0;39;48mI [0;39;48mused [0;39;48mto [0;39;48mdrive [0;39;48maround [0;39;48mwith [0;39;48mmy [0;34;48mdogs [0;39;48min [0;39;48mmy [0;31;48mFerrari [0;39;48mBut [0;39;48mthen [0;39;48mI [0;39;48mgot [0;39;48ma [0;34;48mcat [0;39;48mand [0;39;48mbought [0;39;48ma [0;31;48mPorsche [0;39;48mfor [0;39;48mher [0;39;48m

 Evaluation Metrics
Total *Words:  2 ['Ferrari', 'Porsche']
Actual *Words:  ['Ferrari', 'Porsche']
Total !Words:  2 ['dogs', 'cat']
Actual !Words:  ['dogs', 'cat']

Overall
Precision:  1.0
Recall:  1.0
F1-Score:  1.0

!
Precision:  1.0
Recall:  1.0
F1-Score:  1.0

*
Precision:  1.0
Recall:  1.0
F1-Score:  1.0


In [561]:
# Evaluation with tags
evalInput = input()
evaluate(evalInput, mlf, glove)

I crashed my *Mustang but my !parrot and !snake were not in the car.
[0;39;48mI [0;39;48mcrashed [0;39;48mmy [0;31;48mMustang [0;39;48mbut [0;39;48mmy [0;34;48mparrot [0;39;48mand [0;34;48msnake [0;39;48mwere [0;39;48mnot [0;39;48min [0;39;48mthe [0;34;48mcar [0;39;48m

 Evaluation Metrics
Total *Words:  1 ['Mustang']
Actual *Words:  ['Mustang']
Total !Words:  3 ['parrot', 'snake', 'car']
Actual !Words:  ['parrot', 'snake']

Overall
Precision:  1.0
Recall:  0.75
F1-Score:  0.8571428571428571

!
Precision:  1.0
Recall:  0.6666666666666666
F1-Score:  0.8

*
Precision:  1.0
Recall:  1.0
F1-Score:  1.0


In [537]:
# was done for the given example the sheet

[0;39;48mI [0;39;48mused [0;39;48mto [0;39;48msell [0;34;48mVicodin [0;39;48min [0;31;48mTurkey [0;39;48m
Evaluation Metrics
Total *Words:  1 ['Turkey']
Actual *Words:  ['Turkey']
Total !Words:  1 ['Vicodin']
Actual !Words:  ['Vicodin']

Overall
Precision:  1.0
Recall:  1.0
F1-Score:  1.0

!
Precision:  1.0
Recall:  1.0
F1-Score:  1.0

*
Precision:  1.0
Recall:  1.0
F1-Score:  1.0


# Testing

In [529]:
testInput = input()

When I lived in Paris last year, France was experiencing a recession. The night life was too fun, I developed an addiction to Adderall and Ritalin.


In [530]:
tst = testInput.split(" ")
test = []


# getting the words and the labels
for words in tst:
    t , l = labeling(words)
    test.append(t)



In [531]:
teste = getInputData(test, glove)

In [532]:
ans = mlf.predict(teste)

print(answer(ans, tst, test))

[0;39;48mWhen [0;39;48mI [0;39;48mlived [0;39;48min [0;31;48mParis [0;39;48mlast [0;39;48myear, [0;31;48mFrance [0;39;48mwas [0;39;48mexperiencing [0;39;48ma [0;39;48mrecession. [0;39;48mThe [0;39;48mnight [0;39;48mlife [0;39;48mwas [0;39;48mtoo [0;39;48mfun, [0;39;48mI [0;39;48mdeveloped [0;39;48man [0;39;48maddiction [0;39;48mto [0;34;48mAdderall [0;39;48mand [0;34;48mRitalin
