In [2]:
import gzip
import os
import tensorflow as tf
import numpy as np

## paths to files. Do not change this
# Q1 inputs: similarity / analogy datasets and the GloVe embedding file.
simInputFile = "Q1/word-similarity-dataset"
analogyInputFile = "Q1/word-analogy-dataset"
vectorgzipFile = "Q1/glove.6B.300d.txt.gz"
vectorTxtFile = "Q1/glove.6B.300d.txt"   # If you extract and use the gz file, use this.
analogyTrainPath = "Q1/wordRep/"
# Presumably Q1 output locations (judging by the names; not written by any cell visible here).
simOutputFile = "Q1/simOutput.csv"
simSummaryFile = "Q1/simSummary.csv"
anaSOln = "Q1/analogySolution.csv"
# Q4 word list: a header row followed by comma-separated rows (read by getaffixdict / maketraindata).
Q4List = "Q4/wordList.csv"




In [3]:
# Similarity dataset: one record per line, fields separated by " | ".
# The context manager closes the file as soon as it has been read.
with open(simInputFile) as simFile:
    simDataset = [item.split(" | ") for item in simFile.read().splitlines()]

# Analogy dataset: records are separated by blank lines; each record becomes
# the list of its own lines with surrounding whitespace stripped.
with open(analogyInputFile) as analogyFile:
    analogyDataset = [[stuff.strip() for stuff in item.strip('\n').split('\n')]
                      for item in analogyFile.read().split('\n\n')]

def vectorExtract(simD = simDataset, anaD = analogyDataset, vect = vectorgzipFile):
    """Look up the vector of every similarity-dataset token in a gzipped embedding file.

    Parameters
    ----------
    simD : list of list of str
        Similarity records; every field of every record is treated as a word to look up.
    anaD : list
        Analogy records. Kept for interface compatibility; analogy words are
        currently NOT added to the lookup set.
    vect : str
        Path to a gzip-compressed text file with one ``word v1 v2 ...`` entry per line.

    Returns
    -------
    (dict, list)
        ``wordDict`` maps each word found in the file to its vector components, kept
        as the raw ``bytes`` tokens read from the file (no float conversion is done).
        ``simList`` is the flattened list of all fields of ``simD``.
    """
    simList = [stuff for item in simD for stuff in item]
    wordList = set(simList)
    print(len(wordList))
    wordDict = dict()

    # The context manager closes the archive even if a line fails to decode.
    with gzip.open(vect, 'r') as vectorFile:
        for line in vectorFile:
            tokens = line.split()  # split once per line instead of three times
            if not tokens:
                continue  # blank line: nothing to look up
            word = tokens[0].decode("utf-8")
            if word in wordList:
                wordDict[word] = tokens[1:]

    print('retrieved', len(wordDict.keys()))
    return wordDict, simList

# Extracting Vectors from Analogy and Similarity Dataset

In [5]:
def getaffixdict(filename = Q4List):
    """Map every distinct affix in the word list to a consecutive integer index.

    The first line of the file is skipped as a header. Every remaining line is
    split on commas and its second field is taken as the affix. Affixes are
    numbered from 0 in order of first appearance.

    Parameters
    ----------
    filename : str
        Path of the comma-separated word list.

    Returns
    -------
    dict
        ``{affix: index}``.
    """
    affix = {}
    with open(filename, "r") as fin:
        fin.readline()  # skip the header row
        for line in fin:
            fields = line.strip().split(',')
            if len(fields) < 2:
                continue  # blank or malformed row: there is no affix column
            # len(affix) is the next unused index; existing affixes keep theirs.
            affix.setdefault(fields[1], len(affix))
    return affix
def getaffixvector(filename = Q4List):
    """Return a one-hot encoding for every affix listed in ``filename``.

    The result maps each affix to a list of zeros, as long as the number of
    distinct affixes, holding a single 1 at the index that ``getaffixdict``
    assigned to that affix.
    """
    index_of = getaffixdict(filename)
    width = len(index_of)
    onehot = {}
    for name, position in index_of.items():
        row = [0] * width
        row[position] = 1
        onehot[name] = row
    return onehot

# One-hot affix encodings, built once when this cell runs; get_nninput takes this as a default argument.
affixdict = getaffixvector()
#affixvector()

In [6]:
def getfastvectors(filename = 'Q4/fastText_vectors.txt'):
    """Load whitespace-separated word vectors from a text file.

    Every non-blank line is expected to look like ``word v1 v2 ... vN``.
    Blank lines are skipped.

    Parameters
    ----------
    filename : str
        Path of the vector file.

    Returns
    -------
    dict
        Maps each word to its components as a list of floats. If a word occurs
        on several lines, the last one wins.

    Raises
    ------
    ValueError
        If a component cannot be parsed as a float.
    """
    vectors = {}
    # The context manager guarantees the file is closed, even on a parse error.
    with open(filename, 'r') as fin:
        for line in fin:
            tokens = line.split()
            if not tokens:
                continue  # blank line
            vectors[tokens[0]] = [float(item) for item in tokens[1:]]
    return vectors
    
def getlvec(filename = 'Q4/vector_lazaridou.txt'):
    """Load word vectors written as ``word [v1, v2, ..., vN]`` (one per line).

    The opening bracket is turned into a comma and the closing bracket removed,
    so each line becomes a comma-separated record whose first field is the word.
    Blank lines are skipped instead of producing an empty-string entry.

    Parameters
    ----------
    filename : str
        Path of the vector file.

    Returns
    -------
    dict
        Maps each word to its components as a list of floats. If a word occurs
        on several lines, the last one wins.

    Raises
    ------
    ValueError
        If a component cannot be parsed as a float.
    """
    vectors = {}
    # The context manager guarantees the file is closed, even on a parse error.
    with open(filename, 'r') as fin:
        for line in fin:
            if not line.strip():
                continue  # blank line
            fields = line.replace('[', ',').replace(']', '').strip().split(',')
            fields = [item.strip() for item in fields]
            vectors[fields[0]] = [float(item) for item in fields[1:]]
    return vectors

# Load both embedding tables once when this cell runs; the functions below take them as default arguments.
fastvectors = getfastvectors()
lvectors = getlvec()

def hasvector(line, fastvectors = fastvectors, lvectors = lvectors):
    """Tell whether both words of a word-list row exist in both embedding tables.

    Parameters
    ----------
    line : str
        A raw comma-separated row; fields 2 and 3 (0-based) are the words checked.
    fastvectors, lvectors : dict
        Word -> vector tables to test membership against.

    Returns
    -------
    bool
        True only if both words are keys of both tables. Rows with fewer than
        four fields return False instead of raising IndexError.
    """
    fields = line.strip().split(',')
    if len(fields) < 4:
        return False
    return all(word in table
               for word in (fields[2], fields[3])
               for table in (fastvectors, lvectors))
        
def maketraindata(filename = Q4List):
    """Read the word list and collect the rows whose two words both have vectors.

    The header line is skipped. For every remaining row accepted by
    ``hasvector``, field 3 is appended to ``trainx``, field 1 to ``affix`` and
    field 2 to ``trainy`` (0-based, comma-separated fields).

    Parameters
    ----------
    filename : str
        Path of the comma-separated word list.

    Returns
    -------
    (list, list, list)
        ``(trainx, affix, trainy)``; the three lists have equal length and are
        aligned row by row.
    """
    trainx = []
    trainy = []
    affix = []
    # The context manager closes the word list once it has been consumed.
    with open(filename, "r") as fin:
        fin.readline()  # skip the header row
        for line in fin:
            if not hasvector(line):
                continue
            fields = line.strip().split(',')
            affix.append(fields[1])
            trainx.append(fields[3])
            trainy.append(fields[2])
    return trainx, affix, trainy


In [7]:
def get_nninput(vectype, affixdict = affixdict, fastvectors = fastvectors, lvectors = lvectors):
    """Build the train/test matrices for the derived-word model.

    Each input row is the vector of an x-word from ``maketraindata`` with the
    one-hot vector of its affix appended; each target row is the vector of the
    matching y-word. The first 80% of the rows form the training split and the
    remaining rows the test split.

    Parameters
    ----------
    vectype : str
        ``"fast"`` to take vectors from ``fastvectors`` or ``"lazarido"`` to
        take them from ``lvectors``.
    affixdict : dict
        Affix -> one-hot list.
    fastvectors, lvectors : dict
        Word -> vector tables.

    Returns
    -------
    (train_X, test_X, train_y, test_y) : tuple of numpy.ndarray

    Raises
    ------
    ValueError
        If ``vectype`` is not one of the two supported names. Previously the
        function printed a message and returned None, which made callers fail
        later while unpacking the result.
    """
    if vectype not in ("fast", "lazarido"):
        raise ValueError("Vector type not specified correctly: %r" % (vectype,))

    trainx, affix, trainy = maketraindata()
    affixvec = np.array([affixdict[item] for item in affix])
    print(affixvec.shape)
    trainxfast = np.array([fastvectors[item] for item in trainx])
    trainyfast = np.array([fastvectors[item] for item in trainy])
    trainxfast = np.hstack((trainxfast, affixvec))
    print(trainxfast.shape, trainyfast.shape)

    trainxl = np.array([lvectors[item] for item in trainx])
    trainyl = np.array([lvectors[item] for item in trainy])
    trainxl = np.hstack((trainxl, affixvec))

    split = int(0.8 * len(trainxfast))
    # The training slices start at row 0; the earlier `1:split` dropped the first example.
    if vectype == "fast":
        return trainxfast[:split], trainxfast[split:], trainyfast[:split], trainyfast[split:]
    return trainxl[:split], trainxl[split:], trainyl[:split], trainyl[split:]



In [9]:
# Implementation of a simple MLP network with one hidden layer. Tested on the iris data set.
# Requires: numpy, sklearn>=0.18.1, tensorflow>=1.0

import tensorflow as tf
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split

# Fix TensorFlow's random seed before any graph ops are created
# (init_weights below draws its initial values with tf.random_normal).
RANDOM_SEED = 42
tf.set_random_seed(RANDOM_SEED)


def init_weights(shape):
    """Return a ``tf.Variable`` of the given shape, initialised from a normal
    distribution with standard deviation 0.1."""
    return tf.Variable(tf.random_normal(shape, stddev=0.1))

def forwardprop(X, w_1, w_2):
    """Compute the network output for input ``X``.

    The hidden activations are the sigmoid of ``X`` times ``w_1``; the output is
    the hidden activations times ``w_2`` with no further activation applied.
    """
    hidden = tf.nn.sigmoid(tf.matmul(X, w_1))
    return tf.matmul(hidden, w_2)

def network(vectype, h_size=30, epochs=2, learning_rate=0.01):
    """Train a one-hidden-layer MLP on the derived-word data and predict the test targets.

    The data comes from ``get_nninput(vectype)``. The model is trained with
    plain gradient descent, one example per update, minimising the root mean
    squared error between the predicted and the target vectors.

    Parameters
    ----------
    vectype : str
        Passed through to ``get_nninput`` to select the embedding table.
    h_size : int, optional
        Number of hidden units (default 30).
    epochs : int, optional
        Number of passes over the training set (default 2).
    learning_rate : float, optional
        Step size of the gradient-descent optimizer (default 0.01).

    Returns
    -------
    numpy.ndarray
        Predicted vectors for the test inputs, one row per test example.
    """
    train_X, test_X, train_y, test_y = get_nninput(vectype)

    # Layer sizes are taken from the data.
    x_size = train_X.shape[1]   # width of an input row (word vector + one-hot affix)
    y_size = train_y.shape[1]   # width of a target row (word vector)

    # Symbols
    X = tf.placeholder("float", shape=[None, x_size])
    y = tf.placeholder("float", shape=[None, y_size])

    # Weight initializations
    w_1 = init_weights((x_size, h_size))
    w_2 = init_weights((h_size, y_size))

    # Forward propagation: the raw network output is the prediction (regression).
    yhat = forwardprop(X, w_1, w_2)
    predict = yhat

    # Backward propagation: root mean squared error over all output components.
    cost = tf.sqrt(tf.reduce_mean(tf.square(tf.subtract(y, yhat))))
    updates = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost)

    # The context manager closes the session even if training raises.
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        for epoch in range(epochs):
            # Train with each example
            for i in range(len(train_X)):
                sess.run(updates, feed_dict={X: train_X[i: i + 1], y: train_y[i: i + 1]})

        # The prediction depends only on X, so the targets need not be fed.
        p = sess.run(predict, feed_dict={X: test_X})

    print(len(p))
    return p

In [19]:
def getderivedvectors(fastvecdict = fastvectors, lvecdict = lvectors):
    """Train the model on fastText vectors and write the test-split vectors to disk.

    Output vectors of 3 files (written to the current working directory):
    1) AnsFastText.txt - fastText vectors of the derived words in the test split
    2) AnsLzaridou.txt - Lazaridou vectors of the same derived words
    3) AnsModel.txt - vectors for the same derived words as predicted by the model

    Every line has the form ``word<space>dim1<space>dim2 ... <space>dimN``.

    Parameters
    ----------
    fastvecdict, lvecdict : dict
        Word -> vector tables used for files 1 and 2 respectively.
    """
    ansmodelname = 'AnsModel.txt'
    anslzname = 'AnsLzaridou.txt'
    ansfname = 'AnsFastText.txt'

    wordx, affix, wordy = maketraindata()
    # Same 80/20 split point as get_nninput, so testwordy[i] is the target word
    # of the i-th row predicted by the network.
    split = int(0.8 * len(wordx))
    testwordy = wordy[split:]

    p1 = network("fast")  # Model is being trained on FastText

    # The context manager flushes and closes all three files. Iterating from the
    # first prediction also writes the first test word, which the old
    # `range(1, len(p1))` loop skipped.
    with open(ansmodelname, 'w') as ansm, \
            open(ansfname, 'w') as ansf, \
            open(anslzname, 'w') as anslz:
        for word, predicted in zip(testwordy, p1):
            ansm.write(word + ' ')
            ansm.write(' '.join([str(item) for item in predicted]))
            ansm.write('\n')

            ansf.write(word + ' ')
            ansf.write(' '.join(str(item) for item in fastvecdict[word]) + '\n')

            anslz.write(word + ' ')
            anslz.write(' '.join(str(item) for item in lvecdict[word]) + '\n')
    
    


In [20]:
def derivedWOrdTask(inputFile = Q4List):
    """Run the derived-word task by delegating to ``getderivedvectors``.

    Output vectors of 3 files:
    1) AnsFastText.txt - fastText vectors of derived words in wordList.csv
    2) AnsLzaridou.txt - Lazaridou vectors of the derived words in wordList.csv
    3) AnsModel.txt - Vectors for derived words as provided by the model

    For all three files, each line should contain a derived word and its
    vector, exactly like the format followed in "glove.6B.300d.txt":

        word<space>dim1<space>dim2........<space>dimN
        charitably 256.238 0.875 ...... 1.234

    Intended return value (NOT implemented yet; the function returns None):
    the averaged cosine similarity between corresponding words of files 1 and
    3, and of files 2 and 3. For example, with derived words word1..word3,
    compute the cosine similarity of word1 in AnsFastText.txt and word1 in
    AnsModel.txt, repeat for word2 and word3, and average the three values;
    then do the same between AnsLzaridou.txt and AnsModel.txt.

    NOTE(review): ``inputFile`` is accepted but not forwarded;
    ``getderivedvectors`` is called with its own defaults.
    """
    print('hello world')
    getderivedvectors()
    # TODO: return cosVal1, cosVal2 as described in the docstring.
# Run the task when this cell executes (calls getderivedvectors, which trains the model and writes the answer files).
derivedWOrdTask()

hello world
(8075, 54)
(8075, 254) (8075, 200)
(8075, 54)
(8075, 254) (8075, 200)
1615
