In [21]:
import gzip
import os
import random
import tensorflow as tf
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split


## paths to files. Do not change this
simInputFile = "Q1/word-similarity-dataset"
analogyInputFile = "Q1/analogy_dataset.txt"
vectorgzipFile = "Q1/glove.6B.300d.txt.gz"
vectorTxtFile = "Q1/glove.6B.300d.txt"   # If you extract and use the gz file, use this.
analogyTrainPath = "Q1/wordRep/"
simOutputFile = "Q1/simOutput.csv"
simSummaryFile = "Q1/simSummary.csv"
anaSOln = "Q1/analogySolution.csv"
Q4List = "Q4/wordList.csv"





In [3]:
'''
Stores affixes and their position in the vector
'''
def affix_vec(filename):
    fin = open(filename,"r")
    affix_dict={}
    val=0
    fin.readline()
    for line in fin:
        line = [item.rstrip() for item in line.rstrip().split(',')]
        if line[1] not in affix_dict:
            affix_dict[line[1]] = val
            val+=1
    fin.close()
    return affix_dict
#affix_vec(Q4List)

In [4]:
'''
Given an affix , it will return the one hot vector representation  
'''
def one_hot(affix,affix_dict):
    size = len(affix_dict)
    oneHot = [0]*size
    oneHot[affix_dict[affix]] =1
    return oneHot
#affix_dict = affix_vec(Q4List)
#one_hot('wise',affix_dict)

In [6]:
'''
Prepares list of all words given in wordList.csv file
'''
def prepare_list(filename,fastvec,lvec):
    affix_list =[]
    src_list =[]
    der_list =[]
    fin = open(filename,"r")
    fin.readline()
    for line in fin:
        line = [item.rstrip() for item in line.rstrip().split(',')]
        if(line[2] in fastvec and line[2] in lvec and line[3] in fastvec and line[3] in lvec):
            affix_list.append(line[1])
            src_list.append(line[3])
            der_list.append(line[2])
    return affix_list,src_list,der_list

In [18]:
'''
Prepares training data and testing data 
'''
def format_input(filename,fastvec,lvec):
    affix_list,src_list,der_list = prepare_list(filename,fastvec,lvec)
    affix_dict = affix_vec(Q4List)
    f_trainx = np.array([[float(vec) for vec in fastvec[item]] for item in src_list])
    f_trainy = np.array([[float(vec) for vec in fastvec[item]] for item in der_list])
    traina = np.array([one_hot(affix,affix_dict) for affix in affix_list])
    f_trainx= np.hstack((f_trainx,traina))
    
    l_trainx = np.array([[float(vec) for vec in lvec[item]] for item in src_list])
    l_trainy = np.array([[float(vec) for vec in lvec[item]] for item in der_list])
    l_trainx= np.hstack((l_trainx,traina))
    
    return f_trainx,f_trainy,l_trainx,l_trainy



(8075, 254) (8075, 200) (8075, 404) (8075, 350)


In [7]:
'''
Prepares dict of words and their corresponding vector given in fastText_vectors.txt and vector_lazaridou.txt files
'''
def prepare_word_dict_l(filename = 'Q4/vector_lazaridou.txt'):
    dict_l ={}
    fin =open(filename,"r")
    for line in fin:
        line = line.replace('[',',')
        line = line.replace(']','')
        vec = line.strip().split(',')
        keyword = vec[0].strip()
        vec = [float(item.strip()) for item in vec[1:]]
        dict_l[keyword] = vec
    fin.close()
    return dict_l
    
def prepare_word_dict_f(filename = 'Q4/fastText_vectors.txt'):
    dict_f ={}
    fin =open(filename,"r")
    for line in fin:
        vec = line.strip().split()
        keyword = vec[0].strip()
        vec = [float(item.strip()) for item in vec[1:]]
        dict_f[keyword] = vec
    fin.close()
    return dict_f

In [29]:
RANDOM_SEED = 42
tf.set_random_seed(RANDOM_SEED)


def init_weights(shape):
    """ Weight initialization """
    weights = tf.random_normal(shape, stddev=0.1)
    return tf.Variable(weights)

def forwardprop(X, w_1, w_2):
    """
    Forward-propagation.
    IMPORTANT: yhat is not softmax since TensorFlow's softmax_cross_entropy_with_logits() does that internally.
    """
    h    = tf.nn.sigmoid(tf.matmul(X, w_1))  # The \sigma function
    yhat = tf.matmul(h, w_2)  # The \varphi function
    return yhat

def trainnet(t):
    fastvec = prepare_word_dict_f()
    lvec = prepare_word_dict_l()
    f_trainx,f_trainy,l_trainx,l_trainy = format_input(Q4List,fastvec,lvec)
    #print(f_trainx.shape,f_trainy.shape,l_trainx.shape,l_trainy.shape)
    
    pos = int(0.8*len(f_trainx))
    if(t ==0):
        train_X = f_trainx[:pos]
        train_Y = f_trainy[:pos]
        test_X = f_trainx[pos:]
        test_Y = f_trainy[pos:]
    else:
        train_X = l_trainx[:pos]
        train_Y = l_trainy[:pos]
        test_X = l_trainx[pos:]
        test_Y = l_trainy[pos:]
    #print(len(train_X),len(train_Y))
    
    # Layer's sizes
    x_size = train_X.shape[1]   # Number of input nodes: 4 features and 1 bias
    h_size = 30                # Number of hidden nodes
    y_size = train_y.shape[1]   # Number of outcomes (3 iris flowers)

    # Symbols
    X = tf.placeholder("float", shape=[None, x_size])
    y = tf.placeholder("float", shape=[None, y_size])

    # Weight initializations
    w_1 = init_weights((x_size, h_size))
    w_2 = init_weights((h_size, y_size))

    # Forward propagation
    yhat    = forwardprop(X, w_1, w_2)
    predict = yhat

    # Backward propagation
    cost    = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=y, logits=yhat))
    updates = tf.train.GradientDescentOptimizer(0.01).minimize(cost)

    # Run SGD
    sess = tf.Session()
    init = tf.global_variables_initializer()
    sess.run(init)

    for epoch in range(5):
        # Train with each example
        for i in range(len(train_X)):
            sess.run(updates, feed_dict={X: train_X[i: i + 1], y: train_y[i: i + 1]})
    p = sess.run(predict, feed_dict={X: train_X, y: train_y})
        '''
        train_accuracy = np.mean(np.argmax(train_y, axis=1) ==
                                 sess.run(predict, feed_dict={X: train_X, y: train_y}))
        test_accuracy  = np.mean(np.argmax(test_y, axis=1) ==
                                 sess.run(predict, feed_dict={X: test_X, y: test_y}))
        '''
        print("Epoch = %d, train accuracy = %.2f%%, test accuracy = %.2f%%"
              % (epoch + 1, 100. * train_accuracy, 100. * test_accuracy))

    sess.close()
    return p



In [None]:
def derivedWOrdTask(inputFile = Q4List):
    print('hello world')
    affix_dict = affix_vec(Q4List)
    """
    Output vectors of 3 files:
    1)AnsFastText.txt - fastText vectors of derived words in wordList.csv
    2)AnsLzaridou.txt - Lazaridou vectors of the derived words in wordList.csv
    3)AnsModel.txt - Vectors for derived words as provided by the model
    
    For all the three files, each line should contain a derived word and its vector, exactly like 
    the format followed in "glove.6B.300d.txt"
    
    word<space>dim1<space>dim2........<space>dimN
    charitably 256.238 0.875 ...... 1.234
    
    """
    
    """
    The function should return 2 values
    1) Averaged cosine similarity between the corresponding words from output files 1 and 3, as well as 2 and 3.
    
        - if there are 3 derived words in wordList.csv, say word1, word2, word3
        then find the cosine similiryt between word1 in AnsFastText.txt and word1 in AnsModel.txt.
        - Repeat the same for word2 and word3.
        - Average the 3 cosine similarity values
        - DO the same for word1 to word3 between the files AnsLzaridou.txt and AnsModel.txt 
        and average the cosine simialities for valuse so obtained
        
    """
    return cosVal1,cosVal2
    

In [30]:
trainnet(0)

6460 6460


NameError: name 'train_X' is not defined