Notebook dealing with the final preprocessing steps for the neural networks.


In [3]:
import torch
import torch.nn as nn
import torchvision.transforms as transforms
import torch.utils.data
import pandas as pd
from torch.utils import data
from numpy import array
from numpy import argmax
import argparse
from torch.autograd import Variable
from torch import optim
import numpy as np
import os
from sklearn.model_selection import train_test_split
import logging
import pickle as pkl
import warnings
pd.options.mode.chained_assignment = None
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"]="1"
import glob
import random
random.seed(1991)
#torch.set_default_tensor_type('torch.cuda.DoubleTensor')

In [4]:
torch.manual_seed(1991)
if not torch.cuda.is_available():
    
        print("WARNING: You have a CUDA device, so you should probably run with --cuda")

device = torch.device("cuda")

In [5]:
novel_compounds_list = pkl.load( open( "/data/dharp/compounding/datasets/novel_compounds_list.pkl", "rb" ) )
m, h = zip(*novel_compounds_list)
heads_list=list(set(h))
modifiers_list=list(set(m))

In [6]:
constituents=pd.read_pickle("/data/dharp/compounding/datasets/constituents_CompoundAgnostic_DecadeAgnostic_300.pkl")
constituents.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
common,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
a_n,0.946819,-0.21591,-0.078663,0.098186,0.031344,-0.128517,-0.040655,-0.05266,0.022259,0.114906,...,-5.2e-05,-6e-06,3.9e-05,-4e-05,9.2e-05,1.9e-05,4e-06,1e-05,6.1e-05,7e-05
aa_n,0.802237,-0.200831,-0.027173,0.068958,-0.026429,-0.094564,0.034699,0.020185,0.017012,-0.060455,...,-0.014317,-0.006173,0.104329,0.05988,-0.047576,0.02188,0.034428,-0.081614,-0.009093,-0.025753
aaa_n,0.538223,-0.113592,0.006146,0.05401,-0.015412,-0.045805,0.012455,0.015334,-0.008608,0.024122,...,0.019392,0.01116,0.117384,0.046802,0.023796,0.016095,0.057174,-0.077713,0.033013,-0.003302
aaaa_n,0.006426,0.002476,0.000166,-0.011107,0.032347,-0.008947,-0.01136,-0.002168,0.004395,-0.008244,...,-0.064327,-0.01799,0.271208,0.101174,-0.167501,0.13928,0.11452,-0.188355,0.010231,-0.082619
aaaaa_n,0.002501,-0.002978,-0.001491,-0.000103,0.01968,-0.011237,-0.007322,-0.002661,0.00479,-0.000987,...,-0.08108,-0.024664,0.289407,0.111249,-0.173599,0.167372,0.120406,-0.197147,0.019597,-0.085016


In [7]:
heads=constituents.loc[constituents.index.isin(heads_list)]
heads.index.names=['head']
heads.info()
heads.head()

<class 'pandas.core.frame.DataFrame'>
Index: 7619 entries, a_n to zwingli_n
Columns: 300 entries, 0 to 299
dtypes: float64(300)
memory usage: 17.5+ MB


Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
head,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
a_n,0.946819,-0.21591,-0.078663,0.098186,0.031344,-0.128517,-0.040655,-0.05266,0.022259,0.114906,...,-5.2e-05,-6e-06,3.9e-05,-4e-05,9.2e-05,1.9e-05,4e-06,1e-05,6.1e-05,7e-05
aaron_n,0.632065,-0.157938,-0.028958,0.063066,-0.003585,-0.006186,0.037621,0.056535,-0.007278,-0.009055,...,-0.030316,0.066355,-0.032037,-0.06677,-0.079918,-0.041611,-0.02712,-0.032024,0.004755,-0.064226
ab_n,0.814786,-0.224023,-0.051034,0.065876,0.0048,-0.142471,0.05635,0.04805,-0.043411,-0.136472,...,0.039908,0.015763,0.01081,0.025355,-0.022604,-0.00605,-0.02811,-0.031565,-0.053186,-0.058053
abandonment_n,0.870733,-0.149542,-0.024366,0.037407,3.8e-05,-0.025059,0.029089,0.02773,-0.010006,-0.068209,...,-0.043914,-0.008754,0.037302,0.01737,0.006246,0.011758,0.018942,0.001336,-0.048321,-0.024098
abbreviation_n,0.679414,-0.186188,-0.05021,0.102159,-0.030931,-0.10505,0.053795,0.036983,-0.03468,-0.069881,...,0.017008,0.006853,-0.0494,0.007128,-0.004563,0.013209,-0.002722,-0.015055,0.007906,0.009347


In [8]:
modifiers=constituents.loc[constituents.index.isin(modifiers_list)]
modifiers.index.names=['modifier']
modifiers.info()
modifiers.head()

<class 'pandas.core.frame.DataFrame'>
Index: 7901 entries, a_n to zuni_n
Columns: 300 entries, 0 to 299
dtypes: float64(300)
memory usage: 18.1+ MB


Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
modifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
a_n,0.946819,-0.21591,-0.078663,0.098186,0.031344,-0.128517,-0.040655,-0.05266,0.022259,0.114906,...,-5.2e-05,-6e-06,3.9e-05,-4e-05,9.2e-05,1.9e-05,4e-06,1e-05,6.1e-05,7e-05
aaa_n,0.538223,-0.113592,0.006146,0.05401,-0.015412,-0.045805,0.012455,0.015334,-0.008608,0.024122,...,0.019392,0.01116,0.117384,0.046802,0.023796,0.016095,0.057174,-0.077713,0.033013,-0.003302
aarhus_n,0.031769,0.051368,0.015322,-0.057539,0.106346,0.017514,0.002858,-0.010991,0.008487,-0.018179,...,-0.015531,0.006199,0.009077,-0.007065,0.012976,0.012604,0.003189,0.014277,0.01472,-0.001458
abbreviated_n,0.708724,-0.158414,-0.037277,-0.021225,0.008293,-0.138097,0.047325,0.025533,-0.034397,-0.090336,...,-0.00502,-0.003305,0.060561,-0.017757,-0.00027,0.071692,0.027068,-0.043158,-0.004612,-0.025402
abduction_n,0.510707,-0.074578,-0.034553,-0.048555,0.049102,-0.010484,-0.032769,-0.006869,0.046021,-0.029008,...,-0.099285,0.070637,0.035214,0.000986,0.045005,-0.091899,0.032369,0.035032,0.044302,-0.060663


In [9]:
novel_compounds=pd.DataFrame(novel_compounds_list)
novel_compounds.columns=['modifier','head']

In [10]:
positive_df=pd.merge(novel_compounds,heads.reset_index(),on=["head"])
positive_df=pd.merge(positive_df,modifiers.reset_index(),on=["modifier"])
#positive_df['Plausibility']=True
positive_df.set_index(['modifier','head'],inplace=True)
positive_df.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 25920 entries, (a_n, peaceful_n) to (xxviii_n, olympiad_n)
Columns: 600 entries, 0_x to 299_y
dtypes: float64(600)
memory usage: 118.9+ MB


In [11]:
head_cols=positive_df.columns[:300]
positive_heads=positive_df[head_cols]
positive_heads.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 25920 entries, (a_n, peaceful_n) to (xxviii_n, olympiad_n)
Columns: 300 entries, 0_x to 299_x
dtypes: float64(300)
memory usage: 59.5+ MB


In [12]:
modifier_cols=positive_df.columns[300:]
positive_modifiers=positive_df[modifier_cols]
positive_modifiers.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 25920 entries, (a_n, peaceful_n) to (xxviii_n, olympiad_n)
Columns: 300 entries, 0_y to 299_y
dtypes: float64(300)
memory usage: 59.5+ MB


In [13]:
positive_heads_tensor = torch.tensor(positive_heads.values)
positive_heads_tensor.shape

torch.Size([25920, 300])

In [14]:
positive_modifiers_tensor = torch.tensor(positive_modifiers.values)
positive_modifiers_tensor.shape

torch.Size([25920, 300])

In [15]:
positive_Y=torch.ones(positive_modifiers_tensor.shape[0])
positive_Y.shape

torch.Size([25920])

In [16]:
positive_class=torch.cat((positive_modifiers_tensor, positive_heads_tensor), 1)
positive_class.shape

torch.Size([25920, 600])

In [17]:
def neg_df_creator(file):
    pkl_file=pkl.load( open(file,'rb'))
    df=pd.DataFrame(pkl_file)
    
    df.columns=['modifier','head']
    
    negative_df=pd.merge(df,heads.reset_index(),on=["head"])
    negative_df=pd.merge(negative_df,modifiers.reset_index(),on=["modifier"])

    negative_df.set_index(['modifier','head'],inplace=True)
    shape_neg=negative_df.shape[1]
    head_cols=negative_df.columns[:shape_neg]
    negative_heads=negative_df[head_cols]
    
    modifier_cols=negative_df.columns[shape_neg:]
    negative_modifiers=negative_df[modifier_cols]
    
    negative_heads_tensor = torch.tensor(negative_heads.values)
    negative_modifiers_tensor = torch.tensor(negative_modifiers.values)

    negative_Y=torch.zeros(negative_modifiers_tensor.shape[0])
    negative_class=torch.cat((negative_modifiers_tensor, negative_heads_tensor), 1)

    return negative_class,negative_Y

In [18]:
def tensor_joiner(files):
    tensor_list=[]
    for file in files:
        negative_class,negative_Y=neg_df_creator(file)
        X=torch.cat((positive_class, negative_class), 0)
        Y=torch.cat((positive_Y,negative_Y),0)
        tensor_list.append([X,Y])
    
    return tensor_list

In [19]:
corrupt_modifier_files=[]
for file in glob.glob("/data/dharp/compounding/datasets/corrupt_modifier*"):
    corrupt_modifier_files.append(file)
corrupt_modifiers=tensor_joiner(corrupt_modifier_files)

In [20]:
corrupt_head_files=[]
for file in glob.glob("/data/dharp/compounding/datasets/corrupt_head*"):
    corrupt_head_files.append(file)
corrupt_heads=tensor_joiner(corrupt_head_files)

In [21]:
input_size = positive_class.shape[1]
hidden_size = 300
num_classes = 2
num_epochs = 50
batch_size = 100
learning_rate = 0.001

In [22]:
class NeuralNet(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super(NeuralNet, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size) 
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, num_classes,bias=False)  
    
    def forward(self, x):
        out = self.fc1(x)
        out = self.relu(out)
        out = self.fc2(out)
        return out

model = NeuralNet(input_size, hidden_size, num_classes).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate) 

In [23]:
def looper(datasets):
    total_accuracy=[]
    
    for dataset in datasets:
        X=dataset[0]
        Y=dataset[1]
        n = len(X)  # how many total elements you have
        n_test = int( n * .2 )  # number of test/val elements
        n_train = n - n_test
        idx = list(range(n))  # indices to all elements
        random.shuffle(idx)  # in-place shuffle the indices to facilitate random splitting
        train_idx = idx[:n_train]
        test_idx = idx[n_train:]
        trX=X[train_idx].float().to(device)
        teX=X[test_idx].float().to(device)

        trY=Y[train_idx].long().to(device)
        teY=Y[test_idx].long().to(device)
        model = NeuralNet(input_size, hidden_size, num_classes).to(device)
        criterion = nn.CrossEntropyLoss()
        optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
        n_examples=trX.shape[0]
        for i in range(num_epochs):

            cost = 0.

            num_batches = n_examples // batch_size
            for k in range(num_batches):
                start, end = k * batch_size, (k + 1) * batch_size
                outputs = model(trX[start:end])
                loss = criterion(outputs, trY[start:end])

                optimizer.zero_grad()
                loss.backward()
                
                optimizer.step()

            #if (k+1) % 100 == 0:
            #print ('Epoch [{}/{}], Loss: {:.4f}'.format(i+1, num_epochs, loss.item()))

        with torch.no_grad():
            correct=0
            total=0
            outputs = model(teX)
            _, predicted = torch.max(outputs.data, 1)
            total += teY.size(0)
            correct += (predicted == teY).sum().item()
        curr_acc=100 * correct / total
        print(curr_acc)
        total_accuracy.append(curr_acc)
    return total_accuracy
        

In [24]:
cor_head_acc=looper(corrupt_heads)

72.68255040030867
72.67290440821839
71.69865920709945
73.02980611555898
69.93344265457702
72.65361242403782
73.42529179126073
72.31600270087779
72.47998456641265
72.89476222629497


65.83389601620527
65.03327867271149
65.27442847496864
63.66354779589081
64.70531494164175
65.39018038005209
64.8114208546349
66.05575383428186
64.46416513938459
64.99469470435034

In [25]:
round(np.mean(cor_head_acc),2)

72.38

In [26]:
round(np.std(cor_head_acc),2)

0.92

In [27]:
cor_mod_acc=looper(corrupt_modifiers)

74.87244897959184
75.9386671908787
74.231260438157
75.03927729772192
75.57341697706332
74.61094254673583
75.02452422993917
75.67753338570306
74.09041875061293
75.16936671575847


61.92111459968603
63.111853744839784
61.80371352785146
62.81421838177533
63.99725544010978
62.00450230008809
62.016872670198154
62.5
63.13621653427479
62.837506136475206

In [28]:
round(np.mean(cor_mod_acc),2)

75.02

In [29]:
round(np.std(cor_mod_acc),2)

0.57