Notebook dealing with the final preprocessing steps for the neural networks.


In [1]:
import torch
import torch.nn as nn
import torchvision.transforms as transforms
import torch.utils.data
import pandas as pd
from torch.utils import data
from numpy import array
from numpy import argmax
import argparse
from torch.autograd import Variable
from torch import optim
import numpy as np
import os
from sklearn.model_selection import train_test_split
import logging
import pickle as pkl
import warnings
pd.options.mode.chained_assignment = None
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"]="1"
import glob
import random
random.seed(1991)
#torch.set_default_tensor_type('torch.cuda.DoubleTensor')

In [2]:
torch.manual_seed(1991)
if not torch.cuda.is_available():
    
        print("WARNING: You have a CUDA device, so you should probably run with --cuda")

device = torch.device("cuda")

In [3]:
novel_compounds_list = pkl.load( open( "/data/dharp/compounding/datasets/novel_compounds_list.pkl", "rb" ) )
m, h = zip(*novel_compounds_list)
heads_list=list(set(h))
modifiers_list=list(set(m))

In [4]:
constituents=pd.read_pickle("/data/dharp/compounding/datasets/constituents_CompoundAgnostic_DecadeCentric_300.pkl")
constituents.index.names=['constituent','decade']
constituents.reset_index(inplace=True)
constituents.info()
constituents=constituents.drop(['decade'],axis=1).groupby(['constituent']).mean()
constituents

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1042962 entries, 0 to 1042961
Columns: 302 entries, constituent to 299
dtypes: float64(300), int64(1), object(1)
memory usage: 2.3+ GB


Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
constituent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
a_n,0.934070,-0.206074,-0.094469,0.132060,0.027215,-0.108181,0.004351,-0.036574,0.101759,0.006115,...,0.003794,-0.000669,0.000357,-0.001013,-0.000677,0.001197,-0.000380,-0.002248,0.000191,0.001036
aa_n,0.514024,-0.122839,-0.024151,0.014367,-0.028287,-0.034887,0.057167,0.036987,-0.087334,0.025848,...,0.033355,-0.002466,0.046454,0.019335,-0.045772,0.007672,0.071543,-0.075893,0.046748,0.073522
aaa_n,0.307183,-0.073783,0.006075,0.038791,0.019985,0.003164,0.001250,0.018270,-0.046140,0.016035,...,0.070460,-0.046085,0.032522,-0.004979,-0.089520,0.048931,0.045879,-0.136788,0.038309,0.106437
aaaa_n,0.005116,-0.001246,-0.001077,-0.006884,0.029136,-0.016154,-0.003653,0.000107,-0.005070,0.006983,...,0.129152,-0.060062,0.083847,-0.005262,-0.143907,0.064368,0.076728,-0.224940,0.092423,0.169547
aaaaa_n,0.003209,-0.003896,-0.001936,-0.000660,0.022805,-0.016628,-0.001137,-0.000424,-0.000584,0.007936,...,0.129997,-0.056457,0.083899,-0.006093,-0.151344,0.066702,0.074083,-0.226053,0.096750,0.181389
aaai_n,0.058328,0.052127,0.017591,-0.128646,-0.003524,0.008867,-0.026797,-0.011245,0.014027,-0.006052,...,-0.013529,-0.100185,-0.011484,0.008398,-0.144398,0.037981,-0.047511,0.116991,0.009479,0.013338
aab_n,0.623938,-0.190497,-0.048996,0.084083,-0.036512,-0.122264,0.119752,0.052350,-0.107889,0.031893,...,0.008701,0.011902,0.013396,-0.010790,0.039814,0.019485,-0.025332,0.016637,-0.000867,-0.030976
aac_n,0.755391,-0.205704,-0.049148,0.063934,-0.073319,-0.109713,0.136875,0.066215,-0.130467,0.025048,...,-0.020570,-0.028949,0.046229,0.009654,0.000480,-0.032832,0.061732,-0.091646,0.036139,-0.038949
aaddison_n,0.009094,0.005021,0.000701,-0.021291,0.025256,-0.009836,-0.008737,-0.002332,-0.007740,0.002507,...,0.053356,-0.052932,0.066102,-0.081532,-0.078244,-0.059156,0.080233,-0.002835,0.050700,-0.001261
aaf_n,0.061113,0.053747,0.016583,-0.131646,0.027088,0.059114,-0.023257,0.002065,-0.031703,-0.023231,...,0.012534,-0.009030,-0.027312,-0.048331,0.020825,0.002814,-0.044882,-0.072378,0.021416,-0.068954


In [5]:
heads=constituents.loc[constituents.index.isin(heads_list)]
heads.index.names=['head']
heads.info()
heads.head()

<class 'pandas.core.frame.DataFrame'>
Index: 7619 entries, a_n to zwingli_n
Columns: 300 entries, 0 to 299
dtypes: float64(300)
memory usage: 17.5+ MB


Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
head,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
a_n,0.93407,-0.206074,-0.094469,0.13206,0.027215,-0.108181,0.004351,-0.036574,0.101759,0.006115,...,0.003794,-0.000669,0.000357,-0.001013,-0.000677,0.001197,-0.00038,-0.002248,0.000191,0.001036
aaron_n,0.692965,-0.187733,-0.040114,0.097325,-0.013611,0.015357,0.047002,0.060145,-0.010663,-0.010431,...,0.00057,0.004532,0.023021,-0.006347,0.028784,0.040798,-0.026946,0.066203,-0.033885,-0.008613
ab_n,0.820441,-0.239632,-0.060073,0.094867,-0.025801,-0.111078,0.117711,0.06244,-0.144626,0.033497,...,0.014204,-0.029136,-0.029751,-0.006354,-0.036222,-0.005165,0.003839,0.011766,-0.054634,0.012014
abandonment_n,0.830606,-0.149441,-0.042713,0.049928,-0.008147,-0.008204,0.040755,0.027962,-0.052945,0.000655,...,-0.024129,0.018861,0.044675,-0.029112,-0.006222,0.044334,-0.008556,0.019207,-0.000174,-0.007981
abbreviation_n,0.73942,-0.216979,-0.055873,0.118039,-0.040882,-0.081541,0.086102,0.041131,-0.081158,0.044117,...,0.005833,0.021772,-0.01922,-0.004647,0.010557,0.034357,-0.016216,-0.011845,0.010076,-0.009522


In [6]:
modifiers=constituents.loc[constituents.index.isin(modifiers_list)]
modifiers.index.names=['modifier']
modifiers.info()
modifiers.head()

<class 'pandas.core.frame.DataFrame'>
Index: 7901 entries, a_n to zuni_n
Columns: 300 entries, 0 to 299
dtypes: float64(300)
memory usage: 18.1+ MB


Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
modifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
a_n,0.93407,-0.206074,-0.094469,0.13206,0.027215,-0.108181,0.004351,-0.036574,0.101759,0.006115,...,0.003794,-0.000669,0.000357,-0.001013,-0.000677,0.001197,-0.00038,-0.002248,0.000191,0.001036
aaa_n,0.307183,-0.073783,0.006075,0.038791,0.019985,0.003164,0.00125,0.01827,-0.04614,0.016035,...,0.07046,-0.046085,0.032522,-0.004979,-0.08952,0.048931,0.045879,-0.136788,0.038309,0.106437
aarhus_n,0.052376,0.184254,-0.059967,-0.001672,0.108132,0.025534,0.031214,-0.009343,-0.003059,-0.00392,...,-0.005545,-0.015922,0.003373,0.002559,-0.015542,-0.011126,0.001885,0.002987,0.005774,-0.007526
abbreviated_n,0.606597,-0.136342,-0.035223,-0.027291,-0.008463,-0.107055,0.078737,0.016746,-0.061991,0.020858,...,0.034637,0.030939,-0.044476,-0.069989,0.042256,0.006401,0.019639,-0.034508,-0.040093,0.126693
abduction_n,0.416726,-0.013394,-0.054859,-0.030775,0.033412,-0.018789,-0.018904,-0.006397,0.006432,0.037732,...,0.008406,0.0102,0.023553,0.028624,0.030171,-0.038808,0.000347,-0.000422,0.036431,-0.004485


In [7]:
novel_compounds=pd.DataFrame(novel_compounds_list)
novel_compounds.columns=['modifier','head']

In [8]:
positive_df=pd.merge(novel_compounds,heads.reset_index(),on=["head"])
positive_df=pd.merge(positive_df,modifiers.reset_index(),on=["modifier"])
#positive_df['Plausibility']=True
positive_df.set_index(['modifier','head'],inplace=True)
positive_df.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 25920 entries, (a_n, peaceful_n) to (xxviii_n, olympiad_n)
Columns: 600 entries, 0_x to 299_y
dtypes: float64(600)
memory usage: 118.9+ MB


In [9]:
head_cols=positive_df.columns[:600//2]
positive_heads=positive_df[head_cols]
positive_heads.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 25920 entries, (a_n, peaceful_n) to (xxviii_n, olympiad_n)
Columns: 300 entries, 0_x to 299_x
dtypes: float64(300)
memory usage: 59.5+ MB


In [10]:
modifier_cols=positive_df.columns[600//2:]
positive_modifiers=positive_df[modifier_cols]
positive_modifiers.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 25920 entries, (a_n, peaceful_n) to (xxviii_n, olympiad_n)
Columns: 300 entries, 0_y to 299_y
dtypes: float64(300)
memory usage: 59.5+ MB


In [11]:
positive_heads_tensor = torch.tensor(positive_heads.values)
positive_heads_tensor.shape

torch.Size([25920, 300])

In [12]:
positive_modifiers_tensor = torch.tensor(positive_modifiers.values)
positive_modifiers_tensor.shape

torch.Size([25920, 300])

In [13]:
positive_Y=torch.ones(positive_modifiers_tensor.shape[0])
positive_Y.shape

torch.Size([25920])

In [14]:
positive_class=torch.cat((positive_modifiers_tensor, positive_heads_tensor), 1)
positive_class.shape

torch.Size([25920, 600])

In [15]:
def neg_df_creator(file):
    pkl_file=pkl.load( open(file,'rb'))
    df=pd.DataFrame(pkl_file)
    
    df.columns=['modifier','head']
    
    negative_df=pd.merge(df,heads.reset_index(),on=["head"])
    negative_df=pd.merge(negative_df,modifiers.reset_index(),on=["modifier"])

    negative_df.set_index(['modifier','head'],inplace=True)
    shape_neg=negative_df.shape[1]
    head_cols=negative_df.columns[:shape_neg]
    negative_heads=negative_df[head_cols]
    
    modifier_cols=negative_df.columns[shape_neg:]
    negative_modifiers=negative_df[modifier_cols]
    
    negative_heads_tensor = torch.tensor(negative_heads.values)
    negative_modifiers_tensor = torch.tensor(negative_modifiers.values)

    negative_Y=torch.zeros(negative_modifiers_tensor.shape[0])
    negative_class=torch.cat((negative_modifiers_tensor, negative_heads_tensor), 1)

    return negative_class,negative_Y

In [16]:
def tensor_joiner(files):
    tensor_list=[]
    for file in files:
        negative_class,negative_Y=neg_df_creator(file)
        X=torch.cat((positive_class, negative_class), 0)
        Y=torch.cat((positive_Y,negative_Y),0)
        tensor_list.append([X,Y])
    
    return tensor_list

In [17]:
corrupt_modifier_files=[]
for file in glob.glob("/data/dharp/compounding/datasets/corrupt_modifier*"):
    corrupt_modifier_files.append(file)
corrupt_modifiers=tensor_joiner(corrupt_modifier_files)

In [18]:
corrupt_head_files=[]
for file in glob.glob("/data/dharp/compounding/datasets/corrupt_head*"):
    corrupt_head_files.append(file)
corrupt_heads=tensor_joiner(corrupt_head_files)

In [20]:
corrupt_heads[0][0].shape

torch.Size([51836, 600])

In [27]:
input_size = positive_class.shape[1]
hidden_size = 300
num_classes = 2
num_epochs = 50
batch_size = 100
learning_rate = 0.001

In [28]:
class NeuralNet(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super(NeuralNet, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size) 
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, num_classes,bias=False)  
    
    def forward(self, x):
        out = self.fc1(x)
        out = self.relu(out)
        out = self.fc2(out)
        return out

model = NeuralNet(input_size, hidden_size, num_classes).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate) 

In [29]:
def looper(datasets):
    total_accuracy=[]
    
    for dataset in datasets:
        X=dataset[0]
        Y=dataset[1]
        n = len(X)  # how many total elements you have
        n_test = int( n * .2 )  # number of test/val elements
        n_train = n - n_test
        idx = list(range(n))  # indices to all elements
        random.shuffle(idx)  # in-place shuffle the indices to facilitate random splitting
        train_idx = idx[:n_train]
        test_idx = idx[n_train:]
        trX=X[train_idx].float().to(device)
        teX=X[test_idx].float().to(device)

        trY=Y[train_idx].long().to(device)
        teY=Y[test_idx].long().to(device)
        model = NeuralNet(input_size, hidden_size, num_classes).to(device)
        criterion = nn.CrossEntropyLoss()
        optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
        n_examples=trX.shape[0]
        for i in range(num_epochs):

            cost = 0.

            num_batches = n_examples // batch_size
            for k in range(num_batches):
                start, end = k * batch_size, (k + 1) * batch_size
                outputs = model(trX[start:end])
                loss = criterion(outputs, trY[start:end])

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            #if (k+1) % 100 == 0:
            #print ('Epoch [{}/{}], Loss: {:.4f}'.format(i+1, num_epochs, loss.item()))

        with torch.no_grad():
            correct=0
            total=0
            outputs = model(teX)
            _, predicted = torch.max(outputs.data, 1)
            total += teY.size(0)
            correct += (predicted == teY).sum().item()
        curr_acc=100 * correct / total
        print(curr_acc)
        total_accuracy.append(curr_acc)
    return total_accuracy
        

In [30]:
cor_head_acc=looper(corrupt_heads)

72.63432043985723
72.63432043985723
72.91405421047554
71.0909617054114
73.31918587826758
72.54750651104466
71.65042924664802
72.43175460596122
71.66972123082859
72.3063567087875


65.49628629304524
64.41593517893315
64.70531494164175
65.39982637214237
65.63133018230926
65.19726053824635
65.6699141506704
65.06221664898234
65.61203819812867
66.13292177100415

In [31]:
round(np.mean(cor_head_acc),2)

72.32

In [32]:
round(np.std(cor_head_acc),2)

0.63

In [33]:
cor_mod_acc=looper(corrupt_modifiers)

75.02943485086342
74.83782189895813
74.68317123489537
74.4304791830322
74.01489903940404
74.49349124009005
75.06376299784186
73.13432835820896
74.12964597430617
74.94354442808051


63.039638932496075
63.99646156870454
62.59946949602122
63.31500392772977
63.271907469123704
63.71733385533914
62.18363743378458
59.72113118617439
62.94008041580857
62.98478154148257

In [34]:
round(np.mean(cor_mod_acc),2)

74.48

In [35]:
round(np.std(cor_mod_acc),2)

0.56