In [1]:
import numpy as np
import pandas as pd
pd.set_option('display.float_format', lambda x: '%.3f' % x)
pd.options.mode.chained_assignment = None
from numpy import loadtxt
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
%matplotlib inline
import matplotlib
#matplotlib.use('agg')
matplotlib.style.use('ggplot')
import pickle as pkl
from matplotlib import pyplot as plt
from collections import Counter
from functools import reduce
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
import random
random.seed(1991)
import glob

In [2]:
# Load the novel (attested) compounds: a header-less, whitespace-separated file
# with one "modifier head" pair per line.
# `sep=r"\s+"` is the documented replacement for the deprecated
# `delim_whitespace=True` and parses the file identically.
novel_compounds = pd.read_csv(
    "/data/dharp/compounding/datasets/novel_compounds.txt",
    sep=r"\s+",
    header=None,
)
novel_compounds.columns = ['modifier', 'head']

# Distinct constituents; used below to restrict the feature table to the
# modifiers/heads that actually occur in the novel compounds.
modifiers_list = novel_compounds['modifier'].unique()
heads_list = novel_compounds['head'].unique()

In [3]:
# Read the tab-separated (modifier, head) feature table and display it.
dfm_non_dec = pd.read_csv(
    "/data/dharp/compounding/datasets/DFM_Non_Contextual_Non_Temporal.csv",
    sep='\t',
)
dfm_non_dec

Unnamed: 0,modifier,head,sim_bw_constituents,sim_with_head,sim_with_modifier,log_ratio,ppmi,local_mi
0,a_n,a_n,0.000,0.002,0.002,0.000,0.000,0.001
1,a_n,aa_n,0.000,0.991,0.000,0.000,0.198,0.001
2,a_n,aaa_n,0.000,0.837,0.001,0.000,0.217,0.001
3,a_n,aaaaa_n,0.002,1.000,0.002,0.000,0.406,0.001
4,a_n,adam_n,0.000,0.002,0.004,0.000,0.080,0.001
5,a_n,addison_n,0.000,0.901,0.000,0.000,0.232,0.001
6,a_n,administration_n,0.000,0.000,0.000,0.000,0.000,0.001
7,a_n,albert_n,0.000,0.000,0.001,0.000,0.000,0.001
8,a_n,american_n,0.000,0.010,0.002,0.000,0.028,0.001
9,a_n,anti_n,0.000,0.017,0.009,0.000,0.000,0.001


In [4]:
# Per-modifier aggregate features.
# Keep only rows whose modifier occurs in the novel compounds, and drop the
# head-specific columns so that only modifier-side information is aggregated.
modifiers = (
    dfm_non_dec
    .loc[dfm_non_dec.modifier.isin(modifiers_list)]
    .drop(['head', 'sim_with_head'], axis=1)
)

# Aggregate every remaining column per modifier.
# A list (not a set) is passed on purpose: a set has no guaranteed iteration
# order, so the resulting column order -- and with it the order of the model's
# feature columns -- could differ from one interpreter run to the next.
modifier_features = modifiers.groupby(['modifier']).agg(['std', 'mean'])

# Flatten the (feature, statistic) column MultiIndex into flat names of the
# form "mod_<feature>_<statistic>", e.g. "mod_ppmi_mean".
modifier_features.columns = [
    "mod_" + str(feature) + "_" + statistic
    for feature, statistic in modifier_features.columns
]

# Replace missing aggregates (e.g. the std of a single-row group) with the
# column mean, then turn the 'modifier' index back into a regular column so
# the frame can be merged on it.
modifier_features = modifier_features.fillna(modifier_features.mean()).reset_index()
modifier_features

Unnamed: 0,modifier,mod_sim_bw_constituents_std,mod_sim_bw_constituents_mean,mod_sim_with_modifier_std,mod_sim_with_modifier_mean,mod_log_ratio_std,mod_log_ratio_mean,mod_ppmi_std,mod_ppmi_mean,mod_local_mi_std,mod_local_mi_mean
0,a_n,0.118,0.023,0.075,0.018,0.000,0.000,0.117,0.112,0.000,0.001
1,aaa_n,0.187,0.043,0.238,0.089,0.000,0.000,0.148,0.182,0.000,0.001
2,aarhus_n,0.091,0.052,0.407,0.281,0.000,0.000,0.231,0.332,0.000,0.001
3,abbreviated_n,0.003,0.001,0.006,0.005,0.000,0.000,0.182,0.331,0.000,0.001
4,abduction_n,0.050,0.020,0.075,0.099,0.000,0.000,0.150,0.339,0.000,0.001
5,abecedarian_n,0.001,0.001,0.690,0.509,0.000,0.000,0.004,0.364,0.000,0.001
6,ability_n,0.145,0.062,0.120,0.066,0.000,0.000,0.108,0.142,0.000,0.001
7,abolition_n,0.062,0.036,0.052,0.034,0.000,0.000,0.121,0.174,0.000,0.001
8,aboriginal_n,0.034,0.012,0.051,0.048,0.000,0.000,0.118,0.162,0.000,0.001
9,abortion_n,0.278,0.178,0.188,0.134,0.000,0.000,0.114,0.131,0.000,0.001


In [5]:
# Per-head aggregate features.
# Keep only rows whose head occurs in the novel compounds, and drop the
# modifier-specific columns so that only head-side information is aggregated.
heads = (
    dfm_non_dec
    .loc[dfm_non_dec['head'].isin(heads_list)]
    .drop(['modifier', 'sim_with_modifier'], axis=1)
)

# Aggregate every remaining column per head.
# A list (not a set) is passed on purpose: a set has no guaranteed iteration
# order, so the resulting column order -- and with it the order of the model's
# feature columns -- could differ from one interpreter run to the next.
head_features = heads.groupby(['head']).agg(['std', 'mean'])

# Flatten the (feature, statistic) column MultiIndex into flat names of the
# form "head_<feature>_<statistic>", e.g. "head_ppmi_mean".
head_features.columns = [
    "head_" + str(feature) + "_" + statistic
    for feature, statistic in head_features.columns
]

# Replace missing aggregates (e.g. the std of a single-row group) with the
# column mean, then turn the 'head' index back into a regular column so the
# frame can be merged on it.
head_features = head_features.fillna(head_features.mean()).reset_index()
head_features

Unnamed: 0,head,head_sim_bw_constituents_std,head_sim_bw_constituents_mean,head_sim_with_head_std,head_sim_with_head_mean,head_log_ratio_std,head_log_ratio_mean,head_ppmi_std,head_ppmi_mean,head_local_mi_std,head_local_mi_mean
0,a_n,0.211,0.079,0.117,0.041,0.000,0.000,0.102,0.087,0.000,0.001
1,aaron_n,0.064,0.030,0.203,0.164,0.000,0.000,0.133,0.281,0.000,0.001
2,ab_n,0.262,0.214,0.256,0.218,0.000,0.000,0.110,0.156,0.000,0.001
3,abandonment_n,0.051,0.018,0.115,0.058,0.000,0.000,0.140,0.272,0.000,0.001
4,abbreviation_n,0.175,0.072,0.219,0.148,0.000,0.000,0.094,0.252,0.000,0.001
5,abc_n,0.167,0.087,0.241,0.141,0.000,0.000,0.151,0.172,0.000,0.001
6,abduction_n,0.043,0.015,0.244,0.169,0.000,0.000,0.091,0.299,0.000,0.001
7,abdul_n,0.069,0.040,0.247,0.235,0.000,0.000,0.147,0.268,0.000,0.001
8,abdullah_n,0.080,0.043,0.109,0.096,0.000,0.000,0.171,0.212,0.000,0.001
9,ability_n,0.104,0.033,0.091,0.047,0.000,0.000,0.122,0.169,0.000,0.001


## Positive Set

In [6]:
# Positive examples: every novel compound joined with the aggregate features
# of its head and of its modifier (inner joins, so compounds without features
# on either side are dropped), labelled Plausibility=True.
positive_df = (
    novel_compounds
    .merge(head_features, on=["head"])
    .merge(modifier_features, on=["modifier"])
    .assign(Plausibility=True)
)
positive_df

Unnamed: 0,modifier,head,head_sim_bw_constituents_std,head_sim_bw_constituents_mean,head_sim_with_head_std,head_sim_with_head_mean,head_log_ratio_std,head_log_ratio_mean,head_ppmi_std,head_ppmi_mean,...,mod_sim_bw_constituents_mean,mod_sim_with_modifier_std,mod_sim_with_modifier_mean,mod_log_ratio_std,mod_log_ratio_mean,mod_ppmi_std,mod_ppmi_mean,mod_local_mi_std,mod_local_mi_mean,Plausibility
0,a_n,peaceful_n,0.008,0.005,0.118,0.118,0.000,0.000,0.253,0.390,...,0.023,0.075,0.018,0.000,0.000,0.117,0.112,0.000,0.001,True
1,a_n,robinson_n,0.073,0.035,0.143,0.082,0.000,0.000,0.130,0.223,...,0.023,0.075,0.018,0.000,0.000,0.117,0.112,0.000,0.001,True
2,a_n,supply_n,0.153,0.077,0.148,0.093,0.001,0.000,0.075,0.046,...,0.023,0.075,0.018,0.000,0.000,0.117,0.112,0.000,0.001,True
3,india_n,peaceful_n,0.008,0.005,0.118,0.118,0.000,0.000,0.253,0.390,...,0.054,0.150,0.127,0.001,0.000,0.088,0.055,0.001,0.001,True
4,india_n,standard_n,0.136,0.063,0.114,0.062,0.000,0.000,0.088,0.066,...,0.054,0.150,0.127,0.001,0.000,0.088,0.055,0.001,0.001,True
5,india_n,award_n,0.135,0.043,0.096,0.054,0.000,0.000,0.135,0.154,...,0.054,0.150,0.127,0.001,0.000,0.088,0.055,0.001,0.001,True
6,india_n,joint_n,0.111,0.052,0.184,0.138,0.000,0.000,0.130,0.129,...,0.054,0.150,0.127,0.001,0.000,0.088,0.055,0.001,0.001,True
7,india_n,companion_n,0.233,0.122,0.255,0.141,0.000,0.000,0.137,0.166,...,0.054,0.150,0.127,0.001,0.000,0.088,0.055,0.001,0.001,True
8,india_n,foundation_n,0.092,0.024,0.073,0.029,0.000,0.000,0.111,0.104,...,0.054,0.150,0.127,0.001,0.000,0.088,0.055,0.001,0.001,True
9,india_n,gentleman_n,0.176,0.089,0.213,0.144,0.000,0.000,0.132,0.153,...,0.054,0.150,0.127,0.001,0.000,0.088,0.055,0.001,0.001,True


## Negative Sets

In [7]:
def neg_df_creator(file):
    """Build the labelled negative-example frame for one pickled sample file.

    Parameters
    ----------
    file : str
        Path to a pickle holding (modifier, head) pairs, i.e. data that
        ``pd.DataFrame`` turns into exactly two columns.

    Returns
    -------
    pandas.DataFrame
        The pairs inner-joined with ``head_features`` (on ``head``) and
        ``modifier_features`` (on ``modifier``), plus a ``Plausibility``
        column set to ``False``. Pairs lacking features on either side are
        dropped by the inner joins.
    """
    # NOTE(security): unpickling executes arbitrary code embedded in the file;
    # only load sample files that come from a trusted source.
    # The context manager guarantees the handle is closed even if loading fails.
    with open(file, 'rb') as handle:
        pairs = pkl.load(handle)

    df = pd.DataFrame(pairs)
    df.columns = ['modifier', 'head']

    negative_df = pd.merge(df, head_features, on=["head"])
    negative_df = pd.merge(negative_df, modifier_features, on=["modifier"])
    negative_df['Plausibility'] = False
    return negative_df

In [8]:
def df_joiner(files):
    """Return one full dataset per negative-sample file.

    For each path in ``files`` the negative frame produced by
    ``neg_df_creator`` is stacked on top of the shared ``positive_df``
    (negatives first, positives second). The result is a list with one
    DataFrame per input path, in input order.
    """
    return [
        pd.concat([neg_df_creator(path), positive_df])
        for path in files
    ]

In [9]:
# One dataset (negatives + positives) per corrupted-modifier sample file.
# glob returns matches in arbitrary, filesystem-dependent order; sorting keeps
# the "DF i" numbering of the evaluation runs stable across machines and runs.
corrupt_modifier_files = sorted(
    glob.glob("/data/dharp/compounding/datasets/corrupt_modifier*")
)
corrupt_modifiers = df_joiner(corrupt_modifier_files)

In [10]:
# One dataset (negatives + positives) per corrupted-head sample file.
# glob returns matches in arbitrary, filesystem-dependent order; sorting keeps
# the "DF i" numbering of the evaluation runs stable across machines and runs.
corrupt_head_files = sorted(
    glob.glob("/data/dharp/compounding/datasets/corrupt_head*")
)
corrupt_heads = df_joiner(corrupt_head_files)

In [11]:
# Train and evaluate one XGBoost classifier per corrupt-head dataset and
# collect the held-out accuracies in acc_ch.
acc_ch = []
for i, corrupt_head in enumerate(corrupt_heads):
    # The constituent strings identify a row; they are not model features.
    data = corrupt_head.drop(['modifier', 'head'], axis=1)
    # Select the label by name rather than by position so the split does not
    # silently break if the column order ever changes.
    X, Y = data.drop(columns='Plausibility'), data['Plausibility']

    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=1991)
    # fit model on training data
    # (the keyword is `n_jobs`; the former `njobs` is not an XGBClassifier
    # parameter and was not applied as the thread-count setting)
    model = XGBClassifier(n_jobs=-1)
    model.fit(X_train, y_train)
    # make predictions for test data; predict() already returns class labels,
    # so no rounding is needed
    y_pred = model.predict(X_test)
    # evaluate predictions
    print("DF", i + 1)
    accuracy = accuracy_score(y_test, y_pred)
    acc_ch.append(accuracy)
    print("Accuracy: %.2f%%" % (accuracy * 100.0))


DF 1
Accuracy: 69.05%
DF 2
Accuracy: 69.90%
DF 3
Accuracy: 69.44%
DF 4
Accuracy: 69.38%
DF 5
Accuracy: 69.68%
DF 6
Accuracy: 70.08%
DF 7
Accuracy: 69.87%
DF 8
Accuracy: 69.93%
DF 9
Accuracy: 70.26%
DF 10
Accuracy: 69.14%


In [12]:
# Mean held-out accuracy over the corrupt-head runs, in percent (2 decimals).
round(100 * np.mean(acc_ch), 2)

69.67

In [13]:
# Spread of the corrupt-head accuracies in percentage points (2 decimals);
# np.std with its default settings.
round(100 * np.std(acc_ch), 2)

0.39

In [14]:
# Train and evaluate one XGBoost classifier per corrupt-modifier dataset and
# collect the held-out accuracies in acc_cm.
acc_cm = []
for i, corrupt_modifier in enumerate(corrupt_modifiers):
    # The constituent strings identify a row; they are not model features.
    data = corrupt_modifier.drop(['modifier', 'head'], axis=1)
    # Select the label by name rather than by position so the split does not
    # silently break if the column order ever changes.
    X, Y = data.drop(columns='Plausibility'), data['Plausibility']
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=1991)
    # fit model on training data
    # (the keyword is `n_jobs`; the former `njobs` is not an XGBClassifier
    # parameter and was not applied as the thread-count setting)
    model = XGBClassifier(n_jobs=-1)
    model.fit(X_train, y_train)
    # make predictions for test data; predict() already returns class labels,
    # so no rounding is needed
    y_pred = model.predict(X_test)
    # evaluate predictions
    accuracy = accuracy_score(y_test, y_pred)
    acc_cm.append(accuracy)
    print("DF", i + 1)
    print("Accuracy: %.2f%%" % (accuracy * 100.0))

DF 1
Accuracy: 67.17%
DF 2
Accuracy: 67.50%
DF 3
Accuracy: 67.04%
DF 4
Accuracy: 67.52%
DF 5
Accuracy: 67.86%
DF 6
Accuracy: 67.63%
DF 7
Accuracy: 67.11%
DF 8
Accuracy: 67.12%
DF 9
Accuracy: 67.29%
DF 10
Accuracy: 67.05%


In [15]:
# Mean held-out accuracy over the corrupt-modifier runs, in percent (2 decimals).
round(100 * np.mean(acc_cm), 2)

67.33

In [16]:
# Spread of the corrupt-modifier accuracies in percentage points (2 decimals);
# np.std with its default settings.
round(100 * np.std(acc_cm), 2)

0.27