Constructing Features from Scratch

In [30]:
#import libraries

import os
import sys
import numpy as np
import torchtext
import pandas as pd
import re
import requests
import pickle
import yake
import nltk
from nltk.corpus import wordnet as wn
nltk.download('popular')

from lib import data_utils, preprocess

#root of the project: the directory the notebook is run from
src_path = os.getcwd()

#annotated corpora live under <root>/Data/
datapath = f'{src_path}/Data/'
PLcopath = f'{datapath}AnnotatedPLData/PLCoref'
PLpath = f'{datapath}AnnotatedPLData/PLTexts'
CENcopath = f'{datapath}AnnotatedCENData/CENCoref'
CENpath = f'{datapath}AnnotatedCENData/CENTexts'
ONpath = f'{datapath}AnnotatedONData/ONCoref'

#intermediate feature files live under <root>/output/
PLInter = f'{src_path}/output/IntermediateFilesPL/'
CENInter = f'{src_path}/output/IntermediateFilesCEN/'
ONInter = f'{src_path}/output/IntermediateFilesON/'

#import data

#substrings removed from each joined regex, applied in this order
SS_STRIP = ('?|', '!|', '.|', '(|', '(', ')')
NER_STRIP = ('(', ')', '[', ']')
DP_TP_STRIP = ('?|', '!|', '.|', '(|', ')|', '(', ')')

#seconds to wait for a ConceptNet response before the request raises
CN_TIMEOUT = 30

#synsets used by the wordnet feature (loop-invariant, so looked up once)
WN_PERSON = wn.synset('person.n.01')
#stand-in for words without any synset: unrelated to person
WN_FALLBACK = wn.synset('strong.a.01')


def _strip_article(terms):
    """Remove a leading 'The ' / 'A ' (case-insensitive) from every term."""
    return [re.sub('^(The |A )', '', term, flags=re.IGNORECASE) for term in terms]


def _match_flag(heads, terms, strip_tokens):
    """Return a 0/1 Series flagging the heads that contain any of `terms`.

    The terms are joined with '|' into one regular expression, every
    substring in `strip_tokens` is removed from it (in order), and the
    result is searched for in each element of `heads`.

    When the resulting pattern is empty, every row is flagged 0: an empty
    regex matches every string, which would otherwise mark all rows as 1.

    NOTE(review): the terms are not regex-escaped, so any remaining
    metacharacter ('.', '*', '+', ...) is interpreted as regex syntax.
    """
    pattern = '|'.join(terms)
    for token in strip_tokens:
        pattern = pattern.replace(token, '')
    if not pattern:
        return pd.Series(0, index=heads.index)
    return heads.str.contains(pattern).replace({True: 1, False: 0})


def _wn_person_flag(word):
    """Return 1 when the lowest common hypernym of the word's first synset
    and 'person.n.01' is 'person.n.01' itself, else 0.

    Words without any synset fall back to WN_FALLBACK.
    """
    synsets = wn.synsets(word)
    first = synsets[0] if synsets else WN_FALLBACK
    common = first.lowest_common_hypernyms(WN_PERSON)
    return int(bool(common) and common[0] == WN_PERSON)


def _cn_person_flag(head, cache):
    """Return 1 when any ConceptNet edge id for `head` contains 'person'.

    Results are memoised in `cache` (head -> flag) so that a head which
    occurs in several chains or stories is requested only once.
    """
    if head not in cache:
        response = requests.get('https://api.conceptnet.io/c/en/' + head,
                                timeout=CN_TIMEOUT)
        #a payload without an 'edges' key is treated as "no edges"
        edges = response.json().get('edges', [])
        cache[head] = int(any('person' in edge['@id'] for edge in edges))
    return cache[head]


#ConceptNet answers already fetched, shared across all stories
cn_cache = {}

#one frame per story; concatenated once after the loop instead of growing
#PL_data inside it
story_frames = []

#append all PL texts and features into one dataframe
for n in range(1,47):
# for n in range(1,2):

    print(n)

    #get story path
    storycopath = PLcopath +'/story' + str(n) + '.txt'
    storypath = PLpath + '/story' + str(n) + '.txt'
    storyid = 'story'+ str(n)

    #read in story
    corpus = data_utils.read_story(storycopath)
    heads = corpus['head_of_head']

    #binary flag for ss feat (leading The/A's removed from the semantic list)
    sslist = _strip_article(preprocess.semantic_subj(storypath))
    corpus['SS'] = _match_flag(heads, sslist, SS_STRIP)

    #binary flag for ner feat
    nerlist = preprocess.ner_person(storypath)
    corpus['NER'] = _match_flag(heads, nerlist, NER_STRIP)

    #binary flag for wn feat: is the head of chain related to person
    corpus['WN'] = heads.map(_wn_person_flag)

    #binary flag for dp feat; sorted so the joined pattern does not depend
    #on set iteration order
    dplist = sorted(set(preprocess.dep_link(storypath)))
    corpus['DP'] = _match_flag(heads, dplist, DP_TP_STRIP)

    #binary flag for triple feat (leading The/A's removed)
    tplist = _strip_article(sorted(set(preprocess.triple(storypath))))
    corpus['TP'] = _match_flag(heads, tplist, DP_TP_STRIP)

    #binary flag for conceptnet feat
    corpus['CN'] = heads.map(lambda head: _cn_person_flag(head, cn_cache))

    #create feature for freq of head of chain term in text
    tf_dict = preprocess.term_freq(storypath)
    corpus['TF'] = heads.map(tf_dict)

    #create feature that contains keyword extraction score from yake
    with open(storypath, 'r', encoding='ISO-8859-1') as story_file:
        text = story_file.read()

    kw_extractor = yake.KeywordExtractor()
    keywords = dict(kw_extractor.extract_keywords(text))
    #heads that yake did not extract as a keyword get the score 100
    corpus['YK_SC'] = heads.map(keywords).fillna(100)

    story_frames.append(corpus)

#the empty frame comes first so the base columns keep this order
PL_data = pd.concat(
    [pd.DataFrame(columns=['corpusID', 'character', 'animacy', 'coref_chain', 'chain_head', 'head_of_head', 'chain_len', 'CL'])]
    + story_frames,
    ignore_index=True,
)

PL_data

[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to
[nltk_data]    |     /home/eileen/nltk_data...
[nltk_data]    |   Package cmudict is already up-to-date!
[nltk_data]    | Downloading package gazetteers to
[nltk_data]    |     /home/eileen/nltk_data...
[nltk_data]    |   Package gazetteers is already up-to-date!
[nltk_data]    | Downloading package genesis to
[nltk_data]    |     /home/eileen/nltk_data...
[nltk_data]    |   Package genesis is already up-to-date!
[nltk_data]    | Downloading package gutenberg to
[nltk_data]    |     /home/eileen/nltk_data...
[nltk_data]    |   Package gutenberg is already up-to-date!
[nltk_data]    | Downloading package inaugural to
[nltk_data]    |     /home/eileen/nltk_data...
[nltk_data]    |   Package inaugural is already up-to-date!
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]    |     /home/eileen/nltk_data...
[nltk_data]    |   Package movie_reviews is already up-to

1


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  wn_input = corpus['head_of_head'].apply(lambda word: pd.Series(wn.synsets(word)))
  test = wn_input[0].apply(lambda syn: pd.Series(syn.lowest_common_hype

2


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Importing Features from Intermediate File

In [None]:
#import libraries

import os
import re
import sys

import numpy as np
import pandas as pd
import torchtext
import yake

from lib import data_utils, preprocess

#get path
src_path = os.getcwd()

#annotated corpora: *copath = coreference annotations, *path = story texts
datapath = src_path + '/Data/'
PLcopath = datapath + 'AnnotatedPLData/PLCoref'
PLpath = datapath + 'AnnotatedPLData/PLTexts'
CENcopath = datapath + 'AnnotatedCENData/CENCoref'
#was 'CENCoref' (same directory as CENcopath); the texts are in CENTexts
CENpath = datapath + 'AnnotatedCENData/CENTexts'
ONpath = datapath + 'AnnotatedONData/ONCoref'

#intermediate feature files
PLInter = src_path + '/output/IntermediateFilesPL/'
CENInter = src_path + '/output/IntermediateFilesCEN/'
ONInter = src_path + '/output/IntermediateFilesON/'

#import data

#features stored one value per line in
#PLInter/<name>FeatureBoolean/Story<n>.txt
features = ["CN", "Dep", "NER", "SS", "Triple", "WN"]

#one frame per story; concatenated once after the loop instead of growing
#PL_Int_data inside it
story_frames = []

#append all PL texts and features into one dataframe
for n in range(1,47):
# for n in range(1,2):

    print(n)

    #get story path
    storycopath = PLcopath +'/story' + str(n) + '.txt'
    storypath = PLpath + '/story' + str(n) + '.txt'
    storyid = 'story'+ str(n)

    #read in story
    corpus = data_utils.read_story(storycopath)

    #read in from intermediate files
    for feat_name in features:
        feat_path = PLInter + feat_name + 'FeatureBoolean'+'/Story' + str(n) + '.txt'
        with open(feat_path, 'r') as doc:
            # NOTE(review): eval() executes whatever each line contains. If the
            # files only hold plain literals, ast.literal_eval would be the
            # safer choice - confirm the file format before switching.
            corpus[feat_name] = [eval(line.rstrip()) for line in doc]

    #create feature for term freq
    tf_dict = preprocess.term_freq(storypath)
    corpus['TF'] = corpus['head_of_head'].map(tf_dict)

    #create feature that contains keyword extraction score from yake
    with open(storypath, 'r', encoding='ISO-8859-1') as story_file:
        text = story_file.read()

    kw_extractor = yake.KeywordExtractor()
    keywords = dict(kw_extractor.extract_keywords(text))
    #heads that yake did not extract as a keyword get the score 100
    corpus['YK_SC'] = corpus['head_of_head'].map(keywords).fillna(100)

    story_frames.append(corpus)

#the empty frame comes first so the base columns keep this order
PL_Int_data = pd.concat(
    [pd.DataFrame(columns=['corpusID', 'character', 'animacy', 'coref_chain', 'chain_head', 'head_of_head', 'chain_len', 'CL'])]
    + story_frames,
    ignore_index=True,
)
PL_Int_data

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46


Unnamed: 0,corpusID,character,animacy,coref_chain,chain_head,head_of_head,chain_len,CL,CN,Dep,NER,SS,Triple,WN,TF,YK_SC
0,story1,1,1,"[ dragon , he , he , the dragon , He , hi...",dragon,dragon,43,2.499540,1.0,1.0,0.0,1.0,1.0,0.0,0.040000,0.015363
1,story1,1,1,"[ princess , the tsar 's daughter , her , h...",princess,princess,23,0.990763,1.0,1.0,0.0,1.0,0.0,1.0,0.014118,0.063038
2,story1,1,1,"[ tsar , tsar , father , tsar , her father...",tsar,tsar,9,-0.065380,1.0,1.0,0.0,1.0,1.0,1.0,0.011765,0.088675
3,story1,0,1,"[ princess' dog , a little dog that had follo...",princess' dog,dog,4,-0.442574,1.0,1.0,0.0,1.0,1.0,0.0,0.007059,0.173527
4,story1,1,1,"[ tsarina , mother , tsarina , tsarina ]",tsarina,tsarina,4,-0.442574,1.0,1.0,0.0,1.0,0.0,1.0,0.004706,100.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1907,story46,0,1,"[ the entire enemy army , the enemy army , h...",the entire enemy army,army,7,-0.308434,0.0,1.0,0.0,0.0,0.0,0.0,0.007293,0.056700
1908,story46,0,1,"[ this , the best solution ]",this,this,2,-0.604613,0.0,0.0,0.0,1.0,0.0,0.0,0.000000,100.000000
1909,story46,0,1,"[ a horse , a horse ]",a horse,horse,2,-0.604613,0.0,0.0,0.0,0.0,1.0,0.0,0.004862,100.000000
1910,story46,0,1,"[ an even better horse , his horse , his hor...",an even better horse,horse,4,-0.486142,0.0,0.0,0.0,0.0,1.0,0.0,0.004862,100.000000


Model

In [28]:
# simple model with their features: the intermediate-file features in PL_Int_data, plus animacy

from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_validate
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
import pickle
from joblib import dump, load

#target: the annotated 'character' label as int
data_y = PL_Int_data["character"].astype('int')
#predictors: intermediate-file features plus the animacy annotation
data_x = PL_Int_data[["CL", "CN", "Dep", "NER", "SS", "Triple", "WN", "animacy"]]

#hold out a test split; only the training part is used in this cell
X_train, X_test, y_train, y_test = train_test_split(
    data_x,
    data_y,
    random_state=1,
)

#RBF-kernel SVM fitted on the training split
rbf_svc = svm.SVC(C=.5, gamma=1, kernel='rbf').fit(X_train, y_train)

#write the fitted model to disk and read it back
dump(rbf_svc, "models/PLjahan_model.joblib")
rbf_svc = load('models/PLjahan_model.joblib')

#10-fold cross-validation on the training split
cv_results = cross_validate(
    rbf_svc,
    X_train,
    y_train,
    cv=10,
    scoring=('f1', 'accuracy'),
    return_train_score=True,
)

# per-fold scores are in cv_results under 'train_accuracy', 'test_accuracy',
# 'train_f1' and 'test_f1'
test_acc = cv_results['test_accuracy']
test_f1 = cv_results['test_f1']

print("%0.2f accuracy with a standard deviation of %0.2f" % (test_acc.mean(), test_acc.std()))
print("%0.2f f1 score with a standard deviation of %0.2f" % (test_f1.mean(), test_f1.std()))

# hyper-parameter search, kept for reference:
# cv = StratifiedShuffleSplit(n_splits=10, test_size=.2, random_state=42)
# C_range = np.logspace(-2,10,13)
# gamma_range = np.logspace(-9,3,13)
# grid = GridSearchCV(svm.SVC(kernel='rbf'), param_grid=dict(gamma = gamma_range, C= C_range), cv=cv)
# grid.fit(X_train,y_train)
# print(grid.best_params_, grid.best_score_)



0.90 accuracy with a standard deviation of 0.03
0.81 f1 score with a standard deviation of 0.06


In [29]:
# simple model with their features + yake score (note: term freq 'TF' is not in data_x below, and 'YK_SC' takes the place of 'animacy')

from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_validate
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
import pickle
from joblib import dump, load

#target: the annotated 'character' label as int
data_y = PL_Int_data["character"].astype('int')
#predictors: intermediate-file features plus the yake keyword score
data_x = PL_Int_data[["CL", "CN", "Dep", "NER", "SS", "Triple", "WN", "YK_SC"]]

#hold out a test split; only the training part is used in this cell
X_train, X_test, y_train, y_test = train_test_split(
    data_x,
    data_y,
    random_state=1,
)

#RBF-kernel SVM fitted on the training split
rbf_svc_tf = svm.SVC(C=.5, gamma=1, kernel='rbf').fit(X_train, y_train)

#write the fitted model to disk and read it back
dump(rbf_svc_tf, "models/PLjahan_model_tf.joblib")
rbf_svc_tf = load('models/PLjahan_model_tf.joblib')

#10-fold cross-validation on the training split
cv_results = cross_validate(
    rbf_svc_tf,
    X_train,
    y_train,
    cv=10,
    scoring=('f1', 'accuracy', 'f1_macro'),
    return_train_score=True,
)

# per-fold scores are in cv_results under 'train_accuracy', 'test_accuracy',
# 'train_f1' and 'test_f1'
test_acc = cv_results['test_accuracy']
test_f1 = cv_results['test_f1']

print("%0.2f accuracy with a standard deviation of %0.2f" % (test_acc.mean(), test_acc.std()))
print("%0.2f f1 score for character class with a standard deviation of %0.2f" % (test_f1.mean(), test_f1.std()))
# print("%0.2f f1 score with a standard deviation of %0.2f" % (cv_results['test_f1_macro'].mean(), cv_results['test_f1'].std()))
# print("%0.2f f1 score for non character class" % (2*cv_results['test_f1_macro'].mean() - cv_results['test_f1'].mean()))

0.90 accuracy with a standard deviation of 0.03
0.80 f1 score for character class with a standard deviation of 0.06


In [27]:
# simple model with the from-scratch features (PL_data, reloaded from Data/PL.csv) + animacy

from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_validate
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
import pickle
from joblib import dump, load

#reload the from-scratch features from disk
# NOTE(review): Data/PL.csv is written by the last cell of this notebook, so
# this read replaces the in-memory PL_data with whatever that file holds from
# an earlier run - confirm that is intended.
PL_data = pd.read_csv('Data/PL.csv')

#target: the annotated 'character' label as int
data_y = PL_data["character"].astype('int')
#predictors: from-scratch features plus the animacy annotation
data_x = PL_data[["CL", "CN", "DP", "NER", "SS", "TP", "WN", "animacy"]]

#hold out a test split; only the training part is used in this cell
X_train, X_test, y_train, y_test = train_test_split(
    data_x,
    data_y,
    random_state=1,
)

#RBF-kernel SVM fitted on the training split
rbf_svc_1 = svm.SVC(C=.5, gamma=1, kernel='rbf').fit(X_train, y_train)

#write the fitted model to disk and read it back
dump(rbf_svc_1, "models/PLallen_model.joblib")
rbf_svc_1 = load('models/PLallen_model.joblib')

#10-fold cross-validation on the training split
cv_results = cross_validate(
    rbf_svc_1,
    X_train,
    y_train,
    cv=10,
    scoring=('f1', 'accuracy', 'f1_macro'),
    return_train_score=True,
)

# per-fold scores are in cv_results under 'train_accuracy', 'test_accuracy',
# 'train_f1', 'test_f1' and 'test_f1_macro'
test_acc = cv_results['test_accuracy']
test_f1 = cv_results['test_f1']

print("%0.2f accuracy with a standard deviation of %0.2f" % (test_acc.mean(), test_acc.std()))
print("%0.2f f1 score for character class with a standard deviation of %0.2f" % (test_f1.mean(), test_f1.std()))
# print("%0.2f f1 score with a standard deviation of %0.2f" % (cv_results['test_f1_macro'].mean(), cv_results['test_f1'].std()))
# print("%0.2f f1 score for non character class" % (2*cv_results['test_f1_macro'].mean() - cv_results['test_f1'].mean()))

KeyError: "['YK_SC'] not in index"

In [25]:
# simple model with the from-scratch features (PL_data) + term freq

from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_validate
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

#target: the annotated 'character' label as int
data_y = PL_data["character"].astype('int')
#predictors: from-scratch features plus the term frequency of the chain head
data_x = PL_data[["CL", "CN", "DP", "NER", "SS", "TP", "WN", "TF"]]

#hold out a test split; only the training part is used in this cell
X_train, X_test, y_train, y_test = train_test_split(
    data_x,
    data_y,
    random_state=1,
)

#RBF-kernel SVM fitted on the training split
rbf_svc_1tf = svm.SVC(C=.5, gamma=1, kernel='rbf').fit(X_train, y_train)

#write the fitted model to disk and read it back
#(dump/load are imported from joblib in the earlier model cells)
dump(rbf_svc_1tf, "models/PLallen_model_tf.joblib")
rbf_svc_1tf = load('models/PLallen_model_tf.joblib')

#10-fold cross-validation on the training split
cv_results = cross_validate(
    rbf_svc_1tf,
    X_train,
    y_train,
    cv=10,
    scoring=('f1', 'accuracy'),
    return_train_score=True,
)

# per-fold scores are in cv_results under 'train_accuracy', 'test_accuracy',
# 'train_f1' and 'test_f1'
test_acc = cv_results['test_accuracy']
test_f1 = cv_results['test_f1']

print("%0.2f accuracy with a standard deviation of %0.2f" % (test_acc.mean(), test_acc.std()))
print("%0.2f f1 score with a standard deviation of %0.2f" % (test_f1.mean(), test_f1.std()))

KeyError: "['CN', 'DP', 'NER', 'SS', 'TP', 'WN', 'TF'] not in index"

In [None]:
#persist both feature tables so the model cells can reload them
#index=False: the row index carries no information, and writing it makes
#pd.read_csv return an extra 'Unnamed: 0' column
PL_data.to_csv("Data/PL.csv", index=False)
PL_Int_data.to_csv("Data/PLInter.csv", index=False)