# Project_Draft_J_Parse_Classifier

This notebook implements a classification model on syntactic parse trees to identify actions.


## Set up Drive

In [2]:
# Authorize Colab to access Google Drive.
# drive.mount() prompts for an authorization code and then exposes the Drive
# contents under /gdrive; the data and model paths used later in this notebook
# all live below that mount point.
from google.colab import drive
drive.mount('/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /gdrive


## Imports

In [0]:
import csv
import os

import tabulate
import numpy as np

from collections import defaultdict

from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.decomposition import TruncatedSVD
#from sklearn.ensemble import RandomForestClassifier
#from sklearn.naive_bayes import MultinomialNB

import gensim
from gensim.corpora import Dictionary
from gensim.models import TfidfModel
from gensim.matutils import sparse2full

## Define Vectorizer Class

In [0]:
class GensimTfidfVectorizer(BaseEstimator, TransformerMixin):
    """
    Scikit-Learn compatible transformer that wraps a Gensim ``Dictionary``
    (the lexicon) and a Gensim ``TfidfModel``.

    Documents are expected to be sequences of string tokens. Fitting builds
    the lexicon and the TF-IDF model and persists both to ``dirpath`` so that
    a later instance pointed at the same directory can reload them.
    """

    def __init__(self, dirpath=".", tofull=False):
        """
        Pass in a directory that holds the lexicon in corpus.dict and the
        TFIDF model in tfidf.model (for now).

        Set tofull = True if the next thing is a Scikit-Learn estimator
        otherwise keep False if the next thing is a Gensim model.
        """
        # Constructor arguments must be stored under their own names:
        # BaseEstimator.get_params() (used by clone(), repr() and Pipeline)
        # looks each __init__ parameter up as an attribute of the same name.
        self.dirpath = dirpath
        self.tofull = tofull

        self._lexicon_path = os.path.join(dirpath, "corpus.dict")
        self._tfidf_path = os.path.join(dirpath, "tfidf.model")

        self.lexicon = None
        self.tfidf = None

        # Pick up previously saved models, if any exist in dirpath.
        self.load()

    def load(self):
        """Load the lexicon and/or TF-IDF model from disk when the files exist."""
        if os.path.exists(self._lexicon_path):
            self.lexicon = Dictionary.load(self._lexicon_path)

        if os.path.exists(self._tfidf_path):
            # load() is a classmethod; no throwaway TfidfModel() instance is needed.
            self.tfidf = TfidfModel.load(self._tfidf_path)

    def save(self):
        """Write the lexicon and the TF-IDF model to their paths under dirpath."""
        self.lexicon.save(self._lexicon_path)
        self.tfidf.save(self._tfidf_path)

    def fit(self, documents, labels=None):
        """
        Build the lexicon and the TF-IDF model from ``documents`` and save
        both to disk. ``labels`` is accepted for Scikit-Learn API
        compatibility and is not used. Returns ``self``.
        """
        # The documents are walked twice (lexicon, then bag-of-words), so
        # materialize them first in case a one-shot iterable was passed in.
        documents = list(documents)

        self.lexicon = Dictionary(documents)
        self.tfidf = TfidfModel(
            [self.lexicon.doc2bow(doc) for doc in documents],
            id2word=self.lexicon,
        )
        self.save()
        return self

    def transform(self, documents):
        """
        Convert each document to its TF-IDF representation.

        Returns a list with one entry per document: a dense vector with one
        slot per lexicon entry when ``tofull`` is True, otherwise the sparse
        Gensim vector.
        """
        def generator():
            for document in documents:
                vec = self.tfidf[self.lexicon.doc2bow(document)]
                if self.tofull:
                    yield sparse2full(vec, len(self.lexicon))
                else:
                    yield vec
        return list(generator())


## Define CorpusLoader Class to manage the folds for cross-validation

In [0]:
import numpy as np
from sklearn.model_selection import KFold

class CorpusLoader(object):
    """
    Splits a list of vectors and their labels into shuffled K-fold
    train/test partitions for cross-validation.

    Iterating over the loader yields one
    ``(X_train, X_test, y_train, y_test)`` tuple per fold.
    """
    def __init__(self, vectors, labels, splits=12, random_state=None):
        """
        vectors:      sequence of feature vectors, one per document
        labels:       sequence of labels, aligned with ``vectors``
        splits:       number of folds
        random_state: optional seed forwarded to KFold so the shuffled folds
                      can be reproduced; None keeps the unseeded behaviour

        Raises ValueError when vectors and labels differ in length.
        """
        self.folds = KFold(n_splits=splits, shuffle=True, random_state=random_state)
        self.X = np.asarray(vectors)
        self.y = np.asarray(labels)

        # The folds are derived from X only, so a length mismatch would
        # otherwise pair vectors with the wrong labels or fail mid-iteration.
        if len(self.X) != len(self.y):
            raise ValueError(
                "vectors and labels must have the same length "
                "({} != {})".format(len(self.X), len(self.y))
            )

    def documents(self, idx=None):
        """Return the vectors selected by ``idx`` as a list (all of them when idx is None)."""
        # Indexing a numpy array with None inserts a new axis instead of
        # selecting everything, so the default has to be handled explicitly.
        if idx is None:
            return list(self.X)
        return list(self.X[idx])

    def labels(self, idx=None):
        """Return the labels selected by ``idx`` (all of them when idx is None)."""
        if idx is None:
            return self.y
        return self.y[idx]

    def __iter__(self):
        """Yield (X_train, X_test, y_train, y_test) for every fold."""
        for train_index, test_index in self.folds.split(self.X):
            X_train = self.documents(train_index)
            y_train = self.labels(train_index)

            X_test = self.documents(test_index)
            y_test = self.labels(test_index)

            yield X_train, X_test, y_train, y_test


## Define Create_Pipeline Function

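This function is not used by the main experiments below; only the untested scratch cell at the end of the notebook calls it.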


In [0]:
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import Pipeline

def create_pipeline(estimator, reduction=False):
    """
    Assemble a Scikit-Learn Pipeline around the given estimator.

    The pipeline always starts with TF-IDF vectorization and ends with the
    estimator; when ``reduction`` is truthy a 50-component Truncated SVD
    step is placed between the two.
    """
    # The vectorizer keeps its model files in the current directory and
    # emits dense vectors (tofull=True) for the Scikit-Learn steps after it.
    vectorizer = GensimTfidfVectorizer(".", True)

    # Optional dimensionality reduction, empty when not requested.
    svd_steps = [('reduction', TruncatedSVD(n_components=50))] if reduction else []

    return Pipeline(
        [('vectorization', vectorizer)] + svd_steps + [('classifier', estimator)]
    )


## Main section

### Read in the Pickled Parses and Labels

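Each non-action parse is added to the corpus twice so that the number of non-action examples approximates the number of action examples, giving roughly balanced training data. Note that, despite the `.pkl` extension, the files are read as space-delimited text with one parse per row.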

In [0]:
# These functions are provided for normalization, but may not be needed. 
# The tf-idf vectorization may automatically take care of de-emphasizing sentence-final punctuation.

# is-punct function recognizes punctuation 
def is_punct(token):
    """Return True when the token is a LEFT_punct_ or RIGHT_punct_ label."""
    for prefix in ('LEFT_punct_', 'RIGHT_punct_'):
        if token.startswith(prefix):
            return True
    return False

# Normalizer function strips off right hand punctuation, since this occurs in almost every sentence
def normalize(sentence):
    """
    Strip a trailing punctuation token from a parsed sentence.

    sentence: sequence of parse tokens (e.g. one row of the corpus)

    Returns the sentence without its last token when that token is
    punctuation, otherwise the sentence unchanged. An empty sentence is
    returned as-is.
    """
    # Guard against empty rows, which have no last token to inspect.
    if len(sentence) > 0 and is_punct(sentence[-1]):
        return sentence[:-1]

    return sentence

In [0]:
# location of the parse files (read below as space-delimited text, one parse per row)
actions_file = '/gdrive/My Drive/Colab Notebooks/3666 ANLP/Project/Climate Change Docs - Actions.pkl'
non_actions_file = '/gdrive/My Drive/Colab Notebooks/3666 ANLP/Project/Climate Change Docs - Non-Actions.pkl'

#initialize accumulators
corpus = []
labels = []

# (file, label, copies): every non-action row is added twice so that the two
# classes end up with approximately the same number of examples.
for src_path, src_label, n_copies in ((actions_file, 'action', 1),
                                      (non_actions_file, 'non_action', 2)):
    # newline='' is what the csv module asks for on files handed to csv.reader.
    # NOTE(review): errors='ignore' silently drops undecodable bytes - confirm this is intended.
    with open(src_path, 'r', encoding="utf8", errors='ignore', newline='') as f:
        reader = csv.reader(f, delimiter=' ')
        for row in reader:
            # normalize(row) could be applied here, but normalization may not be needed
            corpus.extend([row] * n_copies)
            labels.extend([src_label] * n_copies)


In [0]:
# Print every label next to its parse for a visual check of the loaded data
for idx, parse in enumerate(corpus):
    print(labels[idx], parse)

action ['ROOT_self_VB', 'LEFT_aux_VB', 'LEFT_nsubj_NN', 'RIGHT_dobj_NN', 'RIGHT_advcl_VB', 'RIGHT_punct_.']
action ['ROOT_self_VB', 'LEFT_neg_RB', 'RIGHT_dobj_NN', 'RIGHT_prep_IN', 'RIGHT_advcl_VB', 'RIGHT_punct_.']
action ['ROOT_self_VB', 'RIGHT_dobj_NN', 'RIGHT_advcl_VB', 'RIGHT_punct_.']
action ['ROOT_self_VB', 'LEFT_nsubj_NNS', 'LEFT_punct_,', 'LEFT_advcl_VBN', 'LEFT_punct_,', 'LEFT_aux_TO', 'RIGHT_dobj_NNS', 'RIGHT_prep_IN', 'RIGHT_punct_.']
action ['ROOT_self_VB', 'RIGHT_dobj_NN', 'RIGHT_punct_,', 'RIGHT_advcl_VBG', 'RIGHT_punct_.']
action ['ROOT_self_VB', 'RIGHT_dobj_NNS', 'RIGHT_advcl_VB', 'RIGHT_punct_.']
action ['ROOT_self_VB', 'LEFT_advcl_JJ', 'LEFT_punct_,', 'RIGHT_dobj_NNS', 'RIGHT_cc_CC', 'RIGHT_conj_VB', 'RIGHT_punct_.']
action ['ROOT_self_VB', 'RIGHT_prep_IN', 'RIGHT_cc_CC', 'RIGHT_conj_VB', 'RIGHT_punct_.']
action ['ROOT_self_VB', 'LEFT_csubj_VB', 'LEFT_aux_VBP', 'LEFT_neg_RB', 'RIGHT_dobj_NNS', 'RIGHT_punct_.']
action ['ROOT_self_NNS', 'LEFT_amod_JJ', 'RIGHT_cc_CC', '

### Logistic Regression


In [0]:
# where to save the models
# The vectorizer below is created with dirpath "." so its corpus.dict and
# tfidf.model files are written to this directory when it is fitted.
%cd '/gdrive/My Drive/Colab Notebooks/3666 ANLP/Project'

# tiny corpus for testing
#docs=corpus[0:20]+corpus[-20:-1]
#labs=labels[0:20]+labels[-20:-1]

# whole corpus
docs=corpus
labs=labels

# Vectorizer (tofull=True so the output can be fed to a Scikit-Learn estimator)
# NOTE(review): TF-IDF is fitted on the whole corpus before the folds are split,
# and every non-action row occurs twice in the corpus, so a copy of a test row
# can also sit in the training fold - the reported scores may be optimistic; confirm.
v=GensimTfidfVectorizer(".", True) 
vecs=v.fit_transform(docs)

# K-fold splitter for cross-validation (10 shuffled folds)
loader = CorpusLoader(vecs, labs, 10) 

# Storage for all our model metrics (disabled; only the per-fold report is printed)
#fields = ['precision', 'recall', 'accuracy', 'f1']
#scores = defaultdict(list)
#for f in fields:
#    scores[f]=[]

# Train a fresh LogisticRegression on each fold and report on its held-out part
for X_train, X_test, y_train, y_test in loader:
    m=LogisticRegression()
    m.fit(X_train, y_train)
    y_pred=m.predict(X_test)
    #score = accuracy_score(y_test, y_pred)
    #scores.append(score)

    print(classification_report(y_test, y_pred))

    # Add scores to our scores
    #scores['precision'].append(precision_score(y_test, y_pred))
    #scores['recall'].append(recall_score(y_test, y_pred))
    #scores['accuracy'].append(accuracy_score(y_test, y_pred))
    #scores['f1'].append(f1_score(y_test, y_pred))

#print("Results for model {}".format(m))
#print("  Precision: {:0.3f}".format(np.mean(scores['precision'])))
#print("  Recall:    {:0.3f}".format(np.mean(scores['recall'])))
#print("  Accuracy:  {:0.3f}".format(np.mean(scores['accuracy'])))
#print("  F1:        {:0.3f}".format(np.mean(scores['f1'])))    

/gdrive/My Drive/Colab Notebooks/3666 ANLP/Project


  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


              precision    recall  f1-score   support

      action       0.78      0.83      0.80        76
  non_action       0.84      0.79      0.81        84

    accuracy                           0.81       160
   macro avg       0.81      0.81      0.81       160
weighted avg       0.81      0.81      0.81       160

              precision    recall  f1-score   support

      action       0.81      0.83      0.82        83
  non_action       0.81      0.79      0.80        77

    accuracy                           0.81       160
   macro avg       0.81      0.81      0.81       160
weighted avg       0.81      0.81      0.81       160

              precision    recall  f1-score   support

      action       0.73      0.80      0.76        70
  non_action       0.83      0.77      0.80        90

    accuracy                           0.78       160
   macro avg       0.78      0.78      0.78       160
weighted avg       0.79      0.78      0.78       160

              preci



### Read in the Pickled Parses and Labels

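Each non-action parse is added to the corpus twice so that the number of non-action examples approximates the number of action examples, giving roughly balanced training data. Note that, despite the `.pkl` extension, the files are read as space-delimited text with one parse per row.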

In [0]:
# location of the parse files (read below as space-delimited text, one parse per row)
actions_file = '/gdrive/My Drive/Colab Notebooks/3666 ANLP/Project/Climate Change Docs - Actions.pkl'
non_actions_file = '/gdrive/My Drive/Colab Notebooks/3666 ANLP/Project/Climate Change Docs - Non-Actions.pkl'

#initialize accumulators
corpus = []
labels = []

# (file, label, copies): every non-action row is added twice so that the two
# classes end up with approximately the same number of examples.
for src_path, src_label, n_copies in ((actions_file, 'action', 1),
                                      (non_actions_file, 'non_action', 2)):
    # newline='' is what the csv module asks for on files handed to csv.reader.
    # NOTE(review): errors='ignore' silently drops undecodable bytes - confirm this is intended.
    with open(src_path, 'r', encoding="utf8", errors='ignore', newline='') as f:
        reader = csv.reader(f, delimiter=' ')
        for row in reader:
            corpus.extend([row] * n_copies)
            labels.extend([src_label] * n_copies)


In [7]:
# Total number of rows in the corpus (each non-action row was appended twice)
len(corpus)

1596

### Logistic Regression with Truncated SVD

In [16]:
# where to save the models
# The vectorizer below is created with dirpath "." so its corpus.dict and
# tfidf.model files are written to this directory when it is fitted.
%cd '/gdrive/My Drive/Colab Notebooks/3666 ANLP/Project'

# tiny corpus for testing
#docs=corpus[0:20]+corpus[-20:-1]
#labs=labels[0:20]+labels[-20:-1]

# whole corpus
docs=corpus
labs=labels

# Vectorizer (tofull=True so the output can be fed to Scikit-Learn)
v=GensimTfidfVectorizer(".", True) 
vecs=v.fit_transform(docs)

# Truncated SVD: project the TF-IDF vectors down to 5 components
# NOTE(review): both TF-IDF and SVD are fitted on the whole corpus before the
# folds are split, and every non-action row occurs twice in the corpus, so a
# copy of a test row can also sit in the training fold - confirm this is acceptable.
t=TruncatedSVD(n_components=5)
vecs_reduced=t.fit_transform(vecs)

# K-fold splitter for cross-validation (10 shuffled folds over the reduced vectors)
loader = CorpusLoader(vecs_reduced, labs, 10) 

# Train a fresh LogisticRegression on each fold and report on its held-out part
for X_train, X_test, y_train, y_test in loader:
    m=LogisticRegression()
    m.fit(X_train, y_train)
    y_pred=m.predict(X_test)

    print(classification_report(y_test, y_pred))


/gdrive/My Drive/Colab Notebooks/3666 ANLP/Project
              precision    recall  f1-score   support

      action       0.66      0.64      0.65        80
  non_action       0.65      0.68      0.66        80

    accuracy                           0.66       160
   macro avg       0.66      0.66      0.66       160
weighted avg       0.66      0.66      0.66       160

              precision    recall  f1-score   support

      action       0.75      0.65      0.70        89
  non_action       0.63      0.73      0.68        71

    accuracy                           0.69       160
   macro avg       0.69      0.69      0.69       160
weighted avg       0.70      0.69      0.69       160

              precision    recall  f1-score   support

      action       0.72      0.56      0.63        93
  non_action       0.53      0.70      0.61        67

    accuracy                           0.62       160
   macro avg       0.63      0.63      0.62       160
weighted avg       0.64

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


              precision    recall  f1-score   support

      action       0.70      0.71      0.71        77
  non_action       0.73      0.71      0.72        83

    accuracy                           0.71       160
   macro avg       0.71      0.71      0.71       160
weighted avg       0.71      0.71      0.71       160

              precision    recall  f1-score   support

      action       0.65      0.70      0.67        73
  non_action       0.73      0.68      0.70        87

    accuracy                           0.69       160
   macro avg       0.69      0.69      0.69       160
weighted avg       0.69      0.69      0.69       160

              precision    recall  f1-score   support

      action       0.78      0.71      0.74        75
  non_action       0.76      0.82      0.79        84

    accuracy                           0.77       159
   macro avg       0.77      0.76      0.76       159
weighted avg       0.77      0.77      0.77       159

              preci



# Extra stuff below this line is not used or tested

In [0]:
## Quickly generate one Pipeline object per classifier type.

# MultinomialNB was removed from the classifier list because it was generating an error.
# Pass True instead of False to create_pipeline to add the Truncated SVD step.
models = [
    create_pipeline(estimator_cls(), False)
    for estimator_cls in (LogisticRegression, SGDClassifier)
]
