# Using only spaCy

In [2]:
import spacy
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline

# Input file locations, given as relative paths (presumably relative to the
# notebook's working directory — TODO confirm).
TRAIN_LABELS = 'participants/train/labels/labels.csv'  # labels CSV; read with the columns document_name and is_fitara
TRAIN_TEXT = 'participants/train/extracted_data/extract_combined.csv'  # training text CSV, merged with the labels on document_name
TEST_TEXT = 'participants/test/extracted_data/extract_combined.csv'  # test text CSV

### Importing Data

In [3]:
# Labels: keep only the document key and the target column.
train_labels_df = pd.read_csv(TRAIN_LABELS, usecols=['document_name', 'is_fitara'])
train_text_df = pd.read_csv(TRAIN_TEXT)
test_df = pd.read_csv(TEST_TEXT)

# Encode the target as an integer: 1 where the label equals 'Yes', 0 otherwise.
train_labels_df['is_fitara'] = train_labels_df['is_fitara'].eq('Yes').mul(1)

# Join labels to the extracted text on the shared document key.
fitara_df = pd.merge(
    train_labels_df,
    train_text_df,
    on='document_name',
    how='inner'
)

# NOTE: from here on `fitara_df` is rebuilt from the pickled frame, which
# supersedes the merge result above.
# SECURITY: pd.read_pickle can execute arbitrary code while loading; only load
# pickle files that come from a trusted source.
raw_data = pd.read_pickle('participants/train_df_ORG_GPE_DATE.pkl')
word_count_df = pd.read_pickle('participants/get_only_ORG.pkl')
raw_data['word_count'] = word_count_df.word_count

# Keep only rows whose word_count exceeds the threshold. `.copy()` makes the
# result an independent frame, so later column assignments do not trigger
# SettingWithCopyWarning or write through to `raw_data`.
MIN_WORD_COUNT = 100
fitara_df = raw_data[raw_data.word_count > MIN_WORD_COUNT].copy()

# The original cell had an incomplete statement here
# (`fitara_df.text_org = fitara_df.text_org[]`), a syntax error that stopped
# the whole cell from running. It has been removed; `text_org` is left as loaded.
fitara_df.head()

Unnamed: 0,document_name,is_fitara,text,text_org,word_count
0,04-42RFP.pdf,0,\n \nOMB No. 0990-0115 \n \n Electronic Requ...,Proposal Purchase Authority RFP NIH November ...,1689
7,1_-_Terms_&_Conditions.docx,0,"52.203-98, Prohibition on Contracting with Ent...",2015-02 Division E 2015 Standard Form 2015-02...,607
8,1_-_Terms_&_Conditions_for_posting.docx,0,"52.203-98, Prohibition on Contracting with Ent...",2015-02 Division E 2015 Standard Form 2015-02...,521
9,1_-_Terms_&_Conditions_for_Posting_NOI-RML-E-1...,0,"52.203-98, Prohibition on Contracting with Ent...",2015-02 Division E 2015 Standard Form 2015-02...,708
10,1_-_Terms_3&_Conditions_attached_to_posting.docx,0,"52.203-98, Prohibition on Contracting with Ent...",2015-02 Division E 2015 Standard Form 2015-02...,681


### Tokenizing Data

In [4]:
import string
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English

# Punctuation characters used to filter tokens (a single string of symbols).
punctuations = string.punctuation

# English stop-word collection shipped with spaCy; this is the same object as
# spacy.lang.en.stop_words.STOP_WORDS, imported by name above.
stop_words = STOP_WORDS

# Load the spaCy pipeline registered under the name 'en'.
# (The bare `English()` class was the alternative considered here.)
parser = spacy.load('en')

# Tokenizer callable handed to the vectorizer
def spacy_tokenizer(sentence):
    """Turn a piece of text into a list of normalised, filtered token strings.

    The text is run through the module-level ``parser``. Each token becomes its
    lower-cased, whitespace-stripped lemma, except tokens whose lemma is the
    ``"-PRON-"`` placeholder, which keep their lower-cased surface form.
    Results found in ``stop_words`` or in ``punctuations`` are discarded.

    Note: ``punctuations`` is assigned ``string.punctuation`` in this notebook,
    so the second check is a substring test against that string (which also
    discards empty strings).
    """
    doc = parser(sentence)

    kept = []
    for token in doc:
        # Pronoun placeholder lemma: fall back to the token's own lower-case text
        if token.lemma_ == "-PRON-":
            normalised = token.lower_
        else:
            normalised = token.lemma_.lower().strip()

        # Skip stop words and punctuation
        if normalised in stop_words or normalised in punctuations:
            continue
        kept.append(normalised)

    return kept

def get_only_ORG(text, nlp=None, verbose=True):
    """Return the text of every ORG entity found in ``text`` as one string.

    Parameters
    ----------
    text : str
        Document to analyse.
    nlp : callable, optional
        Pipeline to run on ``text``; it must return an object with an ``ents``
        iterable whose items expose ``label_`` and ``text``. Defaults to the
        module-level ``parser``. The original body called an ``nlp`` global
        whose definition is commented out in this notebook, which raised
        NameError on a fresh kernel.
    verbose : bool, optional
        When True (the default, matching the original behaviour) print
        "Not found" if no ORG entity was detected, then the full input text
        and a separator line. Pass False to keep bulk runs quiet.

    Returns
    -------
    str
        Each ORG entity text prefixed with a single space and concatenated in
        document order (so a non-empty result starts with a space); an empty
        string when there is no ORG entity.
    """
    pipeline = parser if nlp is None else nlp
    doc = pipeline(text)

    # Same result as the original incremental concatenation: " " + text per entity
    ORGS = "".join(" " + ent.text for ent in doc.ents if ent.label_ == 'ORG')

    if verbose:
        if len(ORGS) == 0:
            print("Not found")
        print(text)
        print('====================================================================================================')
    return ORGS

In [5]:
# tokens = spacy_tokenizer(fitara_df.text[0])

### Custom Transformer

In [6]:
# Custom text-cleaning transformer for use as a pipeline step
class predictors(TransformerMixin):
    """Stateless transformer that applies ``clean_text`` to each input item.

    It learns nothing during ``fit`` and reports no parameters.
    """

    def fit(self, X, y=None, **fit_params):
        """Nothing to learn; return the transformer itself."""
        return self

    def transform(self, X, **transform_params):
        """Return a list with ``clean_text`` applied to every element of ``X``."""
        cleaned = []
        for text in X:
            cleaned.append(clean_text(text))
        return cleaned

    def get_params(self, deep=True):
        """Report an empty parameter mapping (there is nothing to tune)."""
        return dict()

# Minimal text normalisation helper
def clean_text(text):
    """Return ``text`` with leading/trailing whitespace removed and lower-cased."""
    trimmed = text.strip()
    return trimmed.lower()

### Feature Engineering

In [7]:
# Bag-of-words counts over 2- and 3-grams of the tokens produced by spacy_tokenizer.
bow_vector = CountVectorizer(tokenizer=spacy_tokenizer, ngram_range=(2, 3))

# TF-IDF alternative over word 2- and 3-grams, using a regex token pattern
# instead of the spaCy tokenizer. The pattern is a raw string so that `\w`
# reaches the regex engine unchanged: in a normal string literal `\w` is an
# invalid escape sequence and newer Python versions warn about it.
tfidf_vector = TfidfVectorizer(
    min_df=5,
    analyzer='word',
    ngram_range=(2, 3),
    token_pattern=r'\w{1,}',
    max_features=30000,
)

### Splitting data into Training and Test sets

In [11]:
from sklearn.model_selection import train_test_split

# Model input is the `text_org` column; the target is the `is_fitara` column.
X = fitara_df.text_org
ylabels = fitara_df.is_fitara

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    ylabels,
    test_size=0.3,
    random_state=20,
)

### Creating the Pipeline and Generating the Model

In [24]:
# Classifier selection (Logistic Regression, Naive Bayes and SGD are imported as candidates)
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier

# Active classifier: multinomial Naive Bayes with equal class priors.
# Alternatives tried: LogisticRegression() and SGDClassifier(max_iter=100, verbose=0).
classifier = MultinomialNB(class_prior=[.5, .5])

# Bag-of-words pipeline; the optional ("cleaner", predictors()) step is left out.
pipeline_steps = [
    ('vectorizer', bow_vector),
    ('classifier', classifier),
]
pipe = Pipeline(pipeline_steps)

# Fit on the training split; as the cell's last expression this also displays
# the fitted pipeline.
pipe.fit(X_train, y_train)



Pipeline(memory=None,
     steps=[('vectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(2, 3), preprocessor=None, stop_words=None,
       ...m_state=None, shuffle=True, tol=None,
       validation_fraction=0.1, verbose=0, warm_start=False))])

### Evaluating Model

In [25]:
from sklearn import metrics
# Score a fitted model on held-out data
def evaluate_model(model=None, X=None, y=None):
    """Print accuracy, precision and recall of a fitted model on held-out data.

    Parameters
    ----------
    model : fitted estimator or Pipeline, optional
        Object exposing ``predict``. Defaults to the notebook-level ``pipe``.
    X : optional
        Inputs to predict on. Defaults to the notebook-level ``X_test``.
    y : optional
        True labels for ``X``. Defaults to the notebook-level ``y_test``.

    Each printed line is labelled with the class name of the estimator that
    was evaluated (the last step when ``model`` has a ``steps`` list), rather
    than a hard-coded "Logistic Regression", which was wrong whenever another
    classifier was plugged into the pipeline.
    """
    # `is None` checks: pandas objects cannot be used in a boolean context
    model = pipe if model is None else model
    X = X_test if X is None else X
    y = y_test if y is None else y

    # For a pipeline, name the final (name, estimator) step; otherwise the model itself
    steps = getattr(model, 'steps', None)
    final_estimator = steps[-1][1] if steps else model
    label = type(final_estimator).__name__

    predicted = model.predict(X)
    print(label + " Accuracy:", metrics.accuracy_score(y, predicted))
    print(label + " Precision:", metrics.precision_score(y, predicted))
    print(label + " Recall:", metrics.recall_score(y, predicted))


In [26]:
evaluate_model()

Logistic Regression Accuracy: 0.8037974683544303
Logistic Regression Precision: 0.734375
Logistic Regression Recall: 0.7704918032786885
