# **Getting started**  

setting up the required data files location from google drive 

In [1]:
#lodaing files folder from google drive 
#give file path in drive.mount('file path')
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
#importing libraries 
import nltk
import os
import re
import numpy as np
import pandas as pd
from nltk.corpus import PlaintextCorpusReader,stopwords

# ***extracting data into data frame***

In [3]:
#extracting text content, name of the file, label from the file folder
# Get the file details
directory = []
file = []
title = []
text = []
label = []
datapath = '/content/drive/MyDrive/workingWithP/dataCatgorize' 
for dirname, _ , filenames in os.walk(datapath):
    #print('Directory: ', dirname)
    #print('Subdir: ', dirname.split('/')[-1])
    # remove the Readme.txt file
    # will not find file in the second iteration so we skip the error
    try:
        filenames.remove('README.TXT')
    except:
        pass
    for filename in filenames:
        directory.append(dirname)
        file.append(filename)
        label.append(dirname.split('/')[-1])
        fullpathfile = os.path.join(dirname,filename)
        with open(fullpathfile, 'r', encoding="utf8", errors='ignore') as infile:
            intext = ''
            firstline = True
            for line in infile:
                if firstline:
                    title.append(line.replace('\n',''))
                    firstline = False
                else:
                    intext = intext + ' ' + line.replace('\n','')
            text.append(intext)

In [4]:
#filtering requried information from extracted data that is text content and label
DATA_COLUMN = 'text'
LABEL_COLUMN = 'label'

fulldf = pd.DataFrame(list(zip(directory, file, title, text, label)), 
               columns =['directory', 'file', 'title', 'text', 'label'])

df = fulldf.filter(['text','label','file'], axis=1)
df.head()
df.tail()

Unnamed: 0,text,label,file
504,"CONSULTING AGREEMENT July 20, 2018 Gianluc...",Consulting Agreements,"KIROMICBIOPHARMA,INC_05_11_2020-EX-10.23-CONSU..."
505,1 5. Confidentiality. 5.1 Acknowledgmen...,Consulting Agreements,"MEDALISTDIVERSIFIEDREIT,INC_05_18_2020-EX-10.1..."
506,"IMMUNOTOLERANCE, INC. CONSULTING AGREEMENT ...",Consulting Agreements,PANDIONTHERAPEUTICSHOLDCOLLC_05_22_2020-EX-10....
507,THIS CONSULTING AGREEMENT is made and entere...,Consulting Agreements,SPHERE3DCORP_06_24_2020-EX-10.12-CONSULTING AG...
508,"2 (c) The Consultant will faithfully, ho...",Consulting Agreements,SLINGERBAGINC_05_27_2020-EX-10.7-CONSULTING AG...


In [5]:
#preview of the file names 
print(df['file'])

0      ENTERPRISEPRODUCTSPARTNERSLP_07_08_1998-EX-10....
1      ENERGYXXILTD_05_08_2015-EX-10.13-Transportatio...
2      MARTINMIDSTREAMPARTNERSLP_01_23_2004-EX-10.3-T...
3      BELLRINGBRANDS,INC_02_07_2020-EX-10.18-MASTER ...
4      BIOFRONTERAAG_04_29_2019-EX-4.17-SUPPLY AGREEM...
                             ...                        
504    KIROMICBIOPHARMA,INC_05_11_2020-EX-10.23-CONSU...
505    MEDALISTDIVERSIFIEDREIT,INC_05_18_2020-EX-10.1...
506    PANDIONTHERAPEUTICSHOLDCOLLC_05_22_2020-EX-10....
507    SPHERE3DCORP_06_24_2020-EX-10.12-CONSULTING AG...
508    SLINGERBAGINC_05_27_2020-EX-10.7-CONSULTING AG...
Name: file, Length: 509, dtype: object


In [6]:
file_names = df['file']
df.shape

(509, 3)

**Text Pre-Processing**

In [7]:
#Importing Libraries
from bs4 import BeautifulSoup
import nltk

In [8]:
#general text processing 
# removing punctation and white spaces 
#setting the text to lower case
def text_preprocess(text):
  text=text.lower()
  text=re.compile('[/(){}\[\]\|@,;]').sub(' ', text)
  return text

In [9]:
df['text']=df['text'].apply(text_preprocess)
df['text'].head()

0         enterprise logo appears here   enterprise...
1                transportation agreement   table o...
2      transportation services agreement  this mari...
3      certain confidential information contained i...
4          supply agreement this supply agreement  ...
Name: text, dtype: object

*punctuation removal*

In [10]:
# importing library that has punctuation list
import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [11]:
#defining a function to remove punctuation 
def remove_all_punctuation(text):
  punctuation_remove= "".join([i for i in text if i not in string.punctuation])
  return punctuation_remove


In [12]:
#calling punctuation_remove function to clean the data 
df['text'] = df['text'].apply(lambda x:remove_all_punctuation(x))
df['text']

0           enterprise logo appears here   enterprise...
1                  transportation agreement   table o...
2        transportation services agreement  this mari...
3        certain confidential information contained i...
4            supply agreement this supply agreement  ...
                             ...                        
504      consulting agreement  july 20  2018  gianluc...
505      1      5 confidentiality 51 acknowledgment o...
506      immunotolerance  inc  consulting agreement  ...
507      this consulting agreement is made and entere...
508          2  c  the consultant will faithfully  ho...
Name: text, Length: 509, dtype: object

In [13]:
#space removel
df['text'] = df['text'].str.replace('\d+', '')

*lowering the text*

In [14]:
#setting the text to lower case
df['text'] = df['text'].apply(lambda x: x.lower())

*HTML removing*

In [15]:
#importing package from lib
from bs4 import BeautifulSoup 

In [16]:
#appling beautiful soup on the text 
df['text'] = [BeautifulSoup(str(text)).get_text() for text in df['text'].tolist()]

*unicode removal*

In [17]:
#importing library 
import unicodedata

In [18]:
#appling unicode on the text data
df['text'] = [unicodedata.normalize('NFKD', str(text)).encode('ascii', 'ignore').decode('utf-8', 'ignore') for text in df['text'].tolist()]

contractions removing

In [19]:
#installing contraction library
!pip install contractions

Collecting contractions
  Downloading contractions-0.0.58-py2.py3-none-any.whl (8.0 kB)
Collecting textsearch>=0.0.21
  Downloading textsearch-0.0.21-py2.py3-none-any.whl (7.5 kB)
Collecting pyahocorasick
  Downloading pyahocorasick-1.4.2.tar.gz (321 kB)
[K     |████████████████████████████████| 321 kB 4.0 MB/s 
[?25hCollecting anyascii
  Downloading anyascii-0.3.0-py3-none-any.whl (284 kB)
[K     |████████████████████████████████| 284 kB 49.0 MB/s 
[?25hBuilding wheels for collected packages: pyahocorasick
  Building wheel for pyahocorasick (setup.py) ... [?25l[?25hdone
  Created wheel for pyahocorasick: filename=pyahocorasick-1.4.2-cp37-cp37m-linux_x86_64.whl size=85456 sha256=cfd28ef8d0e20dc5a44123f782beb2a31b117bf60c3fda6662364891a13b495c
  Stored in directory: /root/.cache/pip/wheels/25/19/a6/8f363d9939162782bb8439d886469756271abc01f76fbd790f
Successfully built pyahocorasick
Installing collected packages: pyahocorasick, anyascii, textsearch, contractions
Successfully install

In [20]:
#importing library
import contractions
from contractions import contractions_dict

In [21]:
#appling the contactions list on the text data to remove all contractions
for contraction, expansion in contractions_dict.items():
  df['text'] = df['text'].str.replace(contraction, expansion)

*tokenization*

In [22]:
#importing library for tokenization
import re
from nltk.tokenize import word_tokenize
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [23]:
#defining function for tokenization
def tokenization(text):
  tokens = word_tokenize(text)
  return tokens

In [24]:
#calling tokenization function to tokenize each word in the text content
df['text'] = df['text'].apply(str)
df['text'] = df['text'].apply(lambda x: tokenization(x))
df['text']

0      [enterprise, logo, appears, here, enterprise, ...
1      [transportation, agrethement, table, of, conte...
2      [transportation, services, agrethement, this, ...
3      [certain, confidential, information, contained...
4      [supply, agrethement, this, supply, agrethemen...
                             ...                        
504    [consulting, agrethement, july, gianluca, roti...
505    [confidentiality, acknowledgment, of, propriet...
506    [immunotolerance, inc, consulting, agrethement...
507    [this, consulting, agrethement, is, made, and,...
508    [c, the, consultant, will, faithfully, honestl...
Name: text, Length: 509, dtype: object

In [25]:
#preview of tokens from one file
df['text'][0]

['enterprise',
 'logo',
 'appears',
 'here',
 'enterprise',
 'transportation',
 'company',
 'a',
 'division',
 'of',
 'enterprise',
 'products',
 'company',
 'po',
 'box',
 'phone',
 'contract',
 'no',
 'houston',
 'tx',
 'date',
 'june',
 'transportation',
 'contract',
 'this',
 'contract',
 'is',
 'entered',
 'into',
 'by',
 'and',
 'between',
 'enterprise',
 'transportation',
 'company',
 'a',
 'division',
 'of',
 'enterprise',
 'products',
 'company',
 'carrier',
 'and',
 'enterprise',
 'products',
 'operating',
 'lp',
 'a',
 'delaware',
 'limited',
 'partnership',
 'po',
 'box',
 'houston',
 'tx',
 'shipper',
 'address',
 'city',
 'state',
 'shipper',
 'is',
 'engaged',
 'in',
 'business',
 'as',
 'a',
 'manufacturer',
 'distributor',
 'or',
 'dealer',
 'of',
 'chthemicals',
 'or',
 'petroleum',
 'products',
 'commodities',
 'and',
 'shipper',
 'requires',
 'transportation',
 'of',
 'commodities',
 'in',
 'intrastate',
 'interstate',
 'or',
 'foreign',
 'commerce',
 'carrier',
 'i

*stop word removal*

In [26]:
# nltk lib has already imported 
#stop words in given library
nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('english')
stopwords[0:20]

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his']

In [27]:
#defining function for stop words
def remove_stopwords(text):
  output = [i for i in text if i not in stopwords]
  return output

In [28]:
#appling the stopwords function to the text content
df['text'] = df['text'].apply(lambda x:remove_stopwords(x))
df['text']

0      [enterprise, logo, appears, enterprise, transp...
1      [transportation, agrethement, table, contents,...
2      [transportation, services, agrethement, marchn...
3      [certain, confidential, information, contained...
4      [supply, agrethement, supply, agrethement, agr...
                             ...                        
504    [consulting, agrethement, july, gianluca, roti...
505    [confidentiality, acknowledgment, proprietary,...
506    [immunotolerance, inc, consulting, agrethement...
507    [consulting, agrethement, made, entered, june,...
508    [c, consultant, faithfully, honestly, diligent...
Name: text, Length: 509, dtype: object

*lemmatization*

In [29]:
#importing lemmatization from lib
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [30]:
#defining functioon as object
wordnet_lemmatizer = WordNetLemmatizer()

In [31]:
#defining the lemmatization function
def lemmatizer(text):
  lemm_text = [wordnet_lemmatizer.lemmatize(word) for word in text]
  return lemm_text

In [32]:
#appling lemmization function to text content
df['text'] = df['text'].apply(lambda x: lemmatizer(x))

*null values removing*

In [33]:
# removing white spaces or null values from the text content 
df = df[~(df.text.str.strip() == '')]
df = df.dropna().reset_index(drop=True)

# **splitting data**

splitting corpus into two parts for testing and training the model.
training - 80% 
test - 20 % 
so the corpus has to spilt in the ratio of 80:20.


In [34]:
# importing spillitng function from sklearn library
from sklearn.model_selection import train_test_split

In [35]:
#dividing up labels and data for testing as well as training
train_corpus_set, test_corpus_set, train_label_names_set, test_label_names_set = train_test_split(np.array(df['text'].apply(lambda x:np.str_(x))),
np.array(df['label']), 
test_size=0.20, 
random_state=42)

train_corpus_set.shape, test_corpus_set.shape

((407,), (102,))

# **TF-IDF and classification models**

## **features extraction**

## *TF-IDF* 

In [36]:
#importing tfidf package from skleran
from sklearn.feature_extraction.text import TfidfVectorizer

In [37]:
#normilization of TF-IDF algorithm
tfidf_tv = TfidfVectorizer(use_idf=True, min_df=0.0, max_df=1.0)

In [38]:
#appling the TF-IDF algorithm to extract features 
tfidf_tv_train_features = tfidf_tv.fit_transform(train_corpus_set)
tfidf_tv_test_features = tfidf_tv.transform(test_corpus_set)

In [39]:
print('TF-IDF model:\n','Train features:', tfidf_tv_train_features.shape,'\n', 'Test features:', tfidf_tv_test_features.shape)

TF-IDF model:
 Train features: (407, 23599) 
 Test features: (102, 23599)


## **classification model and evaluation**

## *SVM*

In [40]:
# importing SVM algorithm from sklearn library
from sklearn.svm import LinearSVC

In [41]:
#applying the SVM algorithm on training data to train the model
svm_model = LinearSVC(penalty='l2', C=1, random_state=42)
svm_model.fit(tfidf_tv_train_features, train_label_names_set)

LinearSVC(C=1, random_state=42)

In [42]:
#classification report of SVM Model
from sklearn.metrics import classification_report
y_pred = svm_model.predict(tfidf_tv_test_features)
print(classification_report(test_label_names_set, y_pred))

                         precision    recall  f1-score   support

      Agency Agreements       0.50      0.50      0.50         2
            Co_Branding       0.67      0.50      0.57         4
          Collaboration       0.75      0.86      0.80         7
  Consulting Agreements       0.75      1.00      0.86         3
            Development       0.67      0.80      0.73         5
            Distributor       0.90      0.82      0.86        11
            Endorsement       0.00      0.00      0.00         1
  Endorsement Agreement       0.00      0.00      0.00         1
              Franchise       1.00      1.00      1.00         3
                Hosting       0.60      0.60      0.60         5
                     IP       1.00      1.00      1.00         1
          Joint Venture       0.00      0.00      0.00         1
 Joint Venture _ Filing       0.40      0.67      0.50         3
     License_Agreements       0.33      0.60      0.43         5
            Maintenance 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## *logestic regression*

In [43]:
#impoorting logistic regression algorithm from sklearn library
import sklearn.linear_model as sk
from sklearn.linear_model import LogisticRegression


In [44]:
#applying the SVM algorithm on training data to train the model
lr_model = LogisticRegression(penalty='l2', solver='lbfgs', multi_class='ovr',
                        max_iter=1000, C=1, random_state=42)
lr_model.fit(tfidf_tv_train_features, train_label_names_set)

LogisticRegression(C=1, max_iter=1000, multi_class='ovr', random_state=42)

In [45]:
#testing and classification report of LOgestic regression 
y_pred = lr_model.predict(tfidf_tv_test_features)
print(classification_report(test_label_names_set, y_pred))

                         precision    recall  f1-score   support

      Agency Agreements       0.00      0.00      0.00         2
            Co_Branding       1.00      0.50      0.67         4
          Collaboration       1.00      0.43      0.60         7
  Consulting Agreements       1.00      1.00      1.00         3
            Development       0.36      0.80      0.50         5
            Distributor       0.90      0.82      0.86        11
            Endorsement       0.00      0.00      0.00         1
  Endorsement Agreement       0.00      0.00      0.00         1
              Franchise       1.00      1.00      1.00         3
                Hosting       0.50      0.20      0.29         5
                     IP       1.00      1.00      1.00         1
          Joint Venture       0.00      0.00      0.00         1
 Joint Venture _ Filing       0.00      0.00      0.00         3
     License_Agreements       0.31      0.80      0.44         5
            Maintenance 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# **word2vec and other models**

# **features extraction**

## word2vec

In [46]:
#importing gensim library to get Word2Vec algorithm
import gensim

In [47]:
#tokenization for the model
tokenized_train = [nltk.tokenize.word_tokenize(text)
                   for text in train_corpus_set]
tokenized_test = [nltk.tokenize.word_tokenize(text)
                   for text in test_corpus_set]

In [48]:
#setting up the word2vec alogithm 
w2v_features_num = 300
w2v_model = gensim.models.Word2Vec(tokenized_train, 
            size=w2v_features_num, 
            window=10,
            min_count=2, 
            sg = 0, 
            iter=5, workers=5) 

In [49]:
#features extraction using word2vec
#fitting the tokenized training data to the algorithm 
def document_vectorizer(corpus, model, num_features):
    vocabulary = set(model.wv.index2word)
    
    def average_word_vectors(words, model, vocabulary, num_features):
        feature_vector = np.zeros((num_features,), dtype="float64")
        nwords = 0.
        
        for word in words:
            if word in vocabulary: 
                nwords = nwords + 1.
                feature_vector = np.add(feature_vector, model.wv[word])
        if nwords:
            feature_vector = np.divide(feature_vector, nwords)

        return feature_vector

    features = [average_word_vectors(tokenized_sentence, model, vocabulary, num_features)
                    for tokenized_sentence in corpus]
    return np.array(features)
    
# generate averaged word vector features from word2vec model
avg_wv_train_features = document_vectorizer(corpus=tokenized_train, model=w2v_model,
                                                     num_features=w2v_features_num)
avg_wv_test_features = document_vectorizer(corpus=tokenized_test, model=w2v_model,
                                                    num_features=w2v_features_num)

## **prediction model and  evaluation**

## logistic regression

In [50]:
#setting up the dataframe to the logistic regression algorithm 
#training the model with the training data
lr_model = LogisticRegression(penalty='l2', solver='lbfgs', multi_class='ovr',
                        max_iter=1000, C=1, random_state=42)
lr_model.fit(avg_wv_train_features, train_label_names_set)

LogisticRegression(C=1, max_iter=1000, multi_class='ovr', random_state=42)

In [51]:
# testing the model and classification report 
y_pred = lr_model.predict(avg_wv_test_features)
print(classification_report(test_label_names_set, y_pred))

                         precision    recall  f1-score   support

      Agency Agreements       0.00      0.00      0.00         2
            Co_Branding       1.00      0.50      0.67         4
          Collaboration       0.50      0.14      0.22         7
  Consulting Agreements       0.00      0.00      0.00         3
            Development       0.17      0.20      0.18         5
            Distributor       0.78      0.64      0.70        11
            Endorsement       0.00      0.00      0.00         1
  Endorsement Agreement       0.00      0.00      0.00         1
              Franchise       1.00      1.00      1.00         3
                Hosting       0.50      0.20      0.29         5
                     IP       1.00      1.00      1.00         1
          Joint Venture       0.00      0.00      0.00         1
 Joint Venture _ Filing       0.75      1.00      0.86         3
     License_Agreements       0.56      1.00      0.71         5
            Maintenance 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## SVM

In [52]:
#setting up the dataframe to the SVM algorithm 
#training the model with the training data
svm_model = LinearSVC(penalty='l2', C=1, random_state=42)
svm_model.fit(avg_wv_train_features, train_label_names_set)



LinearSVC(C=1, random_state=42)

In [53]:
#testing the model 
y_pred = svm_model.predict(avg_wv_test_features)

In [54]:
#classification model
print(classification_report(test_label_names_set, y_pred))

                         precision    recall  f1-score   support

   Affiliate_Agreements       0.00      0.00      0.00         0
      Agency Agreements       0.20      0.50      0.29         2
            Co_Branding       0.50      0.75      0.60         4
          Collaboration       0.57      0.57      0.57         7
  Consulting Agreements       1.00      0.67      0.80         3
            Development       0.40      0.40      0.40         5
            Distributor       0.88      0.64      0.74        11
            Endorsement       0.00      0.00      0.00         1
  Endorsement Agreement       0.00      0.00      0.00         1
              Franchise       1.00      1.00      1.00         3
                Hosting       0.50      0.20      0.29         5
                     IP       0.50      1.00      0.67         1
          Joint Venture       0.00      0.00      0.00         1
 Joint Venture _ Filing       0.60      1.00      0.75         3
     License_Agreements 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
