In [1]:
import sklearn.datasets as skd
from nltk.tokenize import sent_tokenize,word_tokenize
from nltk.corpus import stopwords
import string
from nltk.stem import WordNetLemmatizer
import numpy as np
import re
from nltk.stem import SnowballStemmer

In [2]:
categories = ['0', '1']
ds = skd.load_files("./dataset/", categories=categories, encoding="UTF-8")

In [3]:
ds.keys()

dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])

In [4]:
ds.target_names

['0', '1']

In [5]:
type(ds.data)

list

In [6]:
ds.data[0]

'\n\n\n\n\n\n\nEX-10.III.H.A\n\n\n\n\n\n\nExhibit\xa010. (iii) (H) (a)\n\n\n\n\nAMENDMENT TO AMENDED AND RESTATED EMPLOYMENT AGREEMENT\n\n\n\n\nTHIS AMENDMENT (the “Amendment”) is made by Sterling Bancorp (the “Company”)\nand Louis J. Cappelli (“Executive”) to be effective as of December\xa029, 2008.\n\n\n\nWHEREAS, the Company and Executive are parties to an Amended and Restated Employment Agreement\ndated March\xa022, 2002, which was last amended on March\xa013, 2008 (the “Agreement”);\n\n\n\nWHEREAS, the Company and Executive desire to amend certain provisions of the Agreement in\norder to be exempt from or comply with Section\xa0409A of the Internal Revenue Code of 1986, as\namended (“Section\xa0409A”); and\n\n\n\nNOW, THEREFORE, the Agreement is hereby amended as follows:\n\n\n\n\n1.\t\xa0\tSection\xa02 of the Agreement is hereby amended by adding a new Section 2(c) as follows:\n\n“(c) will be permitted to continue to engage in activities not directly related to the\nbusiness of t

In [7]:
def clean_text(text):
    
    ## Remove puncuation
    text = text.translate(string.punctuation)
    
    ## Convert words to lower case and split them
    text = text.lower().split()
    
    ## Remove stop words
    stops = set(stopwords.words("english"))
    text = [w for w in text if not w in stops and len(w) >= 3]
    
    text = " ".join(text)
    ## Clean the text
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", "  ", text)
    text = re.sub(r"\-", "  ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)
    ## Stemming
    text = text.split()
    stemmer = SnowballStemmer('english')
    stemmed_words = [stemmer.stem(word) for word in text]
    text = " ".join(stemmed_words)
    return text 

In [10]:
length = len(ds.data)

In [11]:
final_data = []
for ix in range(length):
    text_str = clean_text(ds.data[ix])
    final_data.append(text_str)

In [12]:
len(final_data)

1000

In [13]:
from sklearn.model_selection import train_test_split

In [14]:
X_train, X_test, Y_train, Y_test = train_test_split(final_data, ds.target, test_size=0.33)

In [15]:
X_train = np.array(X_train)
Y_train = np.array(Y_train)
X_test = np.array(X_test)
Y_test = np.array(Y_test)

In [16]:
print(X_train.shape, Y_train.shape)
print(X_test.shape, Y_test.shape)

(670,) (670,)
(330,) (330,)


In [17]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()


In [18]:
X_train_tf = count_vect.fit_transform(X_train)

In [19]:
X_train_tf.shape

(670, 8544)

In [46]:
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()

In [47]:
X_train_tfidf = tfidf_transformer.fit_transform(X_train_tf)

In [22]:
X_train_tfidf.shape

(670, 8544)

In [23]:
from sklearn.naive_bayes import MultinomialNB

In [24]:
clf = MultinomialNB().fit(X_train_tfidf, Y_train)

In [25]:
X_test_tf = count_vect.transform(X_test)
X_test_tfidf = tfidf_transformer.transform(X_test_tf)

In [26]:
prediction = clf.predict(X_test_tfidf)

In [27]:
from sklearn.metrics import accuracy_score, classification_report
print(accuracy_score(Y_test, prediction))

0.8696969696969697


In [28]:
print(classification_report(Y_test, prediction, target_names=ds.target_names))

             precision    recall  f1-score   support

          0       0.88      0.83      0.85       152
          1       0.86      0.90      0.88       178

avg / total       0.87      0.87      0.87       330



In [29]:
from sklearn.ensemble import RandomForestClassifier

In [30]:
rf = RandomForestClassifier().fit(X_train_tfidf, Y_train)

In [31]:
pred_rf = rf.predict(X_test_tfidf)

In [32]:
print( accuracy_score(Y_test, pred_rf))

0.8393939393939394


In [33]:
print(classification_report(Y_test, pred_rf, target_names=ds.target_names))

             precision    recall  f1-score   support

          0       0.82      0.84      0.83       152
          1       0.86      0.84      0.85       178

avg / total       0.84      0.84      0.84       330



In [34]:
from sklearn.svm import LinearSVC

In [35]:
svc = LinearSVC().fit(X_train_tfidf, Y_train)

In [36]:
pred_svc = svc.predict(X_test_tfidf)

In [37]:
print(accuracy_score(Y_test, pred_svc))

0.8878787878787879


In [38]:
print(classification_report(Y_test, pred_svc, target_names=ds.target_names))

             precision    recall  f1-score   support

          0       0.88      0.88      0.88       152
          1       0.90      0.89      0.90       178

avg / total       0.89      0.89      0.89       330

