In [228]:
import sklearn.datasets as skd
from nltk.tokenize import sent_tokenize,word_tokenize
from nltk.corpus import stopwords
import string
from nltk.stem import WordNetLemmatizer
import numpy as np

In [229]:
categories = ['0', '1']
ds = skd.load_files("./dataset/", categories=categories, encoding="UTF-8")

In [230]:
ds.keys()

dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])

In [231]:
ds.target_names

['0', '1']

In [232]:
type(ds.data)

list

In [233]:
ds.data[0]

'\n\n\n\n\n\n\nEX-10.III.H.A\n\n\n\n\n\n\nExhibit\xa010. (iii) (H) (a)\n\n\n\n\nAMENDMENT TO AMENDED AND RESTATED EMPLOYMENT AGREEMENT\n\n\n\n\nTHIS AMENDMENT (the “Amendment”) is made by Sterling Bancorp (the “Company”)\nand Louis J. Cappelli (“Executive”) to be effective as of December\xa029, 2008.\n\n\n\nWHEREAS, the Company and Executive are parties to an Amended and Restated Employment Agreement\ndated March\xa022, 2002, which was last amended on March\xa013, 2008 (the “Agreement”);\n\n\n\nWHEREAS, the Company and Executive desire to amend certain provisions of the Agreement in\norder to be exempt from or comply with Section\xa0409A of the Internal Revenue Code of 1986, as\namended (“Section\xa0409A”); and\n\n\n\nNOW, THEREFORE, the Agreement is hereby amended as follows:\n\n\n\n\n1.\t\xa0\tSection\xa02 of the Agreement is hereby amended by adding a new Section 2(c) as follows:\n\n“(c) will be permitted to continue to engage in activities not directly related to the\nbusiness of t

In [234]:
def process(content):
    content = content.lower()
    content = content.strip('\n')
    content = content.replace('\n','')
    content = content.replace('\xa0',' ')
    content = content.strip('(')
    content = content.replace('(',"")
    content = content.strip(')')
    content = content.replace(')',"")
    
    word_list = word_tokenize(content)
    
    sw  = set(stopwords.words('english'))
    useful_words = [w for w in word_list if w not in sw]
    
    table = str.maketrans('', '', string.punctuation)
    stripped = [w.translate(table) for w in useful_words]
    stripped = list(filter(None, stripped))
    #stripped = list(filter("\"",stripped))
    
    l = WordNetLemmatizer()
    lem_words = []
    for w in stripped:
        lem_words.append(l.lemmatize(w))
    
    return lem_words
    
    

In [235]:
length = len(ds.data)

In [236]:
text_list = []
for ix in range(length):
    lem_words = process(ds.data[ix])
    text_list.append(lem_words)
    

In [237]:
text_str = ""
final_data = []
for ix in range(length):
    text_str = "".join(text_list[ix])
    final_data.append(text_str)

In [238]:
len(final_data)

1000

In [239]:
from sklearn.model_selection import train_test_split

In [272]:
X_train, X_test, Y_train, Y_test = train_test_split(final_data, ds.target, test_size=0.33)

In [273]:
X_train = np.array(X_train)
Y_train = np.array(Y_train)
X_test = np.array(X_test)
Y_test = np.array(Y_test)

In [274]:
print(X_train.shape, Y_train.shape)
print(X_test.shape, Y_test.shape)

(670,) (670,)
(330,) (330,)


In [275]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()


In [276]:
X_train_tf = count_vect.fit_transform(X_train)

In [277]:
X_train_tf.shape

(670, 9734)

In [278]:
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()

In [279]:
X_train_tfidf = tfidf_transformer.fit_transform(X_train_tf)

In [280]:
X_train_tfidf.shape

(670, 9734)

In [281]:
from sklearn.naive_bayes import MultinomialNB

In [282]:
clf = MultinomialNB().fit(X_train_tfidf, Y_train)

In [283]:
X_test_tf = count_vect.transform(X_test)
X_test_tfidf = tfidf_transformer.transform(X_test_tf)

In [284]:
prediction = clf.predict(X_test_tfidf)

In [285]:
from sklearn.metrics import accuracy_score, classification_report
print(accuracy_score(Y_test, prediction))

0.6151515151515151


In [286]:
print(classification_report(Y_test, prediction, target_names=ds.target_names))

             precision    recall  f1-score   support

          0       0.94      0.26      0.41       168
          1       0.56      0.98      0.71       162

avg / total       0.75      0.62      0.56       330



In [287]:
from sklearn.ensemble import RandomForestClassifier

In [288]:
rf = RandomForestClassifier().fit(X_train_tfidf, Y_train)

In [289]:
pred_rf = rf.predict(X_test_tfidf)

In [290]:
print( accuracy_score(Y_test, pred_rf))

0.7666666666666667


In [291]:
print(classification_report(Y_test, pred_rf, target_names=ds.target_names))

             precision    recall  f1-score   support

          0       0.74      0.83      0.78       168
          1       0.80      0.70      0.75       162

avg / total       0.77      0.77      0.77       330



In [292]:
from sklearn.svm import LinearSVC

In [293]:
svc = LinearSVC().fit(X_train_tfidf, Y_train)

In [294]:
pred_svc = svc.predict(X_test_tfidf)

In [295]:
print(accuracy_score(Y_test, pred_svc))

0.7757575757575758


In [296]:
print(classification_report(Y_test, pred_svc, target_names=ds.target_names))

             precision    recall  f1-score   support

          0       0.78      0.78      0.78       168
          1       0.77      0.77      0.77       162

avg / total       0.78      0.78      0.78       330

