## Document classification

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
import magic

In [3]:
import os

In [4]:
filename=[]
classes=[]
filetype=[]
magic_type=[]

In [5]:
file_support=['.py','.pdf','.ics','.html','.ics','.vcs','.xlsx','.xls','.doc','.docx','.txt','.pptx','.ppt']

In [6]:
for root, directories, filenames in os.walk('./data/'):
     for file in filenames:
            if file != ".DS_Store":
                file_name, file_extension = os.path.splitext(file)
                filename.append("{}/{}".format(root,file))
                classes.append(root.split("/")[-1])
                filetype.append(file_extension) 
                magic_type.append(magic.from_file("{}/{}".format(root,file), mime=True))

In [7]:
df=pd.DataFrame()

In [8]:
df['file']=filename
df['type']=filetype
df['magic_type']=magic_type
df['classes']=classes

In [9]:
df.magic_type.unique()

array(['message/news', 'text/plain', 'text/x-fortran', 'message/rfc822',
       'application/octet-stream', 'text/x-c', 'text/x-makefile',
       'text/x-c++', 'text/x-lisp', 'application/pdf', 'text/calendar',
       'text/x-python', 'application/zip', 'application/vnd.ms-excel',
       'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
       'inode/x-empty', 'application/msword', 'application/x-bzip2',
       'application/vnd.ms-powerpoint',
       'application/vnd.openxmlformats-officedocument.presentationml.presentation',
       'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
       'text/html', 'video/webm'], dtype=object)

In [10]:
df.head()

Unnamed: 0,file,type,magic_type,classes
0,./data/20_newsgroup/alt.atheism/53341,,message/news,alt.atheism
1,./data/20_newsgroup/alt.atheism/49960,,message/news,alt.atheism
2,./data/20_newsgroup/alt.atheism/51060,,message/news,alt.atheism
3,./data/20_newsgroup/alt.atheism/51119,,text/plain,alt.atheism
4,./data/20_newsgroup/alt.atheism/51120,,message/news,alt.atheism


In [11]:
df[df['magic_type']=='application/vnd.openxmlformats-officedocument.presentationml.presentation']['file'].values

array(['./data/Presentations/renepal-infosession.pptx',
       './data/Presentations/Search.pptx'], dtype=object)

## PDF parser

In [12]:
from PyPDF2 import PdfFileWriter, PdfFileReader

In [13]:
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from io import StringIO

def convert_pdf_to_txt(path):
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    codec = 'utf-8'
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    fp = open(path, 'rb')
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    password = ""
    maxpages = 0
    caching = True
    pagenos=set()

    for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages, password=password,caching=caching, check_extractable=True):
        interpreter.process_page(page)

    text = retstr.getvalue()

    fp.close()
    device.close()
    retstr.close()
    return text

In [14]:
txt=convert_pdf_to_txt(df['file'][0])

PDFSyntaxError: No /Root object! - Is this really a PDF?

## Docx parser

In [None]:
import docx2txt
my_text = docx2txt.process(df[df['type']=='.docx']['file'].values[0])
print(my_text)

## Doc parser

In [None]:
import textract
txt_doc = textract.process(df[df['type']=='.doc']['file'].values[0], method='antiword')

In [None]:
txt_doc

## PPTX parser

In [None]:
txt_ppt = textract.process(df[df['type']=='.pptx']['file'].values[0], method='python-pptx')

In [None]:
txt_ppt

## PPT parser

## Classification

In [4]:
from sklearn.datasets import fetch_20newsgroups
newsgroups_train = fetch_20newsgroups(subset='train')

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
import collections
from tqdm import tqdm
import re
import os

In [6]:
newsgroups_train.target[0]

7

In [7]:
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(newsgroups_train.data)
vectors.shape

(11314, 130107)

In [8]:
vectors.nnz / float(vectors.shape[0])

157.9958458546933

In [9]:
newsgroups_test = fetch_20newsgroups(subset='test')

In [10]:
vectors_test = vectorizer.transform(newsgroups_test.data)
clf = MultinomialNB(alpha=.01)
clf.fit(vectors, newsgroups_train.target)
pred = clf.predict(vectors_test)
metrics.f1_score(newsgroups_test.target, pred, average='macro')

0.82906596444740432

In [11]:
metrics.accuracy_score(newsgroups_test.target, pred)

0.83523632501327671

In [12]:
# Create a text cleaning function
def clean_text(text_string):
    text_string = re.sub(r'([^\s\w]|_|[0-9])+', '', str(text_string))
    #text_string = " ".join(text_string.split())
    text_string = text_string.lower()
    return(text_string)

In [13]:
def tokenizer(text):
    text = [document.lower().replace('\n', '').split() for document in text]
    return text

In [14]:
text_data_train=[]
for data in newsgroups_train.data:
    text_data_train.append(clean_text(data))

In [15]:
sentences = ' '.join(text_data_train)

In [16]:
text_data_test=[]
for data in newsgroups_test.data:
    text_data_test.append(clean_text(data))

In [20]:
len(text_data_test)

7532

In [17]:
vectors = vectorizer.fit_transform(text_data_train)
vectors.shape

(11314, 118020)

In [18]:
vectors_test = vectorizer.transform(text_data_test)
clf = MultinomialNB(alpha=.01)
clf.fit(vectors, newsgroups_train.target)
pred = clf.predict(vectors_test)
metrics.f1_score(newsgroups_test.target, pred, average='macro')

0.83871602964644809

In [19]:
metrics.accuracy_score(newsgroups_test.target, pred)

0.84599044078597985

## Doc2Vec

In [21]:
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from os import path
from random import shuffle

Using TensorFlow backend.


In [22]:
from nltk.tokenize import word_tokenize

In [23]:
google_news_word2vec_model_location = 'data/GoogleNews-vectors-negative300.bin.gz'
doc2vec_model_location = 'model/doc2vec-model.bin'
doc2vec_vectors_location = 'model/doc2vec-vectors.bin'
doc2vec_dimensions = 300
classifier_model_location = 'model/classifier-model.bin'

In [106]:
doc=[]
filename=[]
classes=[]
filetype=[]
magic_type=[]

In [107]:
for root, directories, filenames in os.walk('./20_newsgroup/'):
     for file in filenames:
            if file != ".DS_Store":
                file_name, file_extension = os.path.splitext(file)
                filename.append("{}/{}".format(root,file))
                classes.append(root.split("/")[-1])
                # filetype.append(file_extension) 
                magic_type.append(magic.from_file("{}/{}".format(root,file), mime=True))
                file1 = open("{}/{}".format(root,file),"r",encoding='utf-8', errors='ignore')
                doc.append(file1.read())

In [108]:
len(classes)

13736

In [109]:
len(doc)

13736

In [110]:
df=pd.DataFrame()
df['doc']=doc
df['classes']=classes
df['file']=filename
#df['type']=filetype
df['magic_type']=magic_type

In [111]:
df.head()

Unnamed: 0,doc,classes,file,magic_type
0,Xref: cantaloupe.srv.cs.cmu.edu talk.abortion:...,alt.atheism,./20_newsgroup/alt.atheism/53341,message/news
1,Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:49...,alt.atheism,./20_newsgroup/alt.atheism/49960,message/news
2,Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:51...,alt.atheism,./20_newsgroup/alt.atheism/51060,message/news
3,Newsgroups: alt.atheism\nPath: cantaloupe.srv....,alt.atheism,./20_newsgroup/alt.atheism/51119,text/plain
4,Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:51...,alt.atheism,./20_newsgroup/alt.atheism/51120,message/news


In [112]:
all_data=df['doc'].values
len(all_data)

13736

In [113]:
# Load the reuters news articles and convert them to TaggedDocuments
taggedDocuments = [TaggedDocument(words=word_tokenize(fileId), tags=[i]) for i, fileId in enumerate(all_data)]
shuffle(taggedDocuments)

In [114]:
taggedDocuments[0]

TaggedDocument(words=['Xref', ':', 'cantaloupe.srv.cs.cmu.edu', 'comp.multimedia:6543', 'comp.graphics:38390', 'Path', ':', 'cantaloupe.srv.cs.cmu.edu', '!', 'das-news.harvard.edu', '!', 'ogicse', '!', 'usenet.ee.pdx.edu', '!', 'pdxgate', '!', 'rigel', '!', 'idr', 'From', ':', 'idr', '@', 'rigel.cs.pdx.edu', '(', 'Ian', 'D', 'Romanick', ')', 'Newsgroups', ':', 'comp.multimedia', ',', 'comp.graphics', 'Subject', ':', 'Re', ':', 'Rumours', 'about', '3DO', '?', '?', '?', 'Message-ID', ':', '<', '7272', '@', 'pdxgate.UUCP', '>', 'Date', ':', '17', 'Apr', '93', '20:54:46', 'GMT', 'Article-I.D', '.', ':', 'pdxgate.7272', 'References', ':', '<', '1993Mar31.074502.3590', '@', 'aragorn.unibe.ch', '>', '<', '1993Apr15.143444.32980', '@', 'rchland.ibm.com', '>', '<', '1993Apr15.143444.32980', '@', 'rchland.ibm.com', '>', '<', '1993Apr15.164940.11632', '@', 'mercury.unt.edu', '>', 'Sender', ':', 'news', '@', 'pdxgate.UUCP', 'Organization', ':', 'Portland', 'State', 'University', ',', 'Computer', '

In [115]:
# Create and train the doc2vec model
doc2vec = Doc2Vec(size=doc2vec_dimensions, min_count=2, iter=10, workers=12)

In [116]:
# Build the word2vec model from the corpus
doc2vec.build_vocab(taggedDocuments)

In [117]:
doc2vec.train(taggedDocuments,total_examples=doc2vec.corpus_count,epochs=doc2vec.iter)

39235530

In [84]:
doc2vec.save("doc2vec_new")

In [85]:
doc2vec_model_location = 'doc2vec_new'
doc2vec_dimensions = 300

doc2vec1 = Doc2Vec.load(doc2vec_model_location)

In [86]:
classifier_model_location = 'classifier'

In [87]:
from sklearn.preprocessing.label import MultiLabelBinarizer

In [89]:
df.head()

Unnamed: 0,doc,target
0,\n\nI am sure some bashers of Pens fans are pr...,10
1,My brother is in the market for a high-perform...,3
2,\n\n\n\n\tFinally you said what you dream abou...,17
3,\nThink!\n\nIt's the SCSI card doing the DMA t...,3
4,1) I have an old Jasmine drive which I cann...,4


In [90]:
y=pd.get_dummies(df['classes'].values)

In [91]:
X_train, X_test, y_train, y_test = train_test_split(df['doc'].values,y.values , test_size=0.33, random_state=42)

In [92]:
# Convert the categories to one hot encoded categories
labelBinarizer = MultiLabelBinarizer()
labelBinarizer.fit([df['classes'].values])

MultiLabelBinarizer(classes=None, sparse_output=False)

In [93]:
# Convert the articles to document vectors using the doc2vec model
train_data = [doc2vec1.infer_vector(word_tokenize(article)) for article in X_train]
test_data = [doc2vec1.infer_vector(word_tokenize(article)) for article in X_test]

In [94]:
train_data, test_data, train_labels, test_labels = np.asarray(train_data), np.asarray(test_data), np.asarray(y_train), np.asarray(y_test)

In [95]:
from keras.callbacks import ModelCheckpoint
from keras.layers import Dense, Dropout
from keras.models import Sequential
from keras.optimizers import Adam

In [96]:
# Initialize the neural network
model = Sequential()
model.add(Dense(input_dim=doc2vec_dimensions, output_dim=500, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(output_dim=1200, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(output_dim=400, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(output_dim=600, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(output_dim=train_labels.shape[1], activation='sigmoid'))
model.compile(optimizer=Adam(), loss='categorical_crossentropy', metrics=['accuracy'])

  app.launch_new_instance()


In [97]:
# Saves the model with highest score after each training cycle
checkpointer = ModelCheckpoint(filepath=classifier_model_location, verbose=1, save_best_only=True)

# Train the neural network
model.fit(train_data, train_labels, validation_data=(test_data, test_labels), batch_size=32, nb_epoch=15, callbacks=[checkpointer])



Train on 12626 samples, validate on 6220 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x7fe5bca11908>