In [0]:
#start tika server. The Tika Server is the Parser
#java -jar "path\to\tika-server-1.22.jar"

# 1. Reading Texts and Document Cleanup

In [0]:
#import necessary modules
import tika
tika.initVM()
from tika import parser
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle
import pytesseract
pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files (x86)\Tesseract-OCR\tesseract.exe"
from pdf2image import convert_from_path, convert_from_bytes
from pdf2image.exceptions import (
    PDFInfoNotInstalledError,
    PDFPageCountError,
    PDFSyntaxError)
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.feature_selection import chi2
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB

In [0]:
#define a parameter for tika parsers. This declaration solves the status 422 server error
headers = {'X-Tika-PDFextractInlineImages': 'true', "X-Tika-OCRLanguage": "eng"} 

In [0]:
#Import subject area dataframe
subject_df = pd.read_csv(r"parsed-subject-training-data-5.csv", encoding = "ISO-8859-1")

In [0]:
#Inspect dataframe shape
subject_df.shape

(789, 5)

In [0]:
#parse text from scanned files
def ocr_pdf(file):
    images = convert_from_path(file)
    ocr_list = [pytesseract.image_to_string(x) for x in images]
    ocr = ''
    return ocr.join(ocr_list)

In [0]:
#Run Tika Parser on Texts
def return_parsed(paths):
    try:
        return parser.from_file(paths, headers=headers)
    except:
        return 'path error'

In [0]:
#Extract Texts from Parsed Documents or OCR Documents
def return_texts(parsed, paths):
    if 'content' in parsed and parsed['content'] is not None:
        return parsed['content'] #extract 'content' from parsed texts
    else:
        try:
            return ocr_pdf(paths) #if no 'content' from tika parser, try OCRing the document
        except:
            return "no content"   

In [0]:
#Function to remove whitespaces from Text
def remove_whitespace(text):
    return text.strip()

In [0]:
#Function to remove whitespaces between comma delimeters
def remove_comma_space(text):
    try:
        return text.replace(", ", ",")  
    except:
        return text

In [0]:
#Parse Texts
subject_df = dd.from_pandas(subject_df, npartitions=5)
parsed = subject_df.apply(lambda row: return_parsed(row['Path']), axis = 1).compute()
subject_df['Parsed'] = parsed 

In [0]:
#Extract Texts
texts =  subject_df.apply(lambda row: return_texts(row['Parsed'], row['Path']), axis = 1).compute()
subject_df['Texts'] = texts
subject_df = subject_df.compute()

In [0]:
#Clean Subject_Area
Subject_Area =  subject_df.apply(lambda row: remove_comma_space(row['Subject_Area']), axis = 1)
subject_df['Subject_Area'] = Subject_Area
Subject_Area =  subject_df.apply(lambda row: remove_whitespace(row['Subject_Area']), axis = 1)
subject_df['Subject_Area'] = Subject_Area

In [0]:
#Drop rows with nan or empty values
isnan = subject_df[subject_df['Subject_Area'].isna() == True].index
subject_df.drop(isnan, inplace=True)

text_isnan = subject_df[subject_df['Texts'].isna() == True].index
subject_df.drop(text_isnan, inplace=True)

text_isempty = subject_df[subject_df['Subject_Area'] == ''].index
subject_df.drop(text_isempty, inplace=True)

no_content = subject_df[subject_df['Texts'] == 'no content'].index
subject_df.drop(no_content, inplace=True)
subject_df.head()

In [0]:
#Save Parsed Data
subject_df.to_csv(r"parsed-subject-training-data-6.csv")

# 2. Model Training and Testing

In [0]:
#Create Vectorizer Object
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', encoding='latin-1', ngram_range=(1, 2), stop_words='english')

In [0]:
#Set y Values as list of sets
y = [set(i.split(',')) for i in subject_df.Subject_Area]

In [0]:
#Binarize y
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(y)

In [0]:
#Check that the classes values are what is expected
mlb.classes_

array(['Contaminated Sites', 'Fish and Aquatic', 'Heritage',
       'Pollution Prevention', 'Vegetation', 'Wildlife'], dtype=object)

In [0]:
#Define X value
X = subject_df.Texts

In [0]:
#Split data into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=42)

In [0]:
#Vectorize X_train and X_test
X_train = tfidf.fit_transform(X_train).toarray()
X_test = tfidf.transform(X_test).toarray()

In [0]:
#Function to Train Classifier
def train_classifier(X_train, y_train, X_valid=None, y_valid=None, C=1.0, model='lr'):
   
    if model=='lr':
        model = LogisticRegression(C=C, penalty='l1', dual=False, solver='liblinear')
        model = OneVsRestClassifier(model)
        model.fit(X_train, y_train)
    
    elif model=='svm':
        model = LinearSVC(C=8,  dual=False, loss='squared_hinge', class_weight='balanced')
        model = OneVsRestClassifier(model)
        model.fit(X_train, y_train)
    
    elif model=='nbayes':
        model = MultinomialNB(alpha=1.0)
        model = OneVsRestClassifier(model)
        model.fit(X_train, y_train)
        
    elif model=='mlp':
        model = MLPClassifier(alpha=1.0)
        model = OneVsRestClassifier(model)
        model.fit(X_train, y_train)

    return model

In [0]:
#Train Classifier
clf = train_classifier(X, y, model = 'svm')

In [0]:
#Check Model Score
print(clf.score(X_test, y_test))

0.7721518987341772


In [0]:
#Inspect Classifier Output
y_test_predicted_labels_tfidf = clf.predict(X_test)
y_test_pred_inversed = mlb.inverse_transform(y_test_predicted_labels_tfidf)
y_test_inversed = mlb.inverse_transform(y_test)
for i in range(148):
    print('Title:\t{}\nTrue labels:\t{}\nPredicted labels:\t{}\n\n'.format(
        X_test[i],
        ','.join(y_test_inversed[i]),
        ','.join(y_test_pred_inversed[i])
    ))

Title:	[0. 0. 0. ... 0. 0. 0.]
True labels:	Heritage
Predicted labels:	Heritage


Title:	[0.         0.         0.         ... 0.03358865 0.         0.        ]
True labels:	Pollution Prevention
Predicted labels:	Pollution Prevention


Title:	[0. 0. 0. ... 0. 0. 0.]
True labels:	Wildlife
Predicted labels:	Fish and Aquatic


Title:	[0.02712889 0.         0.         ... 0.         0.         0.        ]
True labels:	Fish and Aquatic
Predicted labels:	Fish and Aquatic


Title:	[0. 0. 0. ... 0. 0. 0.]
True labels:	Vegetation
Predicted labels:	Vegetation


Title:	[0. 0. 0. ... 0. 0. 0.]
True labels:	Wildlife
Predicted labels:	Wildlife


Title:	[0.02933709 0.01328512 0.         ... 0.         0.         0.        ]
True labels:	Fish and Aquatic,Heritage,Vegetation,Wildlife
Predicted labels:	Fish and Aquatic,Heritage,Vegetation,Wildlife


Title:	[0. 0. 0. ... 0. 0. 0.]
True labels:	Fish and Aquatic
Predicted labels:	Fish and Aquatic


Title:	[0.02357608 0.         0.         ... 0.         0.


Title:	[0. 0. 0. ... 0. 0. 0.]
True labels:	Wildlife
Predicted labels:	Wildlife


Title:	[0.00753801 0.         0.         ... 0.         0.         0.        ]
True labels:	Contaminated Sites,Pollution Prevention
Predicted labels:	Contaminated Sites,Pollution Prevention


Title:	[0.02471676 0.02694296 0.         ... 0.         0.         0.        ]
True labels:	Contaminated Sites,Pollution Prevention
Predicted labels:	Contaminated Sites


Title:	[0. 0. 0. ... 0. 0. 0.]
True labels:	Pollution Prevention
Predicted labels:	Pollution Prevention


Title:	[0.         0.         0.         ... 0.09894141 0.04168371 0.        ]
True labels:	Fish and Aquatic
Predicted labels:	Fish and Aquatic


Title:	[0.00983701 0.         0.         ... 0.         0.         0.        ]
True labels:	Pollution Prevention
Predicted labels:	Contaminated Sites,Pollution Prevention


Title:	[0. 0. 0. ... 0. 0. 0.]
True labels:	Fish and Aquatic,Pollution Prevention,Wildlife
Predicted labels:	Fish and Aquatic


T


Title:	[0. 0. 0. ... 0. 0. 0.]
True labels:	Pollution Prevention
Predicted labels:	Contaminated Sites,Pollution Prevention


Title:	[0. 0. 0. ... 0. 0. 0.]
True labels:	Heritage
Predicted labels:	Heritage


Title:	[0. 0. 0. ... 0. 0. 0.]
True labels:	Wildlife
Predicted labels:	


Title:	[0.01151724 0.         0.         ... 0.         0.         0.        ]
True labels:	Heritage
Predicted labels:	Heritage


Title:	[0.01723434 0.         0.         ... 0.         0.         0.        ]
True labels:	Fish and Aquatic
Predicted labels:	Fish and Aquatic


Title:	[0. 0. 0. ... 0. 0. 0.]
True labels:	Fish and Aquatic
Predicted labels:	Fish and Aquatic


Title:	[0. 0. 0. ... 0. 0. 0.]
True labels:	Fish and Aquatic
Predicted labels:	Fish and Aquatic


Title:	[0. 0. 0. ... 0. 0. 0.]
True labels:	Wildlife
Predicted labels:	Fish and Aquatic,Wildlife


Title:	[0.0622622 0.        0.        ... 0.        0.        0.       ]
True labels:	Fish and Aquatic
Predicted labels:	Fish and Aquatic


Title:	

In [0]:
#Predict the labels of the test data: y_pred
y_pred = clf.predict(X_test)

#Print Classification Report
print(classification_report(y_test, y_pred, target_names = mlb.classes_))

                      precision    recall  f1-score   support

  Contaminated Sites       0.86      0.57      0.69        21
    Fish and Aquatic       0.93      0.94      0.93        79
            Heritage       1.00      0.93      0.96        28
Pollution Prevention       0.78      0.88      0.82        32
          Vegetation       1.00      0.67      0.80        12
            Wildlife       0.94      0.62      0.75        24

           micro avg       0.91      0.83      0.87       196
           macro avg       0.92      0.77      0.83       196
        weighted avg       0.91      0.83      0.86       196
         samples avg       0.88      0.86      0.86       196



In [0]:
#Redefine X to include all available data
X = tfidf.fit_transform(subject_df.Texts).toarray()

In [0]:
#Re-train classifier using all available data
clf = train_classifier(X, y, model = 'svm')

In [0]:
#save trained model
model_save = r"subject-model-2.sav"
pickle.dump(clf, open(model_save, 'wb'))

In [0]:
#Open saved trained model and test
loaded_model = pickle.load(open(model_save, 'rb'))
accuracy = loaded_model.score(X_test, y_test)
print(accuracy)