# Single Layer Perceptron

In [7]:
import numpy as np
from sklearn import metrics

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
# single layer perceptron
from sklearn.linear_model import Perceptron
# multilayer perceptron
from sklearn.neural_network import MLPClassifier
# handle  csv read and write
import lib.handle_csv as csvh
# library for connecting to the db
# managing files and file paths
from pathlib import Path

### Get test and training data

In [8]:
def getTrainTestData(pdf_data_search_dir, in_name):
    """Load labelled mentions from a CSV file and return TF-IDF train/test sets.

    Reads ``<pdf_data_search_dir>/<in_name>.csv`` through ``csvh.get_csv_data``,
    uses each record's ``'desc'`` value as the text and its ``'DataStatement'``
    value (converted with ``int``) as the label, splits the records 80/20 with
    a fixed seed, and vectorises the texts with TF-IDF.

    Parameters
    ----------
    pdf_data_search_dir : str or path-like
        Directory containing the CSV file.
    in_name : str
        File name without the ``.csv`` extension.

    Returns
    -------
    tuple
        ``(train_features, test_features, train_targets, test_targets,
        vectorizer)`` where the feature entries are the TF-IDF transforms of
        the split texts and ``vectorizer`` is the fitted ``TfidfVectorizer``.
    """
    csv_path = Path(pdf_data_search_dir, in_name + '.csv')
    # NOTE(review): the returned collection is iterated and indexed by the
    # iterated items, so it is presumably a mapping of row key -> record;
    # confirm against lib.handle_csv.
    mentions, _headers = csvh.get_csv_data(csv_path)

    # One pass over the records: (text, label) per row, in file order.
    labelled_rows = [
        (mentions[row_key]['desc'], int(mentions[row_key]['DataStatement']))
        for row_key in mentions
    ]
    texts = [text for text, _label in labelled_rows]
    labels = [label for _text, label in labelled_rows]

    # Hold out 20% of the rows for testing; the seed keeps the split repeatable.
    train_texts, test_texts, train_targets, test_targets = train_test_split(
        texts, labels, test_size=0.2, random_state=123
    )

    # Fit the vocabulary on the training texts only, then reuse it for the
    # test texts so both share the same feature space.
    vectorizer = TfidfVectorizer(stop_words='english', lowercase=True, norm='l1')
    train_features = vectorizer.fit_transform(train_texts)
    test_features = vectorizer.transform(test_texts)

    return train_features, test_features, train_targets, test_targets, vectorizer

### Build the single layer perceptron

In [9]:
def buildPerceptron(train_features, train_targets):
    """Create a single layer perceptron and fit it to the training data.

    Parameters
    ----------
    train_features
        Feature matrix passed to ``Perceptron.fit``.
    train_targets
        Target labels aligned with ``train_features``.

    Returns
    -------
    Perceptron
        The fitted classifier (seeded with ``random_state=457``).
    """
    # ``fit`` returns the estimator itself, so construction and training chain.
    return Perceptron(random_state=457).fit(train_features, train_targets)

### Train and test perceptron

In [10]:
def buildAndTrainSLP(train_features, train_targets, test_features=None, test_targets=None):
    """Train a single layer perceptron and report its accuracy on a test set.

    Parameters
    ----------
    train_features
        Feature matrix used to fit the perceptron.
    train_targets
        Labels aligned with ``train_features``.
    test_features : optional
        Feature matrix used for evaluation. When omitted, the notebook-level
        variable ``test_features`` is used (backward-compatible behaviour).
    test_targets : optional
        Labels aligned with ``test_features``. When omitted, the notebook-level
        variable ``test_targets`` is used (backward-compatible behaviour).

    Returns
    -------
    The fitted classifier returned by ``buildPerceptron``.

    Raises
    ------
    NameError
        If the evaluation data is neither passed in nor defined at notebook
        level.
    """
    slp_classifier = buildPerceptron(train_features, train_targets)

    # Earlier versions read the evaluation data straight from notebook globals.
    # Prefer explicit arguments, but keep the old lookup as a fallback so
    # existing two-argument calls continue to work.
    if test_features is None:
        test_features = globals().get('test_features')
    if test_targets is None:
        test_targets = globals().get('test_targets')
    if test_features is None or test_targets is None:
        raise NameError(
            "test_features/test_targets must be passed in or defined at notebook level"
        )

    predictions = slp_classifier.predict(test_features)

    score = np.round(metrics.accuracy_score(test_targets, predictions), 2)

    print("Mean accuracy of predictions: " + str(score))
    return slp_classifier

### Use the generated model to predict on unseen data

In [53]:
# get train and test data

# working directory (also holds the evaluation file and the results file)
pdf_data_search_dir = "./data_search_pdf_b"

# file containing train and test data
train_test_file =  'test_train01_a'

train_features, test_features, train_targets, test_targets, vectorizer = getTrainTestData(pdf_data_search_dir, train_test_file)


# build and train model
slp_classifier = buildAndTrainSLP(train_features, train_targets)

# get evaluation data
start_from = 201
stop_at = 250

# evaluation file name encodes the zero-padded range, e.g. pdf_mentions_0201_0250
eval_file = "pdf_mentions_" + str(start_from).zfill(4) + "_" + str(stop_at).zfill(4)

res_file = eval_file + "_pre_res"

eval_mentions, eval_headers = csvh.get_csv_data(Path(pdf_data_search_dir, eval_file+'.csv'))

eval_data = [eval_mentions[row_key]['desc'] for row_key in eval_mentions]

# reuse the vectorizer fitted on the training data so features line up
eval_features = vectorizer.transform(eval_data)
eval_prediction = slp_classifier.predict(eval_features)

current_pub = None
previous_key = None  # key of the row handled in the previous iteration
for row_key, slp_pred in zip(eval_mentions, eval_prediction):
    row = eval_mentions[row_key]
    row['predicted'] = slp_pred
    row['true_val'] = 0 # placeholder for manual evaluation
    # mark the first and last lines of each publication as new candidates
    # regardless of their predicted value
    if previous_key is None or row['id'] != current_pub:
        row['add'] = 1
        if previous_key is not None:
            # the previous row was the last line of the previous publication;
            # tracked by key so non-contiguous or non-integer keys also work
            eval_mentions[previous_key]['add'] = 1
        current_pub = row['id']
    previous_key = row_key

# the last line of the final publication has no following row to trigger the
# boundary check above, so mark it here
if previous_key is not None:
    eval_mentions[previous_key]['add'] = 1

csvh.write_csv_data(eval_mentions, Path(pdf_data_search_dir,  res_file +'.csv'))
    


Mean accuracy of predictions: 0.85


# Multilayer Perceptron

The MLP uses the same training and test sets.

In [54]:
def buildMLPerceptron(train_features, test_features, train_targets, test_targets, num_neurons = 2):
    """Create a multilayer perceptron and fit it to the training data.

    Parameters
    ----------
    train_features
        Feature matrix used to fit the classifier.
    test_features
        Accepted for signature compatibility; not used by this function.
    train_targets
        Labels aligned with ``train_features``.
    test_targets
        Accepted for signature compatibility; not used by this function.
    num_neurons : optional
        Value passed as ``hidden_layer_sizes`` (default ``2``).

    Returns
    -------
    MLPClassifier
        The fitted classifier.
    """
    mlp_settings = dict(
        hidden_layer_sizes=num_neurons,
        activation='relu',           # activation function: ReLU
        solver='sgd',                # optimiser: stochastic gradient descent
        learning_rate='invscaling',  # learning rate schedule: inverse scaling
        max_iter=35,
        random_state=762,
        verbose=10,
    )
    # ``fit`` returns the estimator itself, so construction and training chain.
    return MLPClassifier(**mlp_settings).fit(train_features, train_targets)


In [55]:
# Train the multilayer perceptron on the same split used for the single layer
# perceptron, with two neurons in the hidden layer.
mlp_classifier = buildMLPerceptron(
    train_features, test_features, train_targets, test_targets, num_neurons=2
)

# Predict the held-out test rows and score them (rounded to 5 decimals).
predictions = mlp_classifier.predict(test_features)
score = np.round(metrics.accuracy_score(test_targets, predictions), 5)

# Report: accuracy, predicted labels, true labels, and their element-wise
# difference (non-zero entries are misclassifications).
print(
    "Mean accuracy of predictions: " + str(score),
    predictions,
    test_targets,
    predictions - test_targets,
    sep="\n",
)

Iteration 1, loss = 0.94429406
Iteration 2, loss = 0.94340443
Iteration 3, loss = 0.94272826
Iteration 4, loss = 0.94222274
Iteration 5, loss = 0.94181200
Iteration 6, loss = 0.94151745
Iteration 7, loss = 0.94128288
Iteration 8, loss = 0.94108928
Iteration 9, loss = 0.94093606
Iteration 10, loss = 0.94080481
Iteration 11, loss = 0.94070191
Iteration 12, loss = 0.94061382
Iteration 13, loss = 0.94053808
Iteration 14, loss = 0.94047089
Iteration 15, loss = 0.94041199
Iteration 16, loss = 0.94035830
Iteration 17, loss = 0.94030878
Iteration 18, loss = 0.94026430
Iteration 19, loss = 0.94022032
Iteration 20, loss = 0.94018055
Iteration 21, loss = 0.94014116
Iteration 22, loss = 0.94010466
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Mean accuracy of predictions: 0.48252
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 