# Single Layer Perceptron

In [1]:
import numpy as np
from sklearn import metrics

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
# single layer perceptron
from sklearn.linear_model import Perceptron
# multilayer perceptron
from sklearn.neural_network import MLPClassifier
# handle  csv read and write
import lib.handle_csv as csvh
# library for connecting to the db
# managing files and file paths
from pathlib import Path

### Get test and training data

In [2]:
def getTrainTestData(pdf_data_search_dir, in_name):
    """Load labelled mentions from a CSV file and return tf-idf train/test splits.

    Parameters
    ----------
    pdf_data_search_dir : str or path-like
        Directory that holds the CSV file.
    in_name : str
        File name without the '.csv' extension.

    Returns
    -------
    tuple
        (train_features, test_features, train_targets, test_targets, vectorizer)
        where the feature entries are the tf-idf transforms of the 'desc'
        texts, the targets are the 'DataStatement' values cast to int, and
        vectorizer is the TfidfVectorizer fitted on the training texts.
    """
    csv_path = Path(pdf_data_search_dir, in_name + '.csv')
    # NOTE(review): get_csv_data is indexed here as a mapping of
    # row key -> row dict; the returned headers are not needed.
    data_mentions, _ = csvh.get_csv_data(csv_path)

    # Collect text and label together, row by row.
    labelled = [
        (data_mentions[row_key]['desc'], int(data_mentions[row_key]['DataStatement']))
        for row_key in data_mentions
    ]
    corpus = [text for text, _label in labelled]
    targets = [label for _text, label in labelled]

    # Hold out 20% of the rows; the fixed seed keeps the split repeatable.
    train_docs, test_docs, train_targets, test_targets = train_test_split(
        corpus, targets, test_size=0.2, random_state=123
    )

    # Fit the tf-idf vocabulary on the training texts only, then apply the
    # same fitted vectorizer to the held-out texts.
    vectorizer = TfidfVectorizer(stop_words='english', lowercase=True, norm='l1')
    train_features = vectorizer.fit_transform(train_docs)
    test_features = vectorizer.transform(test_docs)

    return train_features, test_features, train_targets, test_targets, vectorizer

### Build the single layer perceptron

In [3]:
def buildPerceptron(train_features, train_targets):
    """Create a single layer Perceptron and fit it to the training data.

    Parameters
    ----------
    train_features
        Feature matrix passed to ``Perceptron.fit``.
    train_targets
        Target labels passed to ``Perceptron.fit``.

    Returns
    -------
    Perceptron
        The fitted classifier (built with a fixed random_state of 457).
    """
    # scikit-learn estimators return themselves from fit(), so the fitted
    # model can be handed back in one expression.
    return Perceptron(random_state=457).fit(train_features, train_targets)

#### Train and test the single layer perceptron (SLP)

In [24]:
def buildAndTrainSLP(train_features, train_targets, test_features=None, test_targets=None):
    """Train the single layer perceptron and print its accuracy on held-out data.

    Parameters
    ----------
    train_features
        Feature matrix used to fit the perceptron.
    train_targets
        Target labels used to fit the perceptron.
    test_features : optional
        Held-out feature matrix used for scoring. When omitted, the
        notebook-level variable ``test_features`` is used, which is what
        this function previously always did implicitly.
    test_targets : optional
        Held-out labels used for scoring. When omitted, the notebook-level
        variable ``test_targets`` is used.

    Returns
    -------
    The fitted classifier returned by ``buildPerceptron``.

    Raises
    ------
    NameError
        If a test split is neither passed in nor defined at notebook level.
    """
    slp_classifier = buildPerceptron(train_features, train_targets)

    # Prefer explicitly supplied evaluation data; fall back to the
    # notebook-level variables so existing two-argument calls keep working.
    if test_features is None or test_targets is None:
        notebook_vars = globals()
        try:
            if test_features is None:
                test_features = notebook_vars['test_features']
            if test_targets is None:
                test_targets = notebook_vars['test_targets']
        except KeyError as missing:
            raise NameError(
                "name {} is not defined; pass test_features and test_targets "
                "explicitly or run the data-loading cell first".format(missing)
            ) from None

    predictions = slp_classifier.predict(test_features)

    score = np.round(metrics.accuracy_score(test_targets, predictions), 2)

    print("SLP Mean accuracy of predictions: "+ str(score))
    return slp_classifier

### Build and test Multilayer Perceptron

In [36]:
def buildMLPerceptron(train_features, test_features, train_targets, test_targets, num_neurons = 2):
    """Fit a multilayer perceptron and print its accuracy on the test split.

    Parameters
    ----------
    train_features, train_targets
        Data used to fit the classifier.
    test_features, test_targets
        Held-out data used to compute the printed accuracy.
    num_neurons : default 2
        Passed straight through as ``hidden_layer_sizes``.

    Returns
    -------
    MLPClassifier
        The fitted classifier.
    """
    mlp_settings = dict(
        hidden_layer_sizes=num_neurons,
        max_iter=100,
        activation='relu',            # activation function: ReLU
        solver='sgd',                 # optimiser: stochastic gradient descent
        verbose=10,                   # print the loss for every iteration
        random_state=762,
        learning_rate='invscaling',   # learning rate schedule: inverse scaling
    )
    mlp = MLPClassifier(**mlp_settings)
    mlp.fit(train_features, train_targets)

    # Accuracy on the held-out split, rounded to two decimals for display.
    accuracy = metrics.accuracy_score(test_targets, mlp.predict(test_features))
    print("MLP Mean accuracy of predictions: "+ str(np.round(accuracy, 2)))

    return mlp


### Build classifier models using prepared dataset

In [37]:
# Load the prepared dataset and fit both classifiers.

# working directory that holds the prepared CSV
pdf_data_search_dir = "./data_search_pdf_b"

# name (without the .csv extension) of the file containing train and test data
train_test_file =  'test_train01_c'

# vectorizer is kept so later cells can transform new text the same way
(train_features,
 test_features,
 train_targets,
 test_targets,
 vectorizer) = getTrainTestData(pdf_data_search_dir, train_test_file)

# single layer perceptron; its accuracy is scored against the
# test_features / test_targets defined just above
slp_classifier = buildAndTrainSLP(train_features, train_targets)

# multilayer perceptron
mlp_classifier = buildMLPerceptron(
    train_features, test_features, train_targets, test_targets, num_neurons=2
)

SLP Mean accuracy of predictions: 0.91
Iteration 1, loss = 1.23456292
Iteration 2, loss = 1.21764512
Iteration 3, loss = 1.20916983
Iteration 4, loss = 1.20550058
Iteration 5, loss = 1.20378328
Iteration 6, loss = 1.20287882
Iteration 7, loss = 1.20229993
Iteration 8, loss = 1.20187387
Iteration 9, loss = 1.20152206
Iteration 10, loss = 1.20120911
Iteration 11, loss = 1.20092391
Iteration 12, loss = 1.20065496
Iteration 13, loss = 1.20040020
Iteration 14, loss = 1.20015939
Iteration 15, loss = 1.19992610
Iteration 16, loss = 1.19970475
Iteration 17, loss = 1.19948813
Iteration 18, loss = 1.19928119
Iteration 19, loss = 1.19907924
Iteration 20, loss = 1.19888476
Iteration 21, loss = 1.19869460
Iteration 22, loss = 1.19850911
Iteration 23, loss = 1.19832928
Iteration 24, loss = 1.19815462
Iteration 25, loss = 1.19798167
Iteration 26, loss = 1.19781320
Iteration 27, loss = 1.19764875
Iteration 28, loss = 1.19748713
Iteration 29, loss = 1.19732863
Iteration 30, loss = 1.19717234
Iteration 

### Use the generated models to predict on unseen data

In [38]:
# Get the evaluation data, predict with both models and write the results.
pdf_data_search_dir = "./data_search_pdf_b"

# numeric range encoded (zero-padded to 4 digits) in the evaluation file name
start_from = 601
stop_at = 700

eval_file = "pdf_mentions_" + str(start_from).zfill(4) + "_" + str(stop_at).zfill(4)

# the results are written next to the input, with a "_pre_res" suffix
res_file = eval_file + "_pre_res"

eval_mentions, eval_headers = csvh.get_csv_data(Path(pdf_data_search_dir, eval_file+'.csv'))

eval_data = [eval_mentions[dic]['desc'] for dic in eval_mentions]

# reuse the vectorizer fitted on the training data so the evaluation
# features share the vocabulary the classifiers were trained on
eval_features = vectorizer.transform(eval_data)
eval_prediction_slp = slp_classifier.predict(eval_features)
eval_prediction_mlp = mlp_classifier.predict(eval_features)

current_pub = 0
previous_key = None  # key of the row visited in the previous iteration
for data, slp_pred, mlp_pred in zip(eval_mentions, eval_prediction_slp, eval_prediction_mlp):
    row = eval_mentions[data]
    row['slp_pred'] = slp_pred
    row['mlp_pred'] = mlp_pred
    row['true_val'] = 0 # placeholder for manual evaluation
    # Mark the first and last rows of every publication as new candidates
    # regardless of their predicted value.
    if previous_key is None or row['id'] != current_pub:
        # first row of a publication
        row['add'] = 1
        if previous_key is not None:
            # The row visited just before is the last row of the previous
            # publication. Tracking the previous key avoids assuming the
            # keys are consecutive integers.
            eval_mentions[previous_key]['add'] = 1
        current_pub = row['id']
    previous_key = data

# The final publication has no following row to trigger the boundary check,
# so its last row is marked explicitly here.
if previous_key is not None:
    eval_mentions[previous_key]['add'] = 1

csvh.write_csv_data(eval_mentions, Path(pdf_data_search_dir,  res_file +'.csv'))
    
