In [23]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
import time
import json
import os

In [5]:
def DataAggregator(path):
    """Load a folder of ``problem-<id>.txt`` / ``problem-<id>.truth`` files.

    Each ``.txt`` file supplies a document's text (newline characters are
    removed) and each ``.truth`` file is parsed as JSON, from which the
    ``changes`` value is taken. Texts and labels are matched on ``<id>``.

    Parameters
    ----------
    path : str
        Directory containing the files. A trailing separator is optional.

    Returns
    -------
    pandas.DataFrame
        Columns ``text`` and ``changes``; one row per id that has both a
        ``.txt`` and a ``.truth`` file (ids missing either one are dropped
        by the inner merge). Rows follow the sorted order of file names.

    Raises
    ------
    KeyError
        If a ``.truth`` file has no ``changes`` key.
    """
    text_by_id = {}
    changes_by_id = {}

    # Sort so the row order does not depend on the OS's directory listing order.
    for file in sorted(os.listdir(path)):
        # os.path.join works whether or not `path` ends with a separator.
        full_path = os.path.join(path, file)
        if file.endswith('.txt'):
            with open(full_path, 'r', encoding='utf-8') as myfile:
                # NOTE(review): dropping '\n' without inserting a space fuses the
                # last word of a line with the first word of the next one.
                # Kept as-is to preserve existing results; confirm it is intended.
                text_by_id[file.replace('problem-', '').replace('.txt', '')] = myfile.read().replace('\n', '')
        elif file.endswith('.truth'):
            with open(full_path, 'r', encoding='utf-8') as myfile:
                data = json.load(myfile)
                changes_by_id[file.replace('problem-', '').replace('.truth', '')] = data['changes']
        # Any other entry (stray files, sub-folders) is ignored instead of
        # being parsed as JSON.

    df = pd.DataFrame(list(text_by_id.items()), columns=['file', 'text'])
    label_df = pd.DataFrame(list(changes_by_id.items()), columns=['file', 'changes'])

    # Inner join on the id, then drop the id: callers only need text + label.
    output_df = df.merge(label_df, on='file')
    output_df = output_df.drop('file', axis=1)
    return output_df

In [6]:
#Convert the data sets to data frames
# Root folder holding the three dataset folders. Set the PAN18_DATA_DIR
# environment variable to load from another location; the default is the
# original hard-coded path.
DATA_DIR = os.environ.get('PAN18_DATA_DIR', 'C:/Users/RSC/PyProjects/Data-Science/Data/')

# Joining with a final '' leaves a trailing separator on each folder path, so
# the result is valid whether the loader joins or concatenates path + file name.
test_df = DataAggregator(os.path.join(DATA_DIR, 'pan18-style-change-detection-test-dataset-2018-01-31', ''))
training_df = DataAggregator(os.path.join(DATA_DIR, 'pan18-style-change-detection-training-dataset-2018-01-31', ''))
validation_df = DataAggregator(os.path.join(DATA_DIR, 'pan18-style-change-detection-validation-dataset-2018-01-31', ''))


In [7]:
#Split the data into text and labels
# Texts become plain Python lists; labels stay as the 'changes' column of each frame.
training_text, training_labels = list(training_df['text']), training_df['changes']
validation_text, validation_labels = list(validation_df['text']), validation_df['changes']
test_text, test_labels = list(test_df['text']), test_df['changes']

# Print text and label counts side by side for each split.
print("Training Text: {}, Training Labels: {}".format(len(training_text), len(training_labels)))
print("Test Text: {}, Test Labels: {}".format(len(test_text), len(test_labels)))
print("Validation Text: {}, Validation Labels: {}".format(len(validation_text), len(validation_labels)))

Training Text: 2980 Training Labels 2980
Test Text: 1352 Test Labels 1352
Validation Text: 1492 Validation Labels 1492


In [27]:
#Builds a pipeline
# Naive Bayes pipeline: TF-IDF over single words, then MultinomialNB.
NB_pipeline = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(
        analyzer='word',
        ngram_range=(1, 1),
        stop_words='english',
        strip_accents='ascii',
    )),
    ('nb_clf', MultinomialNB()),
])

# SVM pipeline: same vectorizer settings except word n-grams of length 1 to 5,
# then an SVC with its default parameters.
SVC_pipeline = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(
        analyzer='word',
        ngram_range=(1, 5),
        stop_words='english',
        strip_accents='ascii',
    )),
    ('svc_clf', SVC()),
])


In [28]:
#Train the models on the training set and evaluate them on the validation set
# Each timed section covers fitting on the training data plus predicting the
# validation data. perf_counter is monotonic, so the elapsed time cannot be
# distorted by system clock adjustments.
start = time.perf_counter()
NB_pipeline.fit(training_text, training_labels)
nb_prediction = NB_pipeline.predict(validation_text)
nb_runtime = time.perf_counter() - start
# accuracy_score expects (y_true, y_pred): ground-truth labels go first.
print("Naive Bayes Accuracy: " + str(accuracy_score(validation_labels, nb_prediction)) + " -- Run time " + str(nb_runtime))

start = time.perf_counter()
SVC_pipeline.fit(training_text, training_labels)
svc_prediction = SVC_pipeline.predict(validation_text)
svc_runtime = time.perf_counter() - start
print("SVM Accuracy: " + str(accuracy_score(validation_labels, svc_prediction)) + " -- Run time " + str(svc_runtime))

Naive Bayes Accuracy: 0.536193029491 -- Run time 2.8715779781341553
SVM Accuracy: 0.658847184987 -- Run time 122.94201683998108
