# Imports

In [16]:
from collections import Counter

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

# Load Data

No separate validation set is used, because `GridSearchCV.fit` already performs cross-validation on the training data.

In [39]:
# Number of leading rows of the 17k training file used for the "raw" runs.
# NOTE(review): presumably these are the original, unaugmented examples and the
# remaining rows are augmentations -- confirm against the data-preparation step.
N_RAW_TRAIN_ROWS = 4355

# Each CSV is parsed once; the raw/augmented variants that share a file are
# derived from it with .copy() so all frames remain independent objects.
train_aug = pd.read_csv("../Data/Train_Data/UnProcessed/train_data_unp_17k.csv")
train_raw = train_aug.iloc[:N_RAW_TRAIN_ROWS, :].copy()

# The raw and augmented runs are evaluated on the same filtered test file.
test_raw = pd.read_csv("../Data/Test_Data/un_processed/test_data_filtered.csv")
test_aug = test_raw.copy()

# Lemmatized, stop-word-removed variants come from their own files.
train_lemma_stop = pd.read_csv("../Data/Train_Data/pre_processed/train_data_lemmatized_sw_removed.csv")
test_lemma_stop = pd.read_csv("../Data/Test_Data/pre_processed/test_data_lemmatized_sw_removed.csv")

# Pipeline

`grid.fit` can take a while, since every `CountVectorizer` setting in the grid is cross-validated.

In [18]:
# Fixed seed so repeated runs of the grid search produce the same scores.
RANDOM_STATE = 0

# Bag-of-n-grams features feeding a linear SVM classifier.
pipe = Pipeline([
    # Tokens are whitespace-separated and case is preserved (lowercase=False).
    # str.split is equivalent to the former `lambda s: s.split()`.
    # token_pattern=None states that the default regex is unused when a custom
    # tokenizer is supplied, which avoids scikit-learn's unused-parameter warning.
    ("cv", CountVectorizer(lowercase=False, tokenizer=str.split, token_pattern=None)),
    ("svm", LinearSVC(random_state=RANDOM_STATE)),
])

# Only the n-gram range is searched: unigrams, up to bigrams, up to trigrams.
param_grid = {
    "cv__ngram_range": [(1, 1), (1, 2), (1, 3)],
}

# This one search object is reused (refit) by every experiment cell below.
grid = GridSearchCV(pipe, param_grid, scoring="accuracy")

# Grid Search

## Latin

### Unpreprocessed unaugmented

In [40]:
# Refit the shared grid search on the raw Latin text; this replaces whatever
# fit `grid` held from a previously executed cell.
grid.fit(train_raw["la"], train_raw["file"])
print("CV results:", grid.cv_results_["mean_test_score"])

# Look the parameter up by name: taking the first value of best_params_ would
# silently report a different parameter if the grid ever gained another entry.
print("Best ngram range for Latin:", grid.best_params_["cv__ngram_range"])

# Score the best pipeline on the held-out test set (fraction of exact label matches).
predictions = grid.best_estimator_.predict(test_raw["la"])
accuracy = np.mean(predictions == test_raw["file"])
print("Accuracy on Latin test set:", accuracy)



CV results: [0.85947187 0.84867968 0.83237658]
Best ngram range for Latin: (1, 1)
Accuracy on Latin test set: 0.9083333333333333


### Unpreprocessed augmented

In [37]:
# Refit the shared grid search on the augmented Latin training text; this
# replaces whatever fit `grid` held from a previously executed cell.
grid.fit(train_aug["la"], train_aug["file"])
print("CV results:", grid.cv_results_["mean_test_score"])

# Look the parameter up by name: taking the first value of best_params_ would
# silently report a different parameter if the grid ever gained another entry.
print("Best ngram range for Latin:", grid.best_params_["cv__ngram_range"])

# Score the best pipeline on the held-out test set (fraction of exact label matches).
predictions = grid.best_estimator_.predict(test_aug["la"])
accuracy = np.mean(predictions == test_aug["file"])
print("Accuracy on Latin test set:", accuracy)



CV results: [0.90826636 0.9195178  0.90482204]
Best ngram range for Latin: (1, 2)
Accuracy on Latin test set: 0.8722222222222222


### Lemmatized and stop words removed

In [22]:
# Refit the shared grid search on the lemmatized, stop-word-removed Latin text;
# this replaces whatever fit `grid` held from a previously executed cell.
grid.fit(train_lemma_stop["la"], train_lemma_stop["file"])
print("CV results:", grid.cv_results_["mean_test_score"])

# Look the parameter up by name: taking the first value of best_params_ would
# silently report a different parameter if the grid ever gained another entry.
print("Best ngram range for Latin:", grid.best_params_["cv__ngram_range"])

# Score the best pipeline on the matching pre-processed test set
# (fraction of exact label matches).
predictions = grid.best_estimator_.predict(test_lemma_stop["la"])
accuracy = np.mean(predictions == test_lemma_stop["file"])
print("Accuracy on Latin test set:", accuracy)



CV results: [0.91713461 0.93426206 0.92284412]
Best ngram range for Latin: (1, 2)
Accuracy on Latin test set: 0.8839779005524862


## English

### Unpreprocessed unaugmented

In [41]:
# Refit the shared grid search on the raw English text; this replaces whatever
# fit `grid` held from a previously executed cell.
grid.fit(train_raw["en"], train_raw["file"])
print("CV results:", grid.cv_results_["mean_test_score"])

# Look the parameter up by name: taking the first value of best_params_ would
# silently report a different parameter if the grid ever gained another entry.
print("Best ngram range for English:", grid.best_params_["cv__ngram_range"])

# Score the best pipeline on the held-out test set (fraction of exact label matches).
predictions = grid.best_estimator_.predict(test_raw["en"])
accuracy = np.mean(predictions == test_raw["file"])
print("Accuracy on English test set:", accuracy)



CV results: [0.84845006 0.84638347 0.82433984]
Best ngram range for English: (1, 1)
Accuracy on English test set: 0.9055555555555556


### Unpreprocessed augmented

In [38]:
# Refit the shared grid search on the augmented English training text; this
# replaces whatever fit `grid` held from a previously executed cell.
grid.fit(train_aug["en"], train_aug["file"])
print("CV results:", grid.cv_results_["mean_test_score"])

# Look the parameter up by name: taking the first value of best_params_ would
# silently report a different parameter if the grid ever gained another entry.
print("Best ngram range for English:", grid.best_params_["cv__ngram_range"])

# Score the best pipeline on the held-out test set (fraction of exact label matches).
predictions = grid.best_estimator_.predict(test_aug["en"])
accuracy = np.mean(predictions == test_aug["file"])
print("Accuracy on English test set:", accuracy)



CV results: [0.99334099 0.99489093 0.99397245]
Best ngram range for English: (1, 2)
Accuracy on English test set: 0.8833333333333333


### Lemmatized and stop words removed

In [26]:
# Refit the shared grid search on the lemmatized, stop-word-removed English text;
# this replaces whatever fit `grid` held from a previously executed cell.
grid.fit(train_lemma_stop["en"], train_lemma_stop["file"])
print("CV results:", grid.cv_results_["mean_test_score"])

# Look the parameter up by name: taking the first value of best_params_ would
# silently report a different parameter if the grid ever gained another entry.
print("Best ngram range for English:", grid.best_params_["cv__ngram_range"])

# Score the best pipeline on the matching pre-processed test set
# (fraction of exact label matches).
predictions = grid.best_estimator_.predict(test_lemma_stop["en"])
accuracy = np.mean(predictions == test_lemma_stop["file"])
print("Accuracy on English test set:", accuracy)



CV results: [0.99308037 0.99521384 0.9935993 ]
Best ngram range for English: (1, 2)
Accuracy on English test set: 0.850828729281768


In [34]:
# Class distribution of the test labels (Counter is imported in the imports cell).
# In the recorded run, 323 of the 360 rows belong to a single class, so always
# predicting that class would already score about 0.897 accuracy -- read the
# test accuracies reported above against that baseline.
Counter(test_raw["file"])

Counter({'final_alignments\\Vulgate_Bible.json': 323,
         'final_alignments\\Tertullian_Apology.json': 5,
         'final_alignments\\Cato_Agriculture.json': 15,
         'final_alignments\\Sallust_Jugurtha.json': 9,
         'final_alignments\\Velleius_Paterculus_Compendium.json': 8})