# Imports

In [16]:
import numpy as np
import pandas as pd
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

# Load Data

I'm not using a separate validation set because GridSearchCV's fit method already performs cross-validation on the training data.

In [17]:
# Each preprocessing variant is loaded as a (train, test) pair of DataFrames.

# Unprocessed text.
train_raw, test_raw = map(pd.read_csv, (
    "../Data/Train_Data/UnProcessed/train_data_unprocessed.csv",
    "../Data/Test_Data/un_processed/test_data_filtered.csv",
))

# Lemmatized text.
train_lemma, test_lemma = map(pd.read_csv, (
    "../Data/Train_Data/pre_processed/train_data_lemmatized.csv",
    "../Data/Test_Data/pre_processed/test_data_lemmatized.csv",
))

# Stop words removed.
train_stop, test_stop = map(pd.read_csv, (
    "../Data/Train_Data/pre_processed/train_data_sw_removed.csv",
    "../Data/Test_Data/pre_processed/test_data_sw_removed.csv",
))

# Lemmatized and stop words removed.
train_lemma_stop, test_lemma_stop = map(pd.read_csv, (
    "../Data/Train_Data/pre_processed/train_data_lemmatized_sw_removed.csv",
    "../Data/Test_Data/pre_processed/test_data_lemmatized_sw_removed.csv",
))

# Pipeline

`grid.fit` can take a while with this many CountVectorizer parameter settings: every n-gram range in the grid is fitted on each cross-validation fold.

In [18]:
# Bag-of-n-grams counts feeding a linear SVM classifier.
pipe = Pipeline([
    (
        "cv",
        CountVectorizer(
            # Leave the text's casing untouched.
            lowercase=False,
            # Split on whitespace only. `str.split` does the same thing as
            # `lambda s: s.split()` but, unlike a lambda, can be pickled with
            # the standard pickle module, so the fitted pipeline can be saved.
            tokenizer=str.split,
            # token_pattern is ignored once a tokenizer is supplied; setting it
            # to None avoids scikit-learn's "token_pattern will not be used"
            # warning on every fit.
            token_pattern=None,
        ),
    ),
    # LinearSVC's solver uses a random number generator; fixing the seed makes
    # the scores repeatable from one run of the notebook to the next.
    ("svm", LinearSVC(random_state=0)),
])

# Only the n-gram range is tuned: unigrams, up to bigrams, up to trigrams.
param_grid = {
    "cv__ngram_range": [(1, 1), (1, 2), (1, 3)],
}

# Cross-validated search scored by accuracy. The same `grid` object is re-fit
# for every dataset variant in the cells that follow.
grid = GridSearchCV(pipe, param_grid, scoring="accuracy")

# Grid Search

## Latin

### Unpreprocessed

In [19]:
# Cross-validate the n-gram range on the unpreprocessed Latin text.
grid.fit(train_raw["la"], train_raw["file"])
print("CV results:", grid.cv_results_["mean_test_score"])
# Look the winning setting up by name rather than by position, so the right
# value is still printed if more hyperparameters are added to the grid.
print("Best ngram range for Latin:", grid.best_params_["cv__ngram_range"])
# best_estimator_ is the pipeline refit on the whole training set with the
# best setting; score it on the held-out test set.
predictions = grid.best_estimator_.predict(test_raw["la"])
accuracy = np.mean(predictions == test_raw["file"])
print("Accuracy on Latin test set:", accuracy)



CV results: [0.90808203 0.91926907 0.90479482]
Best ngram range for Latin: (1, 2)
Accuracy on Latin test set: 0.8722222222222222


### Lemmatized

In [20]:
# Cross-validate the n-gram range on the lemmatized Latin text.
grid.fit(train_lemma["la"], train_lemma["file"])
print("CV results:", grid.cv_results_["mean_test_score"])
# Look the winning setting up by name rather than by position, so the right
# value is still printed if more hyperparameters are added to the grid.
print("Best ngram range for Latin:", grid.best_params_["cv__ngram_range"])
# best_estimator_ is the pipeline refit on the whole training set with the
# best setting; score it on the held-out test set.
predictions = grid.best_estimator_.predict(test_lemma["la"])
accuracy = np.mean(predictions == test_lemma["file"])
print("Accuracy on Latin test set:", accuracy)



CV results: [0.92019045 0.93899026 0.92987898]
Best ngram range for Latin: (1, 2)
Accuracy on Latin test set: 0.9088397790055248


### Stop words removed

In [21]:
# Cross-validate the n-gram range on the Latin text with stop words removed.
grid.fit(train_stop["la"], train_stop["file"])
print("CV results:", grid.cv_results_["mean_test_score"])
# Look the winning setting up by name rather than by position, so the right
# value is still printed if more hyperparameters are added to the grid.
print("Best ngram range for Latin:", grid.best_params_["cv__ngram_range"])
# best_estimator_ is the pipeline refit on the whole training set with the
# best setting; score it on the held-out test set.
predictions = grid.best_estimator_.predict(test_stop["la"])
accuracy = np.mean(predictions == test_stop["file"])
print("Accuracy on Latin test set:", accuracy)



CV results: [0.90404594 0.90917821 0.89689512]
Best ngram range for Latin: (1, 2)
Accuracy on Latin test set: 0.8535911602209945


### Lemmatized and stop words removed

In [22]:
# Cross-validate the n-gram range on the Latin text that is both lemmatized
# and stripped of stop words.
grid.fit(train_lemma_stop["la"], train_lemma_stop["file"])
print("CV results:", grid.cv_results_["mean_test_score"])
# Look the winning setting up by name rather than by position, so the right
# value is still printed if more hyperparameters are added to the grid.
print("Best ngram range for Latin:", grid.best_params_["cv__ngram_range"])
# best_estimator_ is the pipeline refit on the whole training set with the
# best setting; score it on the held-out test set.
predictions = grid.best_estimator_.predict(test_lemma_stop["la"])
accuracy = np.mean(predictions == test_lemma_stop["file"])
print("Accuracy on Latin test set:", accuracy)



CV results: [0.91713461 0.93426206 0.92284412]
Best ngram range for Latin: (1, 2)
Accuracy on Latin test set: 0.8839779005524862


## English

### Unpreprocessed

In [23]:
# Cross-validate the n-gram range on the unpreprocessed English text.
grid.fit(train_raw["en"], train_raw["file"])
print("CV results:", grid.cv_results_["mean_test_score"])
# Look the winning setting up by name rather than by position, so the right
# value is still printed if more hyperparameters are added to the grid.
print("Best ngram range for English:", grid.best_params_["cv__ngram_range"])
# best_estimator_ is the pipeline refit on the whole training set with the
# best setting; score it on the held-out test set.
predictions = grid.best_estimator_.predict(test_raw["en"])
accuracy = np.mean(predictions == test_raw["file"])
print("Accuracy on English test set:", accuracy)



CV results: [0.9935992  0.99590577 0.99486791]
Best ngram range for English: (1, 2)
Accuracy on English test set: 0.8527777777777777


### Lemmatized

In [24]:
# Cross-validate the n-gram range on the lemmatized English text.
grid.fit(train_lemma["en"], train_lemma["file"])
print("CV results:", grid.cv_results_["mean_test_score"])
# Look the winning setting up by name rather than by position, so the right
# value is still printed if more hyperparameters are added to the grid.
print("Best ngram range for English:", grid.best_params_["cv__ngram_range"])
# best_estimator_ is the pipeline refit on the whole training set with the
# best setting; score it on the held-out test set.
predictions = grid.best_estimator_.predict(test_lemma["en"])
accuracy = np.mean(predictions == test_lemma["file"])
print("Accuracy on English test set:", accuracy)



CV results: [0.99308018 0.99607878 0.99504087]
Best ngram range for English: (1, 2)
Accuracy on English test set: 0.8784530386740331


### Stop words removed

In [25]:
# Cross-validate the n-gram range on the English text with stop words removed.
grid.fit(train_stop["en"], train_stop["file"])
print("CV results:", grid.cv_results_["mean_test_score"])
# Look the winning setting up by name rather than by position, so the right
# value is still printed if more hyperparameters are added to the grid.
print("Best ngram range for English:", grid.best_params_["cv__ngram_range"])
# best_estimator_ is the pipeline refit on the whole training set with the
# best setting; score it on the held-out test set.
predictions = grid.best_estimator_.predict(test_stop["en"])
accuracy = np.mean(predictions == test_stop["file"])
print("Accuracy on English test set:", accuracy)



CV results: [0.99354168 0.99509859 0.993311  ]
Best ngram range for English: (1, 2)
Accuracy on English test set: 0.8453038674033149


### Lemmatized and stop words removed

In [26]:
# Cross-validate the n-gram range on the English text that is both lemmatized
# and stripped of stop words.
grid.fit(train_lemma_stop["en"], train_lemma_stop["file"])
print("CV results:", grid.cv_results_["mean_test_score"])
# Look the winning setting up by name rather than by position, so the right
# value is still printed if more hyperparameters are added to the grid.
print("Best ngram range for English:", grid.best_params_["cv__ngram_range"])
# best_estimator_ is the pipeline refit on the whole training set with the
# best setting; score it on the held-out test set.
predictions = grid.best_estimator_.predict(test_lemma_stop["en"])
accuracy = np.mean(predictions == test_lemma_stop["file"])
print("Accuracy on English test set:", accuracy)



CV results: [0.99308037 0.99521384 0.9935993 ]
Best ngram range for English: (1, 2)
Accuracy on English test set: 0.850828729281768
