In [81]:
import numpy as np

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split

from sklearn.metrics import precision_recall_fscore_support


def _read_as_single_line(path):
    """Read the text file at `path` and return its lines joined by single spaces."""
    with open(path) as handle:
        return " ".join(handle.read().splitlines())


# Main corpora, one file per language.
dutchString = _read_as_single_line("dutch.txt")
englishString = _read_as_single_line("english.txt")


# Separate test-set files, one per language.
dutchTestStr = _read_as_single_line("dutchTestSet.txt")
englishTestStr = _read_as_single_line("englishTestSet.txt")

In [82]:
# Convert a piece of text into a 26-element NumPy array of per-letter counts (a-z), ignoring case.

def countLetters(words):
    """Count how often each ASCII letter a-z occurs in `words`, ignoring case.

    Parameters
    ----------
    words : str
        Text to analyse. It is lower-cased first; every character that is not
        one of 'a'..'z' (digits, punctuation, whitespace, accented letters, ...)
        is ignored.

    Returns
    -------
    numpy.ndarray
        Array of shape (26,) created with ``np.zeros``; index 0 holds the
        count of 'a' and index 25 the count of 'z'.
    """
    rc = np.zeros(26)
    base = ord('a')
    for character in words.lower():
        # Restricting to 'a'..'z' guarantees the derived index is in 0..25,
        # so no additional bounds check is needed before indexing.
        if 'a' <= character <= 'z':
            rc[ord(character) - base] += 1
    return rc


In [83]:
# Build the feature matrix X and label vector y for one language's text (called once for English and once for Dutch).

def make_X_and_y(text, language):
    """Split `text` into sentences and build letter-count features plus labels.

    Sentences are the fragments produced by ``text.split('.')``, so a text
    containing k periods yields k + 1 rows. A fragment without any a-z
    letters (for example the empty fragment after a trailing period) becomes
    an all-zero row.

    Parameters
    ----------
    text : str
        The full corpus for one language.
    language : str
        ``"english"`` labels every row 1; any other value labels every row -1.

    Returns
    -------
    X : numpy.ndarray
        One row per sentence, each row being ``countLetters(sentence)``.
    y : numpy.ndarray
        One label per sentence, same length as the number of rows in ``X``.
    """
    # Split once and derive the row count from the result so that X and y
    # are guaranteed to have matching lengths.
    sentences = text.split('.')
    length = len(sentences)

    # ENGLISH MEANS y[i] = 1!!!!!!
    if language == "english":
        y = np.ones(length)
    else:
        y = np.full(length, -1)

    X = np.vstack([countLetters(sentence) for sentence in sentences])

    return X, y

In [84]:
# One (features, labels) pair per language.
X_e, y_e = make_X_and_y(englishString, "english")
X_d, y_d = make_X_and_y(dutchString, "dutch")

# Combine into a single dataset: English rows first, Dutch rows after.
X = np.concatenate([X_e, X_d], axis=0)
y = np.concatenate([y_e, y_d], axis=0)

In [85]:
def unison_shuffled_copies(a, b, rng=None):
    """Shuffle `a` and `b` with one shared random permutation.

    Both inputs are indexed with the same permutation, so element i of the
    first result still corresponds to element i of the second result.

    Parameters
    ----------
    a, b : numpy.ndarray
        Arrays of equal length along their first axis. They are indexed with
        an integer array, so plain Python lists are not supported.
    rng : optional
        Object providing ``permutation(n)``, e.g. ``np.random.default_rng(seed)``,
        for a reproducible shuffle. When omitted, ``np.random.permutation``
        (the global NumPy random state) is used, as in the original version.

    Returns
    -------
    tuple
        ``(a[p], b[p])`` for a random permutation ``p`` of ``range(len(a))``.

    Raises
    ------
    AssertionError
        If ``len(a) != len(b)``.
    """
    assert len(a) == len(b)
    if rng is None:
        p = np.random.permutation(len(a))
    else:
        p = rng.permutation(len(a))
    return a[p], b[p]

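# Left disabled: train_test_split shuffles the samples by default, so a manual shuffle is not required here.
# X,  y = unison_shuffled_copies(X, y)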

In [86]:
# Hold out half of the samples for evaluation; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.5,
    random_state=42,
)

In [87]:
# Linear SVM on the raw letter counts. random_state is fixed so that any
# randomness used by the solver is repeatable across runs.
# NOTE(review): the features are unscaled counts, while StandardScaler and
# make_pipeline are imported but not used — consider
# make_pipeline(StandardScaler(), LinearSVC(...)) if convergence warnings appear.
classifier = LinearSVC(C=1.0, random_state=42)

In [88]:
# Train on the training half only; the test half stays unseen until evaluation.
classifier.fit(X=X_train, y=y_train)

In [90]:
y_pred = classifier.predict(X_test)


# labels=[1, -1] fixes the order of the returned arrays:
# index 0 is English (label 1), index 1 is Dutch (label -1).
precision, recall, f1Score, _ = precision_recall_fscore_support(
    y_test, y_pred, labels=[1, -1])
english_precision, english_recall, english_f1 = precision[0], recall[0], f1Score[0]
dutch_precision,   dutch_recall,   dutch_f1   = precision[1], recall[1], f1Score[1]


# Fixed-width columns keep the table aligned regardless of label length.
# ".3f" / ".4f" set the number of decimals; the previous "3f" / "4f" only set
# a minimum field width and therefore printed the default six decimals.
print(f"{' SVM:':<14}{'precision':>10}{'recall':>10}{'f1-score':>10}")
print(f"{'English(1)':<14}{english_precision:>10.3f}{english_recall:>10.3f}{english_f1:>10.3f}")
print(f"{'Dutch(-1)':<14}{dutch_precision:>10.3f}{dutch_recall:>10.3f}{dutch_f1:>10.3f}")
print(f"Accuracy: {classifier.score(X_test, y_test):.4f}")

 SVM:           precision    recall    f1-score
English(1)       1.000000    1.000000    1.000000
Dutch(-1)       1.000000    1.000000    1.000000
Accuracy: 1.000000
