In [1]:
import os
import pandas as pd
import numpy as np
import char_tokens
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix,f1_score
from sklearn.utils import shuffle

In [2]:
#Globals
test_size = 250
bpath=os.getcwd() + r"\\NLI_PT_v3\NLI_PT_v3\student\\"

In [3]:
def text_file_opener(file_name):
    with open(file_name,'r',encoding='utf-8') as f:
        return f.read().replace('\n',' ')

def parallel_extract():
    texts = [ (text_file_opener(bpath +str(file))) for file in os.listdir(bpath)]
    target = [language.split('_')[0] for language in os.listdir(bpath)]
    return [texts,target]

df = pd.DataFrame(data = list(map(list, zip(*parallel_extract()))),columns=["text","lang"])

print("\nNumber of documents: " + str(df.shape[0]))
print("\nStatistics of size of documents (by number of characters): " )
print(str((df["text"].str.len()).describe()))



Number of documents: 3069

Statistics of size of documents (by number of characters): 
count     3069.000000
mean      1045.513196
std        818.494866
min         23.000000
25%        598.000000
50%        923.000000
75%       1336.000000
max      19215.000000
Name: text, dtype: float64


In [4]:
df = df.groupby("lang").filter(lambda x: len(x) > 106)

df = df.drop_duplicates(subset="text")

#Uses a token for names that were replaced in the texts with "XXXXX"
df["text"] = df["text"].str.replace("XXXXX", "縵")

df = shuffle(df)

print(df["lang"].value_counts())

spa    614
ita    565
chi    445
ger    438
eng    415
Name: lang, dtype: int64


In [5]:
tokenizer = char_tokens.NGramTokenizer(ngrams=3)
tokenizer.text_fitting(df["text"])

print("\nInput layer size: " + str(len(tokenizer.ngram_dict)) + " features.")


Input layer size: 27685 features.


In [6]:
tfidf = np.array(tokenizer.tf_idf(df["text"],'binary','idf'))
df = df.drop(columns=["text"])

In [7]:
clf = SVC(tol=1e-5,max_iter=-1,verbose=1,class_weight="balanced",gamma="scale",kernel='linear')
clf.fit(tfidf[:-1*test_size],df["lang"].iloc[:-1*test_size])




[LibSVM]

SVC(C=1.0, cache_size=200, class_weight='balanced', coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=1e-05, verbose=1)

In [8]:
cm = confusion_matrix(df["lang"].iloc[-1*test_size:],clf.predict(tfidf[-1*test_size:]),labels=["spa","ita","chi","ger","eng"])
print("\nAccuracy (true positives/test set absolute size): ")
print(np.trace(cm)/test_size)
print("\nConfusion Matrix: ")
print(cm)
print('\nF1 Score:')
print(f1_score(df["lang"].iloc[-1*test_size:],clf.predict(tfidf[-1*test_size:]),labels=["spa","ita","chi","ger","eng"],average='weighted'))


Accuracy (true positives/test set absolute size): 
0.704

Confusion Matrix: 
[[42  7  2  4  3]
 [12 38  2  7  1]
 [ 2  1 37  2  0]
 [ 4  4  2 30  4]
 [ 4  4  6  3 29]]

F1 Score:
0.7022647361197628
