In [1]:
from sklearn import svm, metrics
import glob, os.path, re, json
from sklearn.externals import joblib



## Data preprocessing & Saving

In [2]:
# Read text and Count frequency
def check_freq(fname):
    """Compute the relative frequency of the letters a-z in a text file.

    The language label is the leading run of two or more lowercase ASCII
    letters in the file's base name (e.g. ``en-1.txt`` -> ``'en'``).

    Parameters
    ----------
    fname : str
        Path to a UTF-8 encoded text file.

    Returns
    -------
    tuple
        ``(freq, lang)``. ``freq`` is a list of 26 floats: the share of
        each letter 'a'..'z' among all ASCII letters in the lower-cased
        text. ``lang`` is the label parsed from the file name. When the
        file contains no ASCII letters, ``freq`` is 26 zeros.

    Raises
    ------
    ValueError
        If the base name does not start with at least two lowercase
        ASCII letters, so no label can be derived.
    """
    name = os.path.basename(fname)
    matched = re.match(r'^[a-z]{2,}', name)
    if matched is None:
        # Fail with a readable message instead of an AttributeError on None.
        raise ValueError(f"cannot infer language label from file name: {name!r}")
    lang = matched.group()

    with open(fname, 'r', encoding='utf-8') as f:
        text = f.read().lower()

    # One counter per letter, indexed by distance from 'a'.
    code_a = ord('a')
    cnt = [0] * 26
    for ch in text:
        # Only plain ASCII letters are counted; everything else is ignored.
        if 'a' <= ch <= 'z':
            cnt[ord(ch) - code_a] += 1

    total = sum(cnt)
    if total == 0:
        # No letters at all: return a zero vector rather than dividing by 0.
        return ([0.0] * 26, lang)

    # Normalize counts so the vector sums to 1.
    freq = [n / total for n in cnt]
    return (freq, lang)

# Process all the files in given path
def load_files(path):
    """Run ``check_freq`` on every file matching a glob pattern.

    Parameters
    ----------
    path : str
        Glob pattern handed to ``glob.glob``.

    Returns
    -------
    dict
        ``{'freqs': [...], 'labels': [...]}``. The two lists are
        index-aligned, with one entry per matched file in sorted path
        order. Both are empty when nothing matches.
    """
    # glob.glob gives no ordering guarantee; sorting keeps the sample order
    # (and therefore any displayed predictions) stable across runs/machines.
    file_list = sorted(glob.glob(path))

    freqs = []
    labels = []
    for fname in file_list:
        freq, lang = check_freq(fname)
        freqs.append(freq)
        labels.append(lang)
    return {'freqs': freqs, 'labels': labels}

# Build letter-frequency features for the training and the test corpus.
data, test = (
    load_files('pj_lang_detection/lang/train/*.txt'),
    load_files('pj_lang_detection/lang/test/*.txt'),
)

# Show which fields the loader produced.
data.keys()

dict_keys(['freqs', 'labels'])

In [3]:
# Persist both splits in a single JSON file as a two-element list: [train, test].
with open('pj_lang_detection/lang/freq.json', 'w', encoding='utf-8') as fp:
    json.dump([data, test], fp=fp)

# Drop the in-memory copies; the next section reloads them from disk.
del data, test

## Training model with the saved data

In [4]:
# Reload the saved JSON. The file was written with encoding='utf-8', so it is
# read back the same way instead of relying on the platform's default encoding.
with open('pj_lang_detection/lang/freq.json', 'r', encoding='utf-8') as jf:
    # The payload is a two-element list [train, test]; unpacking fails loudly
    # if the file does not have exactly that shape.
    data, test = json.load(jf)

print(data.keys())
print(test.keys())

dict_keys(['freqs', 'labels'])
dict_keys(['freqs', 'labels'])


In [5]:
# Baseline: an SVC with library defaults, fitted on the training frequencies.
clf = svm.SVC()
clf.fit(X=data['freqs'], y=data['labels'])



SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='rbf', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [6]:
# Label every test sample with the baseline model, then display the labels.
predict = clf.predict(X=test['freqs'])
predict

array(['en', 'en', 'fr', 'fr', 'en', 'id', 'tl', 'tl'], dtype='<U2')

In [7]:
# Score the baseline predictions against the true test labels:
# overall accuracy plus the per-class precision/recall/F1 report.
ac_score = metrics.accuracy_score(y_true=test['labels'], y_pred=predict)
cl_report = metrics.classification_report(y_true=test['labels'], y_pred=predict)
print("정답률 = ", ac_score)
print("리포트 = \n", cl_report)

정답률 =  0.875
리포트 = 
               precision    recall  f1-score   support

          en       0.67      1.00      0.80         2
          fr       1.00      1.00      1.00         2
          id       1.00      0.50      0.67         2
          tl       1.00      1.00      1.00         2

    accuracy                           0.88         8
   macro avg       0.92      0.88      0.87         8
weighted avg       0.92      0.88      0.87         8



## Training another model with hyper-parameter tuning

In [8]:
# Second model: same data, but with gamma fixed at 17 instead of the default.
# fit() returns the estimator itself, so construction and fitting can be chained.
clf2 = svm.SVC(gamma=17).fit(X=data['freqs'], y=data['labels'])
predict2 = clf2.predict(X=test['freqs'])

In [9]:
# Score the gamma=17 model on the test split: accuracy and per-class report.
ac_score2 = metrics.accuracy_score(y_true=test['labels'], y_pred=predict2)
cl_report2 = metrics.classification_report(y_true=test['labels'], y_pred=predict2)

print("정답률 = ", ac_score2)
print("리포트 = \n", cl_report2)

정답률 =  1.0
리포트 = 
               precision    recall  f1-score   support

          en       1.00      1.00      1.00         2
          fr       1.00      1.00      1.00         2
          id       1.00      1.00      1.00         2
          tl       1.00      1.00      1.00         2

    accuracy                           1.00         8
   macro avg       1.00      1.00      1.00         8
weighted avg       1.00      1.00      1.00         8



## Training other models with various sets of gamma & C

In [10]:
# Sweep gamma (1..17) against a fixed list of C values and print every pair
# whose accuracy on the test split exceeds 0.9.
#
# The sweep uses its own names (grid_*), so `clf2`, `predict2` and `ac_score2`
# from the gamma=17 experiment are not overwritten and that evaluation cell
# still reports the same model if it is re-run after this one.
#
# NOTE(review): candidates are ranked by accuracy on `test`, the same split
# used for the final report, so these scores are optimistic. A separate
# validation split or cross-validation would give an unbiased selection.
for tmp_gamma in range(1, 18):
    for tmp_C in [0.01, 0.1, 0.5, 1, 1.5, 2, 2.5, 3, 3.5, 4, 5, 10]:
        grid_clf = svm.SVC(gamma=tmp_gamma, C=tmp_C)
        grid_clf.fit(data['freqs'], data['labels'])
        grid_predict = grid_clf.predict(test['freqs'])

        grid_score = metrics.accuracy_score(test['labels'], grid_predict)

        if grid_score > 0.9:
            print(f"정답률 = {grid_score} / tmp_gamma = {tmp_gamma} / tmp_C = {tmp_C}")

정답률 = 1.0 / tmp_gamma = 2 / tmp_C = 10
정답률 = 1.0 / tmp_gamma = 3 / tmp_C = 5
정답률 = 1.0 / tmp_gamma = 4 / tmp_C = 4
정답률 = 1.0 / tmp_gamma = 4 / tmp_C = 5
정답률 = 1.0 / tmp_gamma = 4 / tmp_C = 10
정답률 = 1.0 / tmp_gamma = 5 / tmp_C = 3
정답률 = 1.0 / tmp_gamma = 5 / tmp_C = 3.5
정답률 = 1.0 / tmp_gamma = 5 / tmp_C = 4
정답률 = 1.0 / tmp_gamma = 5 / tmp_C = 10
정답률 = 1.0 / tmp_gamma = 6 / tmp_C = 2.5
정답률 = 1.0 / tmp_gamma = 6 / tmp_C = 3
정답률 = 1.0 / tmp_gamma = 6 / tmp_C = 3.5
정답률 = 1.0 / tmp_gamma = 6 / tmp_C = 10
정답률 = 1.0 / tmp_gamma = 7 / tmp_C = 2.5
정답률 = 1.0 / tmp_gamma = 7 / tmp_C = 3
정답률 = 1.0 / tmp_gamma = 7 / tmp_C = 5
정답률 = 1.0 / tmp_gamma = 7 / tmp_C = 10
정답률 = 1.0 / tmp_gamma = 8 / tmp_C = 2
정답률 = 1.0 / tmp_gamma = 8 / tmp_C = 2.5
정답률 = 1.0 / tmp_gamma = 8 / tmp_C = 5
정답률 = 1.0 / tmp_gamma = 8 / tmp_C = 10
정답률 = 1.0 / tmp_gamma = 9 / tmp_C = 2
정답률 = 1.0 / tmp_gamma = 9 / tmp_C = 2.5
정답률 = 1.0 / tmp_gamma = 9 / tmp_C = 4
정답률 = 1.0 / tmp_gamma = 9 / tmp_C = 5
정답률 = 1.0 / tmp_gamma = 9 / tmp_

## Saving the final model

In [11]:
# Refit with the chosen hyper-parameters (gamma=5, C=3) on the training data.
clf3 = svm.SVC(C=3, gamma=5)
clf3.fit(X=data['freqs'], y=data['labels'])

# Serialize the fitted model; the call's return value is shown as the cell output.
joblib.dump(clf3, "pj_lang_detection/lang/freq.pkl")

['pj_lang_detection/lang/freq.pkl']