In [277]:
import numpy as np
import sklearn
from sklearn import linear_model
import polars
from sklearn.model_selection import train_test_split
from scipy import sparse

In [278]:
df = polars.read_csv("train.csv", n_rows=300000)
df.shape

(299999, 2050)

In [272]:
from collections import Counter
counter = Counter()

c = 0
for line in open("dataset/archive/sentences.prepared.csv"):
    (rid, lang, sentence) = line.strip().split("\t", 2)
    if lang not in {"fra","eng", "ita", "deu", "esp", "por"}:
        continue
    c += 1
    if c > 100_000:
        break
    for chr in sentence:
        if ord(chr) > 128:
            counter[chr] += 1
letters = sorted(ord(letter) for (letter, count) in counter.most_common(100) if count >= 10)
print(", ".join(map(str, letters)))
#print(letters)

160, 171, 173, 187, 192, 196, 199, 200, 201, 202, 205, 214, 220, 223, 224, 225, 226, 227, 228, 231, 232, 233, 234, 235, 236, 237, 238, 239, 242, 243, 244, 245, 246, 249, 250, 251, 252, 333, 339, 8201, 8211, 8212, 8217, 8220, 8221, 8222, 8239


In [279]:
y = df[:, 1].to_numpy()
X = sparse.csr_matrix(df[:, 2:].to_numpy())
idx = df[:, 0]
del df
X = sklearn.preprocessing.normalize(X)
(X_train, X_test, y_train, y_test) = train_test_split(X, y, shuffle=False)
n_train = X_train.shape[0]
print(X_train.shape, X_test.shape)

(224999, 2048) (75000, 2048)


In [285]:
for C in [16]:
    model = sklearn.linear_model.LogisticRegression(max_iter=100, penalty='l2', C=C, verbose=1, class_weight='balanced') #, l1_ratio=0.1,) # penalty='elasticnet', solver='saga') 
    model.fit(X_train, y_train)
    print(C)
    print((model.predict(X_train) == y_train).mean())
    print((model.predict(X_test) == y_test).mean())

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =        32784     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  6.23830D+05    |proj g|=  1.22401D+04


 This problem is unconstrained.



At iterate   50    f=  7.62539D+03    |proj g|=  7.99568D+01

At iterate  100    f=  5.25869D+03    |proj g|=  4.11683D+01

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
32784    100    111      1     0     0   4.117D+01   5.259D+03
  F =   5258.6893316842516     

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT                 
16


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   51.0s finished


0.9963822061431384
0.9920266666666666


In [287]:
from sklearn import metrics
print(model.classes_)
print(sklearn.metrics.confusion_matrix(y_test, model.predict(X_test), labels=['deu', 'eng', 'fra',  'ita','nld', 'por', 'rus', 'spa']))

sklearn.metrics.confusion_matrix(y_test, model.predict(X_test), labels=['kor', 'jpn', 'cmn'])

['ara' 'cmn' 'deu' 'eng' 'fra' 'hin' 'ita' 'jpn' 'kor' 'nld' 'por' 'rus'
 'spa' 'swe' 'tur' 'vie']
[[ 6862     9     4     6    16     2     0     4]
 [    6 18244    15    18    32     7     0    14]
 [    5     5  5824    15     4     7     0    11]
 [    1     9    18  9434     1    22     0    51]
 [    8     8     2     0  1773     0     0     3]
 [    1     6     8    31     3  4502     0    67]
 [    0     0     0     0     0     0 10485     0]
 [    2     3     5    30     3    47     0  4137]]


array([[ 101,    0,    0],
       [   0, 2560,    5],
       [   0,    0,  812]])

In [249]:
y_predict = model.predict(X_test)
print(np.where((y_predict == y_test) == False)[0].shape)
print(y_predict.shape)
i = 0 
for row in list(np.where((y_predict == y_test) == False))[0]:
    i += 1
    print(y_test[row], y_predict[row], idx[int(n_train + row)])
    if i == 10:
        break

(568,)
(75000,)
spa por 10160287
spa por 2166433
fra nld 9942249
deu eng 3707572
por spa 1229725
eng ita 10099817
jpn cmn 140784
spa por 5087616
nld eng 9253614
nld deu 1497388


In [288]:
(LANG, DIM) = model.coef_.shape
print(model.coef_.shape)
coef = np.float32(model.coef_)

print(float_array[0])
print(coef[0,0])
print(model.coef_[:,0])
print(float_array[0:10])

f = open("src/weights.rs", "w")

f.write("#[derive(Clone, Copy, Debug, Eq, PartialEq)]\n")
f.write("pub enum Lang {\n")
for lang in model.classes_:
    f.write("\t%s,\n" % lang.capitalize(),)
f.write("}\n\n")

f.write("""
impl Lang {
    pub fn three_letter_code(self)-> &'static str {
        match self {
""")
for lang in model.classes_:
    f.write("\t\t\tLang::%s => \"%s\",\n" % (lang.capitalize(), lang))
f.write("\t\t}\t}\n}\n\n\n")


f.write("pub const LANGUAGES: [Lang; %d] = [\n\t" % LANG)
for lang in model.classes_:
    f.write("Lang::%s, " % lang.capitalize())
f.write("];\n\n")

f.write("pub const WEIGHTS: [f32; %d] = [\n" % (LANG * DIM))
for i in range(DIM):
    f.write("\t")
    for val in coef[:, i]:
        f.write("%f, " % val)
    f.write("\n")
f.write("];\n\n")


f.write("pub const INTERCEPTS: [f32; %d] = [\n\t" % LANG)
for val in model.intercept_:
    f.write("%f, " % val)
f.write("];\n\n")


f.flush()
f.close()


(16, 2048)
-0.007360063958913088
-0.008390364
[-0.00839036  0.10523285  0.44693157 -0.12184247 -0.14644304 -0.00879355
  0.22627594 -0.03884726 -0.00951722 -0.30811627 -0.50813296 -0.00853365
  0.55440874 -0.38281637  0.21939595 -0.01081189]
array('f', [-0.007360063958913088, 0.515349805355072, -0.22763466835021973, 0.170366108417511, -0.011744972318410873, 0.14689676463603973, -0.16804201900959015, -0.35776984691619873, -0.011913723312318325, -0.04814738407731056])
