In [None]:
!pip install --upgrade gensim

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from glob import glob
import missingno as msno
from tqdm import tqdm, trange
import pickle
import random

from gensim.models.fasttext import FastText

from sklearn.model_selection import train_test_split
from sklearn.metrics import make_scorer, accuracy_score, f1_score
from sklearn.linear_model import LogisticRegression

In [None]:
# glob() gives no ordering guarantee, so sort the paths: otherwise file0,
# file1, ... could be bound to different files from one run to the next.
files = sorted(glob('/content/drive/MyDrive/공모전/data/*.txt'))
for i, file in enumerate(files):
    # Each table is bound to a module-level name file<i>; later cells read file0.
    globals()[f'file{i}'] = pd.read_table(file, sep='|', encoding='cp949')

In [None]:
# Index <-> label lookup tables for each classification level. Labels are
# numbered by their position in sorted order.
labels_digit1 = sorted(file0.digit_1.unique())
idx2label_digit1 = dict(enumerate(labels_digit1))
label2idx_digit1 = {label: idx for idx, label in idx2label_digit1.items()}

labels_digit2 = sorted(file0.digit_2.unique())
idx2label_digit2 = dict(enumerate(labels_digit2))
label2idx_digit2 = {label: idx for idx, label in idx2label_digit2.items()}

labels_digit3 = sorted(file0.digit_3.unique())
idx2label_digit3 = dict(enumerate(labels_digit3))
label2idx_digit3 = {label: idx for idx, label in idx2label_digit3.items()}

In [None]:
# Overwrite each label column with its integer index from the lookup tables.
# NOTE: this rewrites file0 in place, so re-running the cell feeds the
# already-encoded values back through the lookups.
for level_col, lookup in (
    ('digit_1', label2idx_digit1),
    ('digit_2', label2idx_digit2),
    ('digit_3', label2idx_digit3),
):
    # __getitem__ raises KeyError on an unknown label, like a plain dict index.
    file0[level_col] = file0[level_col].map(lookup.__getitem__)

### FastText

In [None]:
# One 'texts' cell per record holding its three text fields (NaN -> '') as a
# single array, placed next to the encoded label columns.
text_fields = file0[['text_obj', 'text_mthd', 'text_deal']].fillna('')
texts = pd.DataFrame(
    {'texts': list(text_fields.values)},
    index=range(file0.shape[0]),
)
label_cols = file0[['digit_1', 'digit_2', 'digit_3']]
data = pd.concat([label_cols, texts], axis=1)

In [None]:
def oversample(x, n):
    """Repeat every group ``n`` times, reshuffling it before each repetition.

    Parameters
    ----------
    x : iterable of sequences
        One sequence of items per group. Any sequence type is accepted
        (lists, tuples, arrays); the inputs are never modified.
    n : int
        Number of shuffled copies of each group to concatenate.

    Returns
    -------
    list of list
        One flat list per group, ``n * len(group)`` items long.

    Notes
    -----
    Shuffling draws from the global ``random`` module state; call
    ``random.seed(...)`` first if the output has to be reproducible.
    """
    result = []
    for group in x:
        # Shuffle a private copy so the caller's data keeps its order. The
        # copy is reshuffled cumulatively, which consumes the RNG exactly as
        # shuffling the input in place would.
        pool = list(group)
        repeated = []
        for _ in range(n):
            random.shuffle(pool)
            repeated.extend(pool)
        result.append(repeated)
    return result

In [None]:
def groupby_digit(data, target_col, text_col):
    """Bucket the values of ``text_col`` by label, then oversample each bucket.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing both ``target_col`` and ``text_col``.
    target_col : str
        Column whose distinct values define the buckets.
    text_col : str
        Column whose values are collected into the buckets.

    Returns
    -------
    list of list
        One list per distinct label, ordered by the label's first appearance
        in ``data``. Each list is the bucket passed through
        ``oversample(..., 20)``, i.e. 20 shuffled repetitions of its values.
    """
    # A local groupby replaces the former ``globals()['lst_<label>']`` buckets:
    # nothing leaks into the notebook namespace, and labels that merely share
    # a string form (e.g. 1 and '1') no longer end up in the same bucket.
    # sort=False keeps first-appearance order; dropna=False keeps a NaN label
    # as its own bucket instead of silently dropping those rows.
    grouped = data.groupby(target_col, sort=False, dropna=False)[text_col]
    buckets = [texts.tolist() for _, texts in grouped]
    return oversample(buckets, 20)

def combine_texts(lst):
    """Join the strings in ``lst`` with single spaces and trim the outer whitespace."""
    joined = ' '.join(lst)
    return joined.strip()

def collect_embs(data, col, model=None):
    """Look up the embedding of every value in ``data[col]``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing ``col``; missing values are replaced by ``''``
        before the lookup.
    col : str
        Column name, e.g. ``'text_obj'``.
    model : optional
        Object exposing ``.wv[...]``. When omitted, the model is taken from
        the module-level name ``ft_<suffix>``, where ``<suffix>`` is ``col``
        without its first five characters (``'text_obj'`` -> ``ft_obj``).

    Returns
    -------
    list
        ``model.wv[value]`` for each row, in row order.

    Raises
    ------
    KeyError
        If ``model`` is omitted and no matching ``ft_<suffix>`` global exists.
    """
    if model is None:
        # Resolve the global once up front instead of on every row.
        model = globals()[f'ft_{col[5:]}']
    vectors = model.wv
    # Only the requested column needs its NaNs filled, not the whole frame.
    return [vectors[w] for w in tqdm(data[col].fillna(''))]

In [None]:
# groupby_digit -> oversample shuffles with the global `random` state; seed it
# so the generated corpora are the same on every run.
random.seed(0)

# Fill missing text once and reuse the filled frame for all three fields.
file0_filled = file0.fillna('<unk>')
res_obj3 = groupby_digit(file0_filled, 'digit_3', 'text_obj')
res_mthd3 = groupby_digit(file0_filled, 'digit_3', 'text_mthd')
res_deal3 = groupby_digit(file0_filled, 'digit_3', 'text_deal')

#### size=50

In [None]:
# One skip-gram (sg=1) FastText model per text field, 50-dimensional vectors.
# collect_embs() finds these models through their global names ft_<field>.
ft_params = dict(sg=1, vector_size=50)
ft_obj = FastText(res_obj3, **ft_params)
ft_mthd = FastText(res_mthd3, **ft_params)
ft_deal = FastText(res_deal3, **ft_params)

In [None]:
# Per-row embedding frames for the three text fields (50 columns each).
# Missing text is replaced by the '<unk>' placeholder before the lookup.
file0_unk = file0.fillna('<unk>')
text_obj = pd.DataFrame(
    collect_embs(file0_unk, 'text_obj'),
    columns=['text_obj' + str(i) for i in range(50)],
)
text_mthd = pd.DataFrame(
    collect_embs(file0_unk, 'text_mthd'),
    columns=['text_mthd' + str(i) for i in range(50)],
)
text_deal = pd.DataFrame(
    collect_embs(file0_unk, 'text_deal'),
    columns=['text_deal' + str(i) for i in range(50)],
)

In [None]:
# Features: the three per-field embedding frames side by side.
feature_blocks = [text_obj, text_mthd, text_deal]
X = pd.concat(feature_blocks, axis=1)
# Targets: the encoded labels of all three classification levels.
target_cols = ['digit_1', 'digit_2', 'digit_3']
y = file0[target_cols]

In [None]:
# Hold out 30% of the rows for validation; the fixed seed keeps the split repeatable.
split_parts = train_test_split(X, y, test_size=0.3, random_state=0)
X_train, X_val, y_train, y_val = split_parts

In [None]:
# One target Series per level; columns are digit_1, digit_2, digit_3 by position.
y_train1, y_train2, y_train3 = (y_train.iloc[:, pos] for pos in range(3))
y_val1, y_val2, y_val3 = (y_val.iloc[:, pos] for pos in range(3))

In [None]:
# Logistic-regression baseline for the digit_1 target; reports accuracy and
# macro-averaged F1 on the validation split.
lr1 = LogisticRegression(random_state=0).fit(X_train, y_train1)
lr1_preds = lr1.predict(X_val)

lr1_acc = accuracy_score(y_val1, lr1_preds)
lr1_f1 = f1_score(y_val1, lr1_preds, average="macro")
print(f'accuracy: {lr1_acc}')
print(f'f1_score: {lr1_f1}')

# accuracy: 0.93848
# f1_score: 0.8632458207979568

In [None]:
# Logistic-regression baseline for the digit_2 target; reports accuracy and
# macro-averaged F1 on the validation split.
lr2 = LogisticRegression(random_state=0).fit(X_train, y_train2)
lr2_preds = lr2.predict(X_val)

lr2_acc = accuracy_score(y_val2, lr2_preds)
lr2_f1 = f1_score(y_val2, lr2_preds, average="macro")
print(f'accuracy: {lr2_acc}')
print(f'f1_score: {lr2_f1}')

# accuracy: 0.8933366666666667
# f1_score: 0.6452915384453939

In [None]:
# Logistic-regression baseline for the digit_3 target; reports accuracy and
# macro-averaged F1 on the validation split.
lr3 = LogisticRegression(random_state=0).fit(X_train, y_train3)
lr3_preds = lr3.predict(X_val)

lr3_acc = accuracy_score(y_val3, lr3_preds)
lr3_f1 = f1_score(y_val3, lr3_preds, average="macro")
print(f'accuracy: {lr3_acc}')
print(f'f1_score: {lr3_f1}')

# accuracy: 0.8241166666666667
# f1_score: 0.49906697377751386

#### size=100

In [None]:
# One skip-gram (sg=1) FastText model per text field, 100-dimensional vectors.
# collect_embs() finds these models through their global names ft_<field>.
ft_params = dict(sg=1, vector_size=100)
ft_obj = FastText(res_obj3, **ft_params)
ft_mthd = FastText(res_mthd3, **ft_params)
ft_deal = FastText(res_deal3, **ft_params)

In [None]:
# Per-row embedding frames for the three text fields (100 columns each).
# Missing text is replaced by the '<unk>' placeholder before the lookup.
file0_unk = file0.fillna('<unk>')
text_obj = pd.DataFrame(
    collect_embs(file0_unk, 'text_obj'),
    columns=['text_obj' + str(i) for i in range(100)],
)
text_mthd = pd.DataFrame(
    collect_embs(file0_unk, 'text_mthd'),
    columns=['text_mthd' + str(i) for i in range(100)],
)
text_deal = pd.DataFrame(
    collect_embs(file0_unk, 'text_deal'),
    columns=['text_deal' + str(i) for i in range(100)],
)

In [None]:
# Features: the three per-field embedding frames side by side.
feature_blocks = [text_obj, text_mthd, text_deal]
X = pd.concat(feature_blocks, axis=1)
# Targets: the encoded labels of all three classification levels.
target_cols = ['digit_1', 'digit_2', 'digit_3']
y = file0[target_cols]

In [None]:
# Hold out 30% of the rows for validation; the fixed seed keeps the split repeatable.
split_parts = train_test_split(X, y, test_size=0.3, random_state=0)
X_train, X_val, y_train, y_val = split_parts

In [None]:
# One target Series per level; columns are digit_1, digit_2, digit_3 by position.
y_train1, y_train2, y_train3 = (y_train.iloc[:, pos] for pos in range(3))
y_val1, y_val2, y_val3 = (y_val.iloc[:, pos] for pos in range(3))

In [None]:
# Logistic-regression baseline for the digit_1 target; reports accuracy and
# macro-averaged F1 on the validation split.
lr1 = LogisticRegression(random_state=0).fit(X_train, y_train1)
lr1_preds = lr1.predict(X_val)

lr1_acc = accuracy_score(y_val1, lr1_preds)
lr1_f1 = f1_score(y_val1, lr1_preds, average="macro")
print(f'accuracy: {lr1_acc}')
print(f'f1_score: {lr1_f1}')

# accuracy: 0.939
# f1_score: 0.864103859745507

In [None]:
# Logistic-regression baseline for the digit_2 target; reports accuracy and
# macro-averaged F1 on the validation split.
lr2 = LogisticRegression(random_state=0).fit(X_train, y_train2)
lr2_preds = lr2.predict(X_val)

lr2_acc = accuracy_score(y_val2, lr2_preds)
lr2_f1 = f1_score(y_val2, lr2_preds, average="macro")
print(f'accuracy: {lr2_acc}')
print(f'f1_score: {lr2_f1}')

# accuracy: 0.8925666666666666
# f1_score: 0.641267176474703

In [None]:
# Logistic-regression baseline for the digit_3 target; reports accuracy and
# macro-averaged F1 on the validation split.
lr3 = LogisticRegression(random_state=0).fit(X_train, y_train3)
lr3_preds = lr3.predict(X_val)

lr3_acc = accuracy_score(y_val3, lr3_preds)
lr3_f1 = f1_score(y_val3, lr3_preds, average="macro")
print(f'accuracy: {lr3_acc}')
print(f'f1_score: {lr3_f1}')

# accuracy: 0.8203
# f1_score: 0.49159866754532433

#### size=150

In [None]:
# One skip-gram (sg=1) FastText model per text field, 150-dimensional vectors.
# collect_embs() finds these models through their global names ft_<field>.
ft_params = dict(sg=1, vector_size=150)
ft_obj = FastText(res_obj3, **ft_params)
ft_mthd = FastText(res_mthd3, **ft_params)
ft_deal = FastText(res_deal3, **ft_params)

In [None]:
# Per-row embedding frames for the three text fields (150 columns each).
# Missing text is replaced by the '<unk>' placeholder before the lookup.
file0_unk = file0.fillna('<unk>')
text_obj = pd.DataFrame(
    collect_embs(file0_unk, 'text_obj'),
    columns=['text_obj' + str(i) for i in range(150)],
)
text_mthd = pd.DataFrame(
    collect_embs(file0_unk, 'text_mthd'),
    columns=['text_mthd' + str(i) for i in range(150)],
)
text_deal = pd.DataFrame(
    collect_embs(file0_unk, 'text_deal'),
    columns=['text_deal' + str(i) for i in range(150)],
)

In [None]:
# Features: the three per-field embedding frames side by side.
feature_blocks = [text_obj, text_mthd, text_deal]
X = pd.concat(feature_blocks, axis=1)
# Targets: the encoded labels of all three classification levels.
target_cols = ['digit_1', 'digit_2', 'digit_3']
y = file0[target_cols]

In [None]:
# Hold out 30% of the rows for validation; the fixed seed keeps the split repeatable.
split_parts = train_test_split(X, y, test_size=0.3, random_state=0)
X_train, X_val, y_train, y_val = split_parts

In [None]:
# One target Series per level; columns are digit_1, digit_2, digit_3 by position.
y_train1, y_train2, y_train3 = (y_train.iloc[:, pos] for pos in range(3))
y_val1, y_val2, y_val3 = (y_val.iloc[:, pos] for pos in range(3))

In [None]:
# Logistic-regression baseline for the digit_1 target; reports accuracy and
# macro-averaged F1 on the validation split.
lr1 = LogisticRegression(random_state=0).fit(X_train, y_train1)
lr1_preds = lr1.predict(X_val)

lr1_acc = accuracy_score(y_val1, lr1_preds)
lr1_f1 = f1_score(y_val1, lr1_preds, average="macro")
print(f'accuracy: {lr1_acc}')
print(f'f1_score: {lr1_f1}')

# accuracy: 0.9392133333333333
# f1_score: 0.8651085652869795

In [None]:
# Logistic-regression baseline for the digit_2 target; reports accuracy and
# macro-averaged F1 on the validation split.
lr2 = LogisticRegression(random_state=0).fit(X_train, y_train2)
lr2_preds = lr2.predict(X_val)

lr2_acc = accuracy_score(y_val2, lr2_preds)
lr2_f1 = f1_score(y_val2, lr2_preds, average="macro")
print(f'accuracy: {lr2_acc}')
print(f'f1_score: {lr2_f1}')

# accuracy: 0.8926966666666667
# f1_score: 0.6432172048759325

In [None]:
# Logistic-regression baseline for the digit_3 target; reports accuracy and
# macro-averaged F1 on the validation split.
lr3 = LogisticRegression(random_state=0).fit(X_train, y_train3)
lr3_preds = lr3.predict(X_val)

lr3_acc = accuracy_score(y_val3, lr3_preds)
lr3_f1 = f1_score(y_val3, lr3_preds, average="macro")
print(f'accuracy: {lr3_acc}')
print(f'f1_score: {lr3_f1}')

# accuracy: 0.8189733333333333
# f1_score: 0.49088425833191346

#### size=200

In [None]:
# One skip-gram (sg=1) FastText model per text field, 200-dimensional vectors.
# collect_embs() finds these models through their global names ft_<field>.
ft_params = dict(sg=1, vector_size=200)
ft_obj = FastText(res_obj3, **ft_params)
ft_mthd = FastText(res_mthd3, **ft_params)
ft_deal = FastText(res_deal3, **ft_params)

In [None]:
# Per-row embedding frames for the three text fields (200 columns each).
# Missing text is replaced by the '<unk>' placeholder before the lookup.
file0_unk = file0.fillna('<unk>')
text_obj = pd.DataFrame(
    collect_embs(file0_unk, 'text_obj'),
    columns=['text_obj' + str(i) for i in range(200)],
)
text_mthd = pd.DataFrame(
    collect_embs(file0_unk, 'text_mthd'),
    columns=['text_mthd' + str(i) for i in range(200)],
)
text_deal = pd.DataFrame(
    collect_embs(file0_unk, 'text_deal'),
    columns=['text_deal' + str(i) for i in range(200)],
)

In [None]:
# Rebuild X and y from the current embedding frames before splitting. The
# other size sections do this in a dedicated cell; without it the split would
# reuse whatever X an earlier section left behind instead of these embeddings.
X = pd.concat([text_obj, text_mthd, text_deal], axis=1)
y = file0[['digit_1', 'digit_2', 'digit_3']]

# Hold out 30% of the rows for validation; the fixed seed keeps the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=.3, random_state=0)

In [None]:
# One target Series per level, selected by column position in y_train / y_val.
y_train1, y_train2, y_train3 = (y_train.iloc[:, pos] for pos in range(3))
y_val1, y_val2, y_val3 = (y_val.iloc[:, pos] for pos in range(3))

In [None]:
# Logistic-regression baseline for the digit_1 target; reports accuracy and
# macro-averaged F1 on the validation split.
lr1 = LogisticRegression(random_state=0).fit(X_train, y_train1)
lr1_preds = lr1.predict(X_val)

lr1_acc = accuracy_score(y_val1, lr1_preds)
lr1_f1 = f1_score(y_val1, lr1_preds, average="macro")
print(f'accuracy: {lr1_acc}')
print(f'f1_score: {lr1_f1}')

# NOTE(review): the recorded figures below are digit-for-digit identical to
# the size=150 section, which suggests they were produced from the 150-d
# feature matrix rather than 200-d features. Re-run before relying on them.
# accuracy: 0.9392133333333333
# f1_score: 0.8651085652869795

In [None]:
# Logistic-regression baseline for the digit_2 target; reports accuracy and
# macro-averaged F1 on the validation split.
lr2 = LogisticRegression(random_state=0).fit(X_train, y_train2)
lr2_preds = lr2.predict(X_val)

lr2_acc = accuracy_score(y_val2, lr2_preds)
lr2_f1 = f1_score(y_val2, lr2_preds, average="macro")
print(f'accuracy: {lr2_acc}')
print(f'f1_score: {lr2_f1}')

# NOTE(review): the recorded figures below are digit-for-digit identical to
# the size=150 section, which suggests they were produced from the 150-d
# feature matrix rather than 200-d features. Re-run before relying on them.
# accuracy: 0.8926966666666667
# f1_score: 0.6432172048759325

In [None]:
# Logistic-regression baseline for the digit_3 target; reports accuracy and
# macro-averaged F1 on the validation split.
lr3 = LogisticRegression(random_state=0).fit(X_train, y_train3)
lr3_preds = lr3.predict(X_val)

lr3_acc = accuracy_score(y_val3, lr3_preds)
lr3_f1 = f1_score(y_val3, lr3_preds, average="macro")
print(f'accuracy: {lr3_acc}')
print(f'f1_score: {lr3_f1}')

# NOTE(review): the recorded figures below are digit-for-digit identical to
# the size=150 section, which suggests they were produced from the 150-d
# feature matrix rather than 200-d features. Re-run before relying on them.
# accuracy: 0.8189733333333333
# f1_score: 0.49088425833191346