In [1]:
import os
import re
import numpy as np
import pandas as pd

from os.path import join as pjoin

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier

from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier


In [70]:
import joblib

def save_model(filename, model):
    """Serialize ``model`` to disk with joblib and print the path that was used.

    When ``filename`` does not already end in ``.pkl`` or ``.pickle`` the
    ``.pickle`` suffix is appended before writing.

    Args:
        filename: Destination path for the serialized model.
        model: Object passed to ``joblib.dump``.
    """
    known_suffixes = ('.pkl', '.pickle')
    target = filename if filename.endswith(known_suffixes) else filename + '.pickle'
    print(target)
    joblib.dump(model, target)

In [4]:
def is_nan(string):
    """Return the result of comparing ``string`` for inequality with itself.

    A float NaN is unequal to itself, so the check yields ``True`` for NaN
    and ``False`` for ordinary values such as strings, numbers or ``None``.
    """
    differs_from_itself = string != string
    return differs_from_itself

### **Data Load**

In [6]:
# Load the pre-split CSV files, keeping only the text and label columns.
# Each frame is shuffled (frac=1 keeps every row). The fixed random_state
# makes the shuffled row order repeatable across kernel restarts, so the
# models trained on these frames see the same data order on every run.
train = pd.read_csv('train.csv')[['proc_text', 'label']]
train = train.sample(frac=1, random_state=42)

valid = pd.read_csv('valid.csv')[['proc_text', 'label']]
valid = valid.sample(frac=1, random_state=42)

test = pd.read_csv('test.csv')[['proc_text', 'label']]
test = test.sample(frac=1, random_state=42)

# Row counts per split.
len(train), len(valid), len(test)

(132999, 28498, 28498)

In [7]:
# Remove every row that has a missing value, then show the remaining sizes.
train, valid, test = (frame.dropna(axis=0) for frame in (train, valid, test))
len(train), len(valid), len(test)

(132861, 28470, 28457)

In [8]:
# Stack the training and validation rows into one frame with a fresh 0..n-1 index.
dev = pd.concat(objs=[train, valid], axis=0, ignore_index=True)

In [9]:
# Text inputs and labels of the combined train+valid frame.
x_dev, y_dev = dev['proc_text'], dev['label']

In [10]:
# Text inputs (x_*) and labels (y_*) for each split.
x_train, y_train = train['proc_text'], train['label']
x_valid, y_valid = valid['proc_text'], valid['label']
x_test, y_test = test['proc_text'], test['label']

In [11]:
# Learn the token vocabulary from the training texts only, then encode the
# validation and test texts with that same vocabulary.
count_vect = CountVectorizer()
x_train_counts = count_vect.fit_transform(x_train)
x_valid_counts, x_test_counts = (
    count_vect.transform(texts) for texts in (x_valid, x_test)
)

x_train_counts.shape

(132861, 336627)

In [12]:
# Feature pipeline fitted on the combined train+valid texts.
dev_cnt_vect = CountVectorizer()
dev_tfidf_transformer = TfidfTransformer()

# Fit both stages on the dev texts.
x_dev_counts = dev_cnt_vect.fit_transform(x_dev)
x_dev_tfidf = dev_tfidf_transformer.fit_transform(x_dev_counts)

# Encode the test texts with the dev-fitted stages.
x_dev_test_counts = dev_cnt_vect.transform(x_test)
x_dev_test_tfidf = dev_tfidf_transformer.transform(x_dev_test_counts)

In [13]:
# Fit the TF-IDF weighting on the training counts, then apply it unchanged
# to the validation and test counts.
tfidf_transformer = TfidfTransformer()
x_train_tfidf = tfidf_transformer.fit_transform(x_train_counts)
x_valid_tfidf, x_test_tfidf = (
    tfidf_transformer.transform(counts)
    for counts in (x_valid_counts, x_test_counts)
)

x_train_tfidf.shape

(132861, 336627)

### **Model Result**

In [None]:
# Table that collects the test sentences, their true labels and, later,
# one prediction column per model.
result = pd.DataFrame(
    columns=['sentence', 'label', 'xgboost-pred', 'dt-pred', 'svc-pred']
).assign(sentence=x_test, label=y_test)

In [46]:
def test_model(model, input='ㅆ 너는 눈깔이 어디에 달렸냐'):
    """Predict the label of a single sentence with a fitted model.

    An ``XGBClassifier`` instance is fed features from the train-fitted
    ``count_vect`` / ``tfidf_transformer`` pair (and its class name is
    printed); any other model gets features from the dev-fitted
    ``dev_cnt_vect`` / ``dev_tfidf_transformer`` pair.

    Args:
        model: Fitted estimator exposing ``predict``.
        input: Sentence to classify.

    Returns:
        Whatever ``model.predict`` returns for the one-row feature matrix.
    """
    if isinstance(model, XGBClassifier):
        print('XGBClassifier')
        vectorizer, weighter = count_vect, tfidf_transformer
    else:
        vectorizer, weighter = dev_cnt_vect, dev_tfidf_transformer
    features = weighter.transform(vectorizer.transform([input]))
    return model.predict(features)

### **XGBoost**

In [40]:
# Gradient-boosted trees for the binary label. The hyper-parameters are
# passed as a keyword mapping; num_class is intentionally left unset.
xgb_wrapper = XGBClassifier(**{
    'n_estimators': 5000,
    'learning_rate': 0.1,
    'max_depth': 8,
    'min_child_weight': 1,
    'gamma': 0,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'objective': 'binary:logistic',
    'nthread': -1,
    'scale_pos_weight': 1,
    'seed': 42,
})

# The validation split is the evaluation set that drives early stopping.
evals = [(x_valid_tfidf, y_valid)]
xgb_wrapper.fit(
    x_train_tfidf,
    y_train,
    eval_set=evals,
    early_stopping_rounds=100,
    verbose=True,
)

ws100_preds = xgb_wrapper.predict(x_test_tfidf)

[0]	validation_0-error:0.388479
Will train until validation_0-error hasn't improved in 100 rounds.
[1]	validation_0-error:0.38163
[2]	validation_0-error:0.375378
[3]	validation_0-error:0.370706
[4]	validation_0-error:0.370636
[5]	validation_0-error:0.366175
[6]	validation_0-error:0.359607
[7]	validation_0-error:0.353214
[8]	validation_0-error:0.353284
[9]	validation_0-error:0.347208
[10]	validation_0-error:0.347137
[11]	validation_0-error:0.344924
[12]	validation_0-error:0.339515
[13]	validation_0-error:0.336319
[14]	validation_0-error:0.334282
[15]	validation_0-error:0.334317
[16]	validation_0-error:0.331437
[17]	validation_0-error:0.331366
[18]	validation_0-error:0.331296
[19]	validation_0-error:0.325641
[20]	validation_0-error:0.325676
[21]	validation_0-error:0.321848
[22]	validation_0-error:0.321496
[23]	validation_0-error:0.321461
[24]	validation_0-error:0.319002
[25]	validation_0-error:0.317316
[26]	validation_0-error:0.317106
[27]	validation_0-error:0.315841
[28]	validation_0-er

In [43]:
# XGBoost on the test split: (accuracy, macro-averaged F1).
(
    np.mean(ws100_preds == y_test),
    f1_score(y_true=y_test, y_pred=ws100_preds, average='macro'),
)

(0.8494922163263872, 0.8493958059382059)

In [None]:
# Store the XGBoost test predictions in the result table.
result['xgboost-pred'] = ws100_preds

### **Decision Tree**

In [21]:
# Entropy-based decision tree trained on the combined train+valid features.
dt_wrapper = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=500,
    random_state=42,
).fit(x_dev_tfidf, y_dev)

dt_preds = dt_wrapper.predict(x_dev_test_tfidf)

In [33]:
# Decision tree on the test split: (accuracy, per-class F1 scores).
(
    np.mean(dt_preds == y_test),
    f1_score(y_true=y_test, y_pred=dt_preds, average=None),
)

(0.7901746494711319, array([0.80741816, 0.76953954]))

In [49]:
# Spot-check the decision tree on a single hand-written sentence.
test_model(model=dt_wrapper, input='좆까')

array([0])

In [None]:
# Store the decision-tree test predictions in the result table.
result['dt-pred'] = dt_preds

### **Support Vector Classifier**

In [14]:
# RBF-kernel support vector classifier (unfitted at this point).
svc_clf = SVC(kernel='rbf', C=50, gamma=1)

In [50]:
# Train the SVC on the combined train+valid features.
svc_clf.fit(X=x_dev_tfidf, y=y_dev)

SVC(C=50, gamma=1)

In [51]:
# SVC on the test split: predictions, then (accuracy, macro-averaged F1).
svc_preds = svc_clf.predict(x_dev_test_tfidf)
(
    np.mean(svc_preds == y_test),
    f1_score(y_true=y_test, y_pred=svc_preds, average='macro'),
)

(0.8909231472045542, 0.890231424128916)

In [68]:
# Spot-check the SVC on a single hand-written sentence.
test_model(model=svc_clf, input='많이 까다롭구나...')

array([1])

In [None]:
# Store the SVC test predictions in the result table.
result['svc-pred'] = svc_preds

In [71]:
# Persist the fitted SVC; it can be restored later with joblib.load('svc-model.pkl').
save_model(filename='svc-model.pkl', model=svc_clf)

svc-model.pkl


### **KNeighbors**

In [24]:
# k-nearest-neighbours classifier trained on the combined train+valid features.
# NOTE(review): the TF-IDF matrix is presumably sparse; scikit-learn is
# expected to fall back to brute-force search for sparse input, in which case
# `algorithm` and `leaf_size` have no effect — confirm against the fit warnings.
kn_clf = KNeighborsClassifier(
    n_neighbors=300,
    algorithm='ball_tree',
    leaf_size=2,
    n_jobs=-1,
).fit(x_dev_tfidf, y_dev)

kn_preds = kn_clf.predict(x_dev_test_tfidf)



In [25]:
# KNN on the test split: (accuracy, macro-averaged F1).
(
    np.mean(kn_preds == y_test),
    f1_score(y_true=y_test, y_pred=kn_preds, average='macro'),
)

(0.5363882348806972, 0.3692710034992872)

In [19]:
# Store the KNN test predictions in the result table; this column is not in
# the initial column list, so it is appended as a new column.
result['kn-pred'] = kn_preds

### **Export Result**

In [None]:
# Write the collected sentences, labels and predictions to CSV without the index column.
result.to_csv(path_or_buf='ml-result.csv', index=False)