In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
import re

In [2]:
train = pd.read_csv('../data_worthcheck/train.csv', index_col=0)
test = pd.read_csv('../data_worthcheck/test.csv')
dev = pd.read_csv('../data_worthcheck/dev.csv')

In [3]:
print(train.info())
print(test.info())
print(dev.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 21601 entries, 0 to 21600
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text_a  21601 non-null  object
 1   label   21601 non-null  object
dtypes: object(2)
memory usage: 506.3+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2800 entries, 0 to 2799
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text_a  2800 non-null   object
 1   label   2800 non-null   object
dtypes: object(2)
memory usage: 43.9+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2800 entries, 0 to 2799
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text_a  2800 non-null   object
 1   label   2800 non-null   object
dtypes: object(2)
memory usage: 43.9+ KB
None


# Preprocessing

1. Casefolding
2. Remove Non Alphabet (Numbers, Punctuation)
3. Tfidf Vectorizer

In [4]:
def preprocess(col):
    col_copy = col.copy()
    col_copy = col_copy.str.lower()
    col_copy = col_copy.str.replace(r'[^a-z\s]', ' ', regex=True)
    col_copy = col_copy.str.replace(r'\s+', ' ', regex=True).str.strip()
    # drop with len == 0
    col_copy = col_copy[col_copy.str.len() > 0]
    return col_copy

In [5]:
train_preprocessed_text = preprocess(train['text_a'])
test_preprocessed_text = preprocess(test['text_a'])
dev_preprocessed_text = preprocess(dev['text_a'])

label_train = train.loc[train_preprocessed_text.index, 'label']
label_test = test.loc[test_preprocessed_text.index, 'label']
label_dev = dev.loc[dev_preprocessed_text.index, 'label']

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

vect = TfidfVectorizer(analyzer='word',ngram_range=(1,1))
X_train = vect.fit_transform(train_preprocessed_text)
X_test = vect.transform(test_preprocessed_text)
X_dev = vect.transform(dev_preprocessed_text)

In [7]:
# encode label
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y_train = le.fit_transform(label_train)
y_test = le.transform(label_test)
y_dev = le.transform(label_dev)

# Naive Bayes

In [8]:
# naive bayes
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()
nb.fit(X_train, y_train)

# SVM

In [11]:
# svm
from sklearn.svm import SVC
svm = SVC(kernel='rbf', C=1.0, random_state=0)
svm.fit(X_train, y_train)

# XGBoost

In [12]:
# xgboost
from xgboost import XGBClassifier
xgb = XGBClassifier()
xgb.fit(X_train, y_train)

# Decision Tree

In [9]:
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)

# Random Forest

In [13]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier()
rf.fit(X_train, y_train)

# Logistic Regression

In [14]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()
lr.fit(X_train, y_train)

# Evaluation

In [15]:
# f1 score
from sklearn.metrics import f1_score

print('Naive Bayes')
print(f1_score(y_test, nb.predict(X_test)))
print('SVM')
print(f1_score(y_dev, svm.predict(X_dev)))
print('XGBoost')
print(f1_score(y_dev, xgb.predict(X_dev)))
print('Decision Tree')
print(f1_score(y_dev, dt.predict(X_dev)))
print('Random Forest')
print(f1_score(y_dev, rf.predict(X_dev)))
print('Logistic Regression')
print(f1_score(y_dev, lr.predict(X_dev)))

Naive Bayes
0.5962264150943396
SVM
0.7268588770864947
XGBoost
0.6374501992031872
Decision Tree
0.5960890087660149
Random Forest
0.6683459277917716
Logistic Regression
0.6961240310077519


# Grid Search

In [16]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

parameters = [
    {
        'vect__ngram_range': [(1, 1), (1, 2), (2, 2)],
        'vect__analyzer': ['word', 'char', 'char_wb'],
        'clf__alpha': [0.1, 1.0, 10.0],
    },
    # {
    #     'clf': [SVC()],
    #     'vect__ngram_range': [(1, 1), (1, 2), (2, 2)],
    #     'vect__analyzer': ['word', 'char', 'char_wb'],
    #     'clf__kernel': ['poly', 'rbf'],
    #     'clf__degree': [2, 3, 4],
    #     'clf__C': [0.1, 1.0, 10.0],
    # },
    {
        'clf': [XGBClassifier()],
        'vect__ngram_range': [(1, 1), (1, 2), (2, 2)],
        'vect__analyzer': ['word', 'char', 'char_wb'],
        'clf__max_depth': [3, 4, 5],
        'clf__learning_rate': [0.1, 0.2, 0.3],
        'clf__n_estimators': [100, 200, 300],
    },
    {
        'clf': [DecisionTreeClassifier()],
        'vect__ngram_range': [(1, 1), (1, 2), (2, 2)],
        'vect__analyzer': ['word', 'char', 'char_wb'],
        'clf__max_depth': [3, 4, 5],
        'clf__min_samples_split': [2, 3, 4],
        'clf__min_samples_leaf': [1, 2, 3],
    },
    {
        'clf': [RandomForestClassifier()],
        'vect__ngram_range': [(1, 1), (1, 2), (2, 2)],
        'vect__analyzer': ['word', 'char', 'char_wb'],
        'clf__max_depth': [3, 4, 5],
        'clf__min_samples_split': [2, 3, 4],
        'clf__min_samples_leaf': [1, 2, 3],
        'clf__n_estimators': [100, 200, 300],
    },
    {
        'clf': [LogisticRegression()],
        'vect__ngram_range': [(1, 1), (1, 2), (2, 2)],
        'vect__analyzer': ['word', 'char', 'char_wb'],
        'clf__penalty': ['l1', 'l2'],
        'clf__C': [0.1, 1.0, 10.0],
    },
]

In [17]:
from sklearn.model_selection import PredefinedSplit

train_fold = [-1 for _ in range(len(train_preprocessed_text))]
dev_fold = [0 for _ in range(len(dev_preprocessed_text))]
ps = PredefinedSplit(test_fold=np.concatenate([train_fold, dev_fold]))

X = pd.concat([train_preprocessed_text, dev_preprocessed_text]).reset_index(drop=True)
y = np.concatenate([y_train, y_dev])

In [18]:
for train_index, test_index in ps.split():
    # dev and test are the same
    assert any(X.loc[test_index].reset_index(drop=True) == test_preprocessed_text)
    assert any(y[test_index] == y_test)
    assert any(X.loc[test_index].reset_index(drop=True) == dev_preprocessed_text)
    assert any(y[test_index] == y_dev)

In [17]:
# gridsearch
# score f1
gs = GridSearchCV(Pipeline([('vect', TfidfVectorizer()),('clf', MultinomialNB())]), parameters, cv=ps, scoring='f1',verbose=3)
gs.fit(X, y)

Fitting 1 folds for each of 1296 candidates, totalling 1296 fits
[CV 1/1] END clf__alpha=0.1, vect__analyzer=word, vect__ngram_range=(1, 1);, score=0.693 total time=   0.2s
[CV 1/1] END clf__alpha=0.1, vect__analyzer=word, vect__ngram_range=(1, 2);, score=0.728 total time=   0.7s
[CV 1/1] END clf__alpha=0.1, vect__analyzer=word, vect__ngram_range=(2, 2);, score=0.672 total time=   0.5s
[CV 1/1] END clf__alpha=0.1, vect__analyzer=char, vect__ngram_range=(1, 1);, score=0.000 total time=   0.3s
[CV 1/1] END clf__alpha=0.1, vect__analyzer=char, vect__ngram_range=(1, 2);, score=0.173 total time=   1.1s
[CV 1/1] END clf__alpha=0.1, vect__analyzer=char, vect__ngram_range=(2, 2);, score=0.305 total time=   0.7s
[CV 1/1] END clf__alpha=0.1, vect__analyzer=char_wb, vect__ngram_range=(1, 1);, score=0.000 total time=   0.8s
[CV 1/1] END clf__alpha=0.1, vect__analyzer=char_wb, vect__ngram_range=(1, 2);, score=0.096 total time=   1.7s
[CV 1/1] END clf__alpha=0.1, vect__analyzer=char_wb, vect__ngram_

Traceback (most recent call last):
  File "d:\Users\rando\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "d:\Users\rando\anaconda3\lib\site-packages\sklearn\pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "d:\Users\rando\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "d:\Users\rando\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 443, in _check_solver
    raise ValueError("Solver %s supports only 'l2' or 'none' penalties, "
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.



[CV 1/1] END clf=LogisticRegression(), clf__C=0.1, clf__penalty=l1, vect__analyzer=word, vect__ngram_range=(1, 1);, score=nan total time=   0.2s


Traceback (most recent call last):
  File "d:\Users\rando\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "d:\Users\rando\anaconda3\lib\site-packages\sklearn\pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "d:\Users\rando\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "d:\Users\rando\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 443, in _check_solver
    raise ValueError("Solver %s supports only 'l2' or 'none' penalties, "
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.



[CV 1/1] END clf=LogisticRegression(), clf__C=0.1, clf__penalty=l1, vect__analyzer=word, vect__ngram_range=(1, 2);, score=nan total time=   0.7s


Traceback (most recent call last):
  File "d:\Users\rando\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "d:\Users\rando\anaconda3\lib\site-packages\sklearn\pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "d:\Users\rando\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "d:\Users\rando\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 443, in _check_solver
    raise ValueError("Solver %s supports only 'l2' or 'none' penalties, "
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.



[CV 1/1] END clf=LogisticRegression(), clf__C=0.1, clf__penalty=l1, vect__analyzer=word, vect__ngram_range=(2, 2);, score=nan total time=   0.5s


Traceback (most recent call last):
  File "d:\Users\rando\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "d:\Users\rando\anaconda3\lib\site-packages\sklearn\pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "d:\Users\rando\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "d:\Users\rando\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 443, in _check_solver
    raise ValueError("Solver %s supports only 'l2' or 'none' penalties, "
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.



[CV 1/1] END clf=LogisticRegression(), clf__C=0.1, clf__penalty=l1, vect__analyzer=char, vect__ngram_range=(1, 1);, score=nan total time=   0.3s


Traceback (most recent call last):
  File "d:\Users\rando\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "d:\Users\rando\anaconda3\lib\site-packages\sklearn\pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "d:\Users\rando\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "d:\Users\rando\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 443, in _check_solver
    raise ValueError("Solver %s supports only 'l2' or 'none' penalties, "
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.



[CV 1/1] END clf=LogisticRegression(), clf__C=0.1, clf__penalty=l1, vect__analyzer=char, vect__ngram_range=(1, 2);, score=nan total time=   1.0s


Traceback (most recent call last):
  File "d:\Users\rando\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "d:\Users\rando\anaconda3\lib\site-packages\sklearn\pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "d:\Users\rando\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "d:\Users\rando\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 443, in _check_solver
    raise ValueError("Solver %s supports only 'l2' or 'none' penalties, "
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.



[CV 1/1] END clf=LogisticRegression(), clf__C=0.1, clf__penalty=l1, vect__analyzer=char, vect__ngram_range=(2, 2);, score=nan total time=   0.7s


Traceback (most recent call last):
  File "d:\Users\rando\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "d:\Users\rando\anaconda3\lib\site-packages\sklearn\pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "d:\Users\rando\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "d:\Users\rando\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 443, in _check_solver
    raise ValueError("Solver %s supports only 'l2' or 'none' penalties, "
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.



[CV 1/1] END clf=LogisticRegression(), clf__C=0.1, clf__penalty=l1, vect__analyzer=char_wb, vect__ngram_range=(1, 1);, score=nan total time=   0.7s


Traceback (most recent call last):
  File "d:\Users\rando\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "d:\Users\rando\anaconda3\lib\site-packages\sklearn\pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "d:\Users\rando\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "d:\Users\rando\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 443, in _check_solver
    raise ValueError("Solver %s supports only 'l2' or 'none' penalties, "
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.



[CV 1/1] END clf=LogisticRegression(), clf__C=0.1, clf__penalty=l1, vect__analyzer=char_wb, vect__ngram_range=(1, 2);, score=nan total time=   1.5s


Traceback (most recent call last):
  File "d:\Users\rando\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "d:\Users\rando\anaconda3\lib\site-packages\sklearn\pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "d:\Users\rando\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "d:\Users\rando\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 443, in _check_solver
    raise ValueError("Solver %s supports only 'l2' or 'none' penalties, "
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.



[CV 1/1] END clf=LogisticRegression(), clf__C=0.1, clf__penalty=l1, vect__analyzer=char_wb, vect__ngram_range=(2, 2);, score=nan total time=   0.8s
[CV 1/1] END clf=LogisticRegression(), clf__C=0.1, clf__penalty=l2, vect__analyzer=word, vect__ngram_range=(1, 1);, score=0.496 total time=   0.4s
[CV 1/1] END clf=LogisticRegression(), clf__C=0.1, clf__penalty=l2, vect__analyzer=word, vect__ngram_range=(1, 2);, score=0.439 total time=   1.8s
[CV 1/1] END clf=LogisticRegression(), clf__C=0.1, clf__penalty=l2, vect__analyzer=word, vect__ngram_range=(2, 2);, score=0.317 total time=   1.2s
[CV 1/1] END clf=LogisticRegression(), clf__C=0.1, clf__penalty=l2, vect__analyzer=char, vect__ngram_range=(1, 1);, score=0.289 total time=   0.3s
[CV 1/1] END clf=LogisticRegression(), clf__C=0.1, clf__penalty=l2, vect__analyzer=char, vect__ngram_range=(1, 2);, score=0.557 total time=   1.3s
[CV 1/1] END clf=LogisticRegression(), clf__C=0.1, clf__penalty=l2, vect__analyzer=char, vect__ngram_range=(2, 2);, s

Traceback (most recent call last):
  File "d:\Users\rando\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "d:\Users\rando\anaconda3\lib\site-packages\sklearn\pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "d:\Users\rando\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "d:\Users\rando\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 443, in _check_solver
    raise ValueError("Solver %s supports only 'l2' or 'none' penalties, "
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.



[CV 1/1] END clf=LogisticRegression(), clf__C=1.0, clf__penalty=l1, vect__analyzer=word, vect__ngram_range=(1, 1);, score=nan total time=   0.2s


Traceback (most recent call last):
  File "d:\Users\rando\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "d:\Users\rando\anaconda3\lib\site-packages\sklearn\pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "d:\Users\rando\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "d:\Users\rando\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 443, in _check_solver
    raise ValueError("Solver %s supports only 'l2' or 'none' penalties, "
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.



[CV 1/1] END clf=LogisticRegression(), clf__C=1.0, clf__penalty=l1, vect__analyzer=word, vect__ngram_range=(1, 2);, score=nan total time=   0.7s


Traceback (most recent call last):
  File "d:\Users\rando\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "d:\Users\rando\anaconda3\lib\site-packages\sklearn\pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "d:\Users\rando\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "d:\Users\rando\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 443, in _check_solver
    raise ValueError("Solver %s supports only 'l2' or 'none' penalties, "
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.



[CV 1/1] END clf=LogisticRegression(), clf__C=1.0, clf__penalty=l1, vect__analyzer=word, vect__ngram_range=(2, 2);, score=nan total time=   0.5s


Traceback (most recent call last):
  File "d:\Users\rando\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "d:\Users\rando\anaconda3\lib\site-packages\sklearn\pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "d:\Users\rando\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "d:\Users\rando\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 443, in _check_solver
    raise ValueError("Solver %s supports only 'l2' or 'none' penalties, "
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.



[CV 1/1] END clf=LogisticRegression(), clf__C=1.0, clf__penalty=l1, vect__analyzer=char, vect__ngram_range=(1, 1);, score=nan total time=   0.3s


Traceback (most recent call last):
  File "d:\Users\rando\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "d:\Users\rando\anaconda3\lib\site-packages\sklearn\pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "d:\Users\rando\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "d:\Users\rando\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 443, in _check_solver
    raise ValueError("Solver %s supports only 'l2' or 'none' penalties, "
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.



[CV 1/1] END clf=LogisticRegression(), clf__C=1.0, clf__penalty=l1, vect__analyzer=char, vect__ngram_range=(1, 2);, score=nan total time=   1.0s


Traceback (most recent call last):
  File "d:\Users\rando\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "d:\Users\rando\anaconda3\lib\site-packages\sklearn\pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "d:\Users\rando\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "d:\Users\rando\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 443, in _check_solver
    raise ValueError("Solver %s supports only 'l2' or 'none' penalties, "
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.



[CV 1/1] END clf=LogisticRegression(), clf__C=1.0, clf__penalty=l1, vect__analyzer=char, vect__ngram_range=(2, 2);, score=nan total time=   0.7s


Traceback (most recent call last):
  File "d:\Users\rando\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "d:\Users\rando\anaconda3\lib\site-packages\sklearn\pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "d:\Users\rando\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "d:\Users\rando\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 443, in _check_solver
    raise ValueError("Solver %s supports only 'l2' or 'none' penalties, "
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.



[CV 1/1] END clf=LogisticRegression(), clf__C=1.0, clf__penalty=l1, vect__analyzer=char_wb, vect__ngram_range=(1, 1);, score=nan total time=   0.7s


Traceback (most recent call last):
  File "d:\Users\rando\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "d:\Users\rando\anaconda3\lib\site-packages\sklearn\pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "d:\Users\rando\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "d:\Users\rando\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 443, in _check_solver
    raise ValueError("Solver %s supports only 'l2' or 'none' penalties, "
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.



[CV 1/1] END clf=LogisticRegression(), clf__C=1.0, clf__penalty=l1, vect__analyzer=char_wb, vect__ngram_range=(1, 2);, score=nan total time=   1.5s


Traceback (most recent call last):
  File "d:\Users\rando\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "d:\Users\rando\anaconda3\lib\site-packages\sklearn\pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "d:\Users\rando\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "d:\Users\rando\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 443, in _check_solver
    raise ValueError("Solver %s supports only 'l2' or 'none' penalties, "
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.



[CV 1/1] END clf=LogisticRegression(), clf__C=1.0, clf__penalty=l1, vect__analyzer=char_wb, vect__ngram_range=(2, 2);, score=nan total time=   0.8s
[CV 1/1] END clf=LogisticRegression(), clf__C=1.0, clf__penalty=l2, vect__analyzer=word, vect__ngram_range=(1, 1);, score=0.695 total time=   0.8s
[CV 1/1] END clf=LogisticRegression(), clf__C=1.0, clf__penalty=l2, vect__analyzer=word, vect__ngram_range=(1, 2);, score=0.685 total time=   2.7s
[CV 1/1] END clf=LogisticRegression(), clf__C=1.0, clf__penalty=l2, vect__analyzer=word, vect__ngram_range=(2, 2);, score=0.540 total time=   1.7s
[CV 1/1] END clf=LogisticRegression(), clf__C=1.0, clf__penalty=l2, vect__analyzer=char, vect__ngram_range=(1, 1);, score=0.413 total time=   0.4s
[CV 1/1] END clf=LogisticRegression(), clf__C=1.0, clf__penalty=l2, vect__analyzer=char, vect__ngram_range=(1, 2);, score=0.626 total time=   1.4s
[CV 1/1] END clf=LogisticRegression(), clf__C=1.0, clf__penalty=l2, vect__analyzer=char, vect__ngram_range=(2, 2);, s

Traceback (most recent call last):
  File "d:\Users\rando\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "d:\Users\rando\anaconda3\lib\site-packages\sklearn\pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "d:\Users\rando\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "d:\Users\rando\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 443, in _check_solver
    raise ValueError("Solver %s supports only 'l2' or 'none' penalties, "
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.



[CV 1/1] END clf=LogisticRegression(), clf__C=10.0, clf__penalty=l1, vect__analyzer=word, vect__ngram_range=(1, 1);, score=nan total time=   0.2s


Traceback (most recent call last):
  File "d:\Users\rando\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "d:\Users\rando\anaconda3\lib\site-packages\sklearn\pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "d:\Users\rando\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "d:\Users\rando\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 443, in _check_solver
    raise ValueError("Solver %s supports only 'l2' or 'none' penalties, "
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.



[CV 1/1] END clf=LogisticRegression(), clf__C=10.0, clf__penalty=l1, vect__analyzer=word, vect__ngram_range=(1, 2);, score=nan total time=   0.7s


Traceback (most recent call last):
  File "d:\Users\rando\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "d:\Users\rando\anaconda3\lib\site-packages\sklearn\pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "d:\Users\rando\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "d:\Users\rando\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 443, in _check_solver
    raise ValueError("Solver %s supports only 'l2' or 'none' penalties, "
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.



[CV 1/1] END clf=LogisticRegression(), clf__C=10.0, clf__penalty=l1, vect__analyzer=word, vect__ngram_range=(2, 2);, score=nan total time=   0.5s


Traceback (most recent call last):
  File "d:\Users\rando\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "d:\Users\rando\anaconda3\lib\site-packages\sklearn\pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "d:\Users\rando\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "d:\Users\rando\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 443, in _check_solver
    raise ValueError("Solver %s supports only 'l2' or 'none' penalties, "
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.



[CV 1/1] END clf=LogisticRegression(), clf__C=10.0, clf__penalty=l1, vect__analyzer=char, vect__ngram_range=(1, 1);, score=nan total time=   0.3s


Traceback (most recent call last):
  File "d:\Users\rando\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "d:\Users\rando\anaconda3\lib\site-packages\sklearn\pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "d:\Users\rando\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "d:\Users\rando\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 443, in _check_solver
    raise ValueError("Solver %s supports only 'l2' or 'none' penalties, "
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.



[CV 1/1] END clf=LogisticRegression(), clf__C=10.0, clf__penalty=l1, vect__analyzer=char, vect__ngram_range=(1, 2);, score=nan total time=   1.0s


Traceback (most recent call last):
  File "d:\Users\rando\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "d:\Users\rando\anaconda3\lib\site-packages\sklearn\pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "d:\Users\rando\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "d:\Users\rando\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 443, in _check_solver
    raise ValueError("Solver %s supports only 'l2' or 'none' penalties, "
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.



[CV 1/1] END clf=LogisticRegression(), clf__C=10.0, clf__penalty=l1, vect__analyzer=char, vect__ngram_range=(2, 2);, score=nan total time=   0.6s


Traceback (most recent call last):
  File "d:\Users\rando\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "d:\Users\rando\anaconda3\lib\site-packages\sklearn\pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "d:\Users\rando\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "d:\Users\rando\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 443, in _check_solver
    raise ValueError("Solver %s supports only 'l2' or 'none' penalties, "
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.



[CV 1/1] END clf=LogisticRegression(), clf__C=10.0, clf__penalty=l1, vect__analyzer=char_wb, vect__ngram_range=(1, 1);, score=nan total time=   0.7s


Traceback (most recent call last):
  File "d:\Users\rando\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "d:\Users\rando\anaconda3\lib\site-packages\sklearn\pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "d:\Users\rando\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "d:\Users\rando\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 443, in _check_solver
    raise ValueError("Solver %s supports only 'l2' or 'none' penalties, "
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.



[CV 1/1] END clf=LogisticRegression(), clf__C=10.0, clf__penalty=l1, vect__analyzer=char_wb, vect__ngram_range=(1, 2);, score=nan total time=   1.5s


Traceback (most recent call last):
  File "d:\Users\rando\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "d:\Users\rando\anaconda3\lib\site-packages\sklearn\pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "d:\Users\rando\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "d:\Users\rando\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 443, in _check_solver
    raise ValueError("Solver %s supports only 'l2' or 'none' penalties, "
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.



[CV 1/1] END clf=LogisticRegression(), clf__C=10.0, clf__penalty=l1, vect__analyzer=char_wb, vect__ngram_range=(2, 2);, score=nan total time=   0.8s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 1/1] END clf=LogisticRegression(), clf__C=10.0, clf__penalty=l2, vect__analyzer=word, vect__ngram_range=(1, 1);, score=0.715 total time=   0.8s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 1/1] END clf=LogisticRegression(), clf__C=10.0, clf__penalty=l2, vect__analyzer=word, vect__ngram_range=(1, 2);, score=0.724 total time=   2.8s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 1/1] END clf=LogisticRegression(), clf__C=10.0, clf__penalty=l2, vect__analyzer=word, vect__ngram_range=(2, 2);, score=0.633 total time=   2.1s
[CV 1/1] END clf=LogisticRegression(), clf__C=10.0, clf__penalty=l2, vect__analyzer=char, vect__ngram_range=(1, 1);, score=0.417 total time=   0.4s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 1/1] END clf=LogisticRegression(), clf__C=10.0, clf__penalty=l2, vect__analyzer=char, vect__ngram_range=(1, 2);, score=0.622 total time=   1.5s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 1/1] END clf=LogisticRegression(), clf__C=10.0, clf__penalty=l2, vect__analyzer=char, vect__ngram_range=(2, 2);, score=0.625 total time=   1.0s
[CV 1/1] END clf=LogisticRegression(), clf__C=10.0, clf__penalty=l2, vect__analyzer=char_wb, vect__ngram_range=(1, 1);, score=0.419 total time=   0.9s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 1/1] END clf=LogisticRegression(), clf__C=10.0, clf__penalty=l2, vect__analyzer=char_wb, vect__ngram_range=(1, 2);, score=0.624 total time=   2.0s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 1/1] END clf=LogisticRegression(), clf__C=10.0, clf__penalty=l2, vect__analyzer=char_wb, vect__ngram_range=(2, 2);, score=0.624 total time=   1.3s


GridSearchCV(cv=PredefinedSplit(test_fold=array([-1, -1, ...,  0,  0])),
             estimator=Pipeline(steps=[('vect', TfidfVectorizer()),
                                       ('clf', MultinomialNB())]),
             param_grid=[{'clf__alpha': [0.1, 1.0, 10.0],
                          'vect__analyzer': ['word', 'char', 'char_wb'],
                          'vect__ngram_range': [(1, 1), (1, 2), (2, 2)]},
                         {'clf': [XGBClassifier(base_score=None, booster=None,
                                                callbacks=None,
                                                colsample_byle...
                          'clf__min_samples_leaf': [1, 2, 3],
                          'clf__min_samples_split': [2, 3, 4],
                          'clf__n_estimators': [100, 200, 300],
                          'vect__analyzer': ['word', 'char', 'char_wb'],
                          'vect__ngram_range': [(1, 1), (1, 2), (2, 2)]},
                         {'clf': [Logistic

In [51]:
gs.best_params_

{'clf__alpha': 0.1, 'vect__analyzer': 'word', 'vect__ngram_range': (1, 2)}

In [59]:
gs.best_score_

0.728110599078341

In [None]:
svc_pipeline = Pipeline([('vect', TfidfVectorizer()),('clf', SVC())])

svc_params = {
        'vect__ngram_range': [(1, 1), (1, 2), (2, 2)],
        'vect__analyzer': ['word', 'char', 'char_wb'],
        'clf__kernel': ['poly', 'rbf'],
        # 'clf__degree': [2, 3, 4],
        # 'clf__C': [0.1, 1.0, 10.0],
    }

svc_gs = GridSearchCV(svc_pipeline, svc_params, cv=ps, scoring='f1',verbose=3)

In [None]:
svc_gs.fit(X, y)

Fitting 1 folds for each of 18 candidates, totalling 18 fits
[CV 1/1] END clf__kernel=poly, vect__analyzer=word, vect__ngram_range=(1, 1);, score=0.444 total time= 1.8min
[CV 1/1] END clf__kernel=poly, vect__analyzer=word, vect__ngram_range=(1, 2);, score=0.324 total time= 3.1min
[CV 1/1] END clf__kernel=poly, vect__analyzer=word, vect__ngram_range=(2, 2);, score=0.315 total time= 2.6min
[CV 1/1] END clf__kernel=poly, vect__analyzer=char, vect__ngram_range=(1, 1);, score=0.537 total time=  25.3s
[CV 1/1] END clf__kernel=poly, vect__analyzer=char, vect__ngram_range=(1, 2);, score=0.695 total time= 5.0min
[CV 1/1] END clf__kernel=poly, vect__analyzer=char, vect__ngram_range=(2, 2);, score=0.688 total time= 7.1min
[CV 1/1] END clf__kernel=poly, vect__analyzer=char_wb, vect__ngram_range=(1, 1);, score=0.479 total time=  20.7s
[CV 1/1] END clf__kernel=poly, vect__analyzer=char_wb, vect__ngram_range=(1, 2);, score=0.680 total time= 3.6min
[CV 1/1] END clf__kernel=poly, vect__analyzer=char_wb

GridSearchCV(cv=PredefinedSplit(test_fold=array([-1, -1, ...,  0,  0])),
             estimator=Pipeline(steps=[('vect', TfidfVectorizer()),
                                       ('clf', SVC())]),
             param_grid={'clf__kernel': ['poly', 'rbf'],
                         'vect__analyzer': ['word', 'char', 'char_wb'],
                         'vect__ngram_range': [(1, 1), (1, 2), (2, 2)]},
             scoring='f1', verbose=3)

In [22]:
svc_gs.best_estimator_

Pipeline(steps=[('vect', TfidfVectorizer()), ('clf', SVC())])

In [23]:
svc_gs.best_score_

0.7268588770864947

In [148]:
svc = Pipeline([('vect', TfidfVectorizer(analyzer='word', ngram_range=(1, 2))),('clf', SVC(kernel='rbf'))])
svc.fit(train_preprocessed_text, y_train)

Pipeline(steps=[('vect', TfidfVectorizer(ngram_range=(1, 2))), ('clf', SVC())])

In [149]:
f1_score(y_test, svc.predict(test_preprocessed_text))

0.715925394548063

# Data Re-Preprocessing

## Normalize

In [20]:
def build_alay_dict():
    df = pd.read_csv('https://raw.githubusercontent.com/nasalsabila/kamus-alay/master/colloquial-indonesian-lexicon.csv')
    return dict(zip(df['slang'], df['formal']))

def translate_alay(input_string, alay_dict):
    string_splitted = input_string.split(" ")
    for i in range(len(string_splitted)):
        if(string_splitted[i] in alay_dict):
            string_splitted[i] = alay_dict[string_splitted[i]]
    return ' '.join(string_splitted)

In [21]:
alay_dict = build_alay_dict()
train_normalized = train_preprocessed_text.apply(lambda x: translate_alay(x, alay_dict))
dev_normalized = dev_preprocessed_text.apply(lambda x: translate_alay(x, alay_dict))
test_normalized = test_preprocessed_text.apply(lambda x: translate_alay(x, alay_dict))

In [22]:
nb = Pipeline([('vect', TfidfVectorizer(analyzer='word',ngram_range=(1,2))),('clf', MultinomialNB(alpha=0.1))])
nb.fit(train_normalized, y_train)

In [23]:
f1_score(y_dev, nb.predict(dev_normalized))

0.7243243243243244

In [24]:
svm = Pipeline([('vect', TfidfVectorizer(analyzer='word',ngram_range=(1,1))),('clf', SVC(kernel='rbf'))])
svm.fit(train_normalized, y_train)

In [25]:
f1_score(y_dev, svm.predict(dev_normalized))

0.7246811702925732

## Handle imbalance

In [26]:
# undersampling
def undersampling(text_series, labels, random_state=42):
    result_text_series = pd.Series(dtype='object')
    result_labels = np.array([])
    # value counts
    value_counts = np.unique(labels, return_counts=True)
    # get the label with minimum value
    min_label = value_counts[0][np.argmin(value_counts[1])]
    # print(value_counts)
    # print(min_label)
    # append the text with min label (and also to label np array)
    result_text_series = result_text_series.append(text_series[labels == min_label])
    result_labels = np.append(result_labels, labels[labels == min_label])
    # number of samples that label is min_label
    min_label_count = value_counts[1][min_label]
    # print(min_label_count)
    # resample the other labels
    for label in value_counts[0]:
        if label != min_label:
            # resample the text
            resampled_text = text_series[labels == label].sample(n=min_label_count, random_state=random_state)
            resampled_label = labels[resampled_text.index]
            # append the text with min label (and also to label np array)
            result_text_series = result_text_series.append(resampled_text)
            result_labels = np.append(result_labels, resampled_label)
    return result_text_series.reset_index(drop=True), result_labels

In [27]:
undersampled_train, undersampled_label = undersampling(train_preprocessed_text, y_train)

In [28]:
nb = Pipeline([('vect', TfidfVectorizer(analyzer='word',ngram_range=(1,2))),('clf', MultinomialNB(alpha=0.1))])
nb.fit(undersampled_train, undersampled_label)

In [29]:
f1_score(y_dev, nb.predict(dev_normalized))

0.5336016096579477

In [30]:
svm = Pipeline([('vect', TfidfVectorizer(analyzer='word',ngram_range=(1,1))),('clf', SVC(kernel='rbf'))])
svm.fit(undersampled_train, undersampled_label)

In [31]:
f1_score(y_dev, svm.predict(dev_normalized))

0.628992628992629

# Try Word2Vec

In [32]:
import gensim
from gensim.models import Word2Vec

## CBOW

In [46]:
# Create CBOW model
cbow = gensim.models.Word2Vec(train_preprocessed_text, min_count = 1, window = 5, vector_size = 1000)

# Skip gram

In [47]:
# Create Skip Gram model
skipgram = gensim.models.Word2Vec(train_preprocessed_text, min_count = 1, window = 5, sg = 1, vector_size = 1000)

## Usage

In [48]:
def get_vector(word, model):
    try:
        return model.wv[word]
    except:
        return np.zeros(1000)

In [49]:
# create train dataset
# cbow
train_cbow = np.array([np.mean([get_vector(word, cbow) for word in sentence.split()], axis=0) for sentence in train_preprocessed_text])
# skipgram
train_skipgram = np.array([np.mean([get_vector(word, skipgram) for word in sentence.split()], axis=0) for sentence in train_preprocessed_text])

In [50]:
w2v_svm = Pipeline([('clf', SVC(kernel='rbf'))])
w2v_svm.fit(train_cbow, y_train)

In [51]:
dev_cbow = np.array([np.mean([get_vector(word, cbow) for word in sentence.split()], axis=0) for sentence in dev_preprocessed_text])
dev_skipgram = np.array([np.mean([get_vector(word, skipgram) for word in sentence.split()], axis=0) for sentence in dev_preprocessed_text])

In [52]:
f1_score(y_dev, w2v_svm.predict(dev_cbow))

0.1839572192513369

## GridSearch with Word2Vec

In [53]:
parameters2 = [
    {
        'clf__C': [0.1, 1.0, 10.0],
    },
    {
        'clf': [XGBClassifier()],
        'clf__max_depth': [3, 4, 5],
        'clf__learning_rate': [0.1, 0.2, 0.3],
        'clf__n_estimators': [100, 200, 300],
    },
    {
        'clf': [DecisionTreeClassifier()],
        'clf__max_depth': [3, 4, 5],
        'clf__min_samples_split': [2, 3, 4],
        'clf__min_samples_leaf': [1, 2, 3],
    },
    {
        'clf': [RandomForestClassifier()],
        'clf__max_depth': [3, 4, 5],
        'clf__min_samples_split': [2, 3, 4],
        'clf__min_samples_leaf': [1, 2, 3],
        'clf__n_estimators': [100, 200, 300],
    },
    {
        'clf': [LogisticRegression()],
        'clf__penalty': ['l1', 'l2'],
        'clf__C': [0.1, 1.0, 10.0],
    },
]

In [54]:
train_fold = [-1 for _ in range(len(train_preprocessed_text))]
dev_fold = [0 for _ in range(len(dev_preprocessed_text))]
ps = PredefinedSplit(test_fold=np.concatenate([train_fold, dev_fold]))

Xcbow = np.concatenate([train_cbow, dev_cbow])
ycbow = np.concatenate([y_train, y_dev])

In [55]:
Xskipgram = np.concatenate([train_skipgram, dev_skipgram])
yskipgram = np.concatenate([y_train, y_dev])

In [56]:
gs_cbow = GridSearchCV(Pipeline([('clf', SVC(kernel='rbf'))]), parameters2, cv=ps, scoring='f1', n_jobs=-1, verbose=2)
gs_cbow.fit(Xcbow, ycbow)

Fitting 1 folds for each of 144 candidates, totalling 144 fits


3 fits failed out of a total of 144.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
3 fits failed with the following error:
Traceback (most recent call last):
  File "d:\Users\rando\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "d:\Users\rando\anaconda3\lib\site-packages\sklearn\pipeline.py", line 382, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "d:\Users\rando\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1091, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "d:\Users\rando\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 61, 

In [57]:
gs_cbow.best_params_

{'clf': RandomForestClassifier(max_depth=5, min_samples_leaf=3, n_estimators=300),
 'clf__max_depth': 5,
 'clf__min_samples_leaf': 3,
 'clf__min_samples_split': 2,
 'clf__n_estimators': 300}

In [58]:
gs_cbow.best_score_

0.20920502092050208

In [59]:
gs_skipgram = GridSearchCV(Pipeline([('clf', SVC(kernel='rbf'))]), parameters2, cv=ps, scoring='f1', n_jobs=-1, verbose=2)
gs_skipgram.fit(Xskipgram, yskipgram)

Fitting 1 folds for each of 144 candidates, totalling 144 fits


3 fits failed out of a total of 144.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
3 fits failed with the following error:
Traceback (most recent call last):
  File "d:\Users\rando\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "d:\Users\rando\anaconda3\lib\site-packages\sklearn\pipeline.py", line 382, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "d:\Users\rando\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1091, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "d:\Users\rando\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 61, 

In [60]:
gs_skipgram.best_params_

{'clf': DecisionTreeClassifier(max_depth=3),
 'clf__max_depth': 3,
 'clf__min_samples_leaf': 1,
 'clf__min_samples_split': 2}

In [61]:
gs_skipgram.best_score_

0.24485798237022527

# Conclusion

Kesimpulannya adalah, yang terbaik adalah menggunakan TF-IDF Vectorizer dan Mulinomial Naive Bayes dengan hasil validasi f1 0.7281 

In [19]:
nb = Pipeline([('vect', TfidfVectorizer(analyzer='word',ngram_range=(1,2))),('clf', MultinomialNB(alpha=0.1))])
nb.fit(train_preprocessed_text, y_train)

In [20]:
# classification report
from sklearn.metrics import classification_report
print(classification_report(y_test, nb.predict(test_preprocessed_text)))
print("F1 score (micro avg): ", f1_score(y_test, nb.predict(test_preprocessed_text)))
print("F1 score (macro avg): ", f1_score(y_test, nb.predict(test_preprocessed_text), average='macro'))

              precision    recall  f1-score   support

           0       0.89      0.94      0.92      2093
           1       0.80      0.67      0.73       707

    accuracy                           0.87      2800
   macro avg       0.85      0.81      0.82      2800
weighted avg       0.87      0.87      0.87      2800

F1 score (micro avg):  0.728110599078341
F1 score (macro avg):  0.8228733544484306
