## Impressions and Reviews  
- TF-IDF를 두 번 수행: 하나는 단어(word) 단위 unigram, 다른 하나는 문자(char) 단위 1~4 gram으로 벡터화한 뒤, 두 output feature를 concat하여 최종 X를 생성

In [None]:
"""
Logistic Regression

    solver:
    - liblinear : 적은 data에 적합, L1, L2 모두 지원
    - sag, saga : 대용량 data에 적합(sgd기반), sag는 L1, saga는 L2,L1 모두 지원
    - newton-cg, lbfgs (sag, saga) 는 multi-class분류에 사용, lbfgs 성능이 최고로 알려짐. 둘다 L2만 지원
    
    penalty: solver에 따라 l1 또는 l2 만 가능하거나 혹은 둘다 가능하다, C 로 강도를 조절 가능
    
    C: 높을수록 낮은 regularization 효과 (svm), penalty ('l1', 'l2') 의 강도 결정, 
    
    
    """

In [None]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_union

# Label columns of the training data; each one is used as a separate target.
class_names = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

# Read the zipped CSV files and replace missing values with a single space.
train = pd.read_csv('../input/train.csv.zip')
train = train.fillna(' ')
test = pd.read_csv('../input/test.csv.zip')
test = test.fillna(' ')

# Raw comment text of each split, plus both splits stacked into one corpus.
train_text, test_text = train['comment_text'], test['comment_text']
all_text = pd.concat([train_text, test_text])

In [None]:
# Options common to the word-level and the character-level TF-IDF vectorizer.
shared_tfidf_options = dict(
    sublinear_tf=True,
    strip_accents='unicode',
    max_features=30000,
)

# Word-level TF-IDF: every run of one or more word characters is a token,
# and only unigrams (single words) are used.
word_vectorizer = TfidfVectorizer(
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 1),
    **shared_tfidf_options)

# Character-level TF-IDF over n-grams of 1 to 4 characters.
char_vectorizer = TfidfVectorizer(
    analyzer='char',
    ngram_range=(1, 4),
    **shared_tfidf_options)

# Union of both vectorizers; their outputs are concatenated column-wise.
vectorizer = make_union(word_vectorizer, char_vectorizer, n_jobs=2)

In [None]:
# Fit the combined vectorizer on train + test text, then vectorize each split.
vectorizer.fit(all_text)
train_features, test_features = (
    vectorizer.transform(split) for split in (train_text, test_text)
)

In [None]:
# Fit each vectorizer on its own using only the first 10000 entries of the
# corpus, so that their vocabulary_ attributes can be inspected afterwards.
inspection_sample = all_text[:10000]
word_vectorizer.fit(inspection_sample)
char_vectorizer.fit(inspection_sample)

TfidfVectorizer(analyzer='char', max_features=30000, ngram_range=(1, 4),
                strip_accents='unicode', sublinear_tf=True)

In [None]:
# First 20 (term, index) pairs of the word vocabulary in descending order.
word_vocab_desc = sorted(word_vectorizer.vocabulary_.items(), reverse=True)
print(word_vocab_desc[:20])

[('飞天号航天服', 29999), ('飞天', 29998), ('電視劇', 29997), ('雲水', 29996), ('連絡', 29995), ('迷惑', 29994), ('豆田', 29993), ('話して下さい', 29992), ('見学', 29991), ('聖やや', 29990), ('翻译', 29989), ('翻訳', 29988), ('竜龙', 29987), ('竜龍', 29986), ('福原路草', 29985), ('福原信三', 29984), ('甲子夜話', 29983), ('琉竜', 29982), ('牛岩', 29981), ('江戸後期の平戸藩主', 29980)]


In [None]:
# The (term, index) pair at position 1000 of the ascending word vocabulary.
word_vocab_asc = sorted(word_vectorizer.vocabulary_.items())
print(word_vocab_asc[1000])

('50th', 1000)


In [None]:
# First 20 (term, index) pairs of the word vocabulary in ascending order.
word_vocab_asc = sorted(word_vectorizer.vocabulary_.items())
print(word_vocab_asc[:20])

[('0', 0), ('00', 1), ('000', 2), ('0000', 3), ('000000', 4), ('00000000', 5), ('00053', 6), ('001', 7), ('0010steve', 8), ('002', 9), ('0022', 10), ('003_resize', 11), ('004', 12), ('0049703', 13), ('005', 14), ('005113', 15), ('006400', 16), ('007', 17), ('007191', 18), ('01', 19)]


In [None]:
# First 20 (n-gram, index) pairs of the char vocabulary in descending order.
char_vocab_desc = sorted(char_vectorizer.vocabulary_.items(), reverse=True)
print(char_vocab_desc[:20])

[('♦', 29999), ('♥', 29998), ('→ ', 29997), ('→', 29996), ('• t', 29995), ('• co', 29994), ('• c', 29993), ('• "', 29992), ('• ', 29991), ('•', 29990), ('”.', 29989), ('” i', 29988), ('” an', 29987), ('” a', 29986), ('” ', 29985), ('”', 29984), ('“the', 29983), ('“th', 29982), ('“t', 29981), ('“s', 29980)]


In [None]:
# The (n-gram, index) pair at position 1000 of the ascending char vocabulary.
char_vocab_asc = sorted(char_vectorizer.vocabulary_.items())
print(char_vocab_asc[1000])

(' ded', 1000)


In [None]:
# First 20 (n-gram, index) pairs of the char vocabulary in ascending order.
char_vocab_asc = sorted(char_vectorizer.vocabulary_.items())
print(char_vocab_asc[:20])

[('\n', 0), ('\n!', 1), ('\n! ', 2), ('\n"', 3), ('\n""', 4), ('\n""i', 5), ('\n""t', 6), ('\n-', 7), ('\n- ', 8), ('\n1', 9), ('\n2', 10), ('\n3', 11), ('\n4', 12), ('\n:', 13), ('\n:i', 14), ('\n:im', 15), ('\n>', 16), ('\n> ', 17), ('\n[', 18), ('\na', 19)]


In [None]:
# Display the shapes of the train and test feature matrices.
(train_features.shape, test_features.shape)

((159571, 60000), (153164, 60000))

In [None]:
# Train one binary logistic-regression model per label, report its
# cross-validated ROC AUC, and collect test-set probabilities in `submission`.
#
# Notes on LogisticRegression options:
#   solver
#     - liblinear : suited to small datasets; supports both L1 and L2.
#     - sag, saga : suited to large datasets (stochastic-gradient based);
#                   sag supports L2 only, saga supports both L1 and L2.
#     - newton-cg, lbfgs (as well as sag, saga) can be used for multi-class
#       problems; lbfgs is reputed to perform best. newton-cg and lbfgs
#       support L2 only.
#   penalty
#     - which of 'l1' / 'l2' is available depends on the solver; its strength
#       is controlled by C.
#   C
#     - the larger the value, the weaker the regularization (as in SVM).
scores = []
submission = pd.DataFrame.from_dict({'id': test['id']})
for class_name in class_names:
    train_target = train[class_name]
    classifier = LogisticRegression(solver='sag')

    # Mean ROC AUC over 3 cross-validation folds on the training features.
    cv_score = np.mean(cross_val_score(
        classifier, train_features, train_target, cv=3, scoring='roc_auc'))
    scores.append(cv_score)
    print(f'CV score for class {class_name} is {cv_score}')

    # Refit on the full training set, then predict the test set exactly once
    # and reuse the result for both the submission column and the shape print.
    classifier.fit(train_features, train_target)
    res_probs = classifier.predict_proba(test_features)
    submission[class_name] = res_probs[:, 1]  # column 1 of predict_proba
    print(res_probs.shape)

print(f'Total CV score is {np.mean(scores)}')

# submission.to_csv('submission.csv', index=False)

CV score for class toxic is 0.9787977972677572
(153164, 2)
CV score for class severe_toxic is 0.9887765504366569
(153164, 2)
CV score for class obscene is 0.9903280040183651
(153164, 2)
CV score for class threat is 0.9887839859212524
(153164, 2)
CV score for class insult is 0.9828850973626849
(153164, 2)
CV score for class identity_hate is 0.9831386353878381
(153164, 2)
Total CV score is 0.9854516783990923
