## 라이브러리 import 및 설정

In [71]:
%reload_ext autoreload
%autoreload 2 # 파이썬 코드를 실행하기 전에 항상 모든 모듈을 Reload하라는 의미
%matplotlib inline


In [72]:
from matplotlib import pyplot as plt
from matplotlib import rcParams  #rcParams[]로 전역 글꼴 설정
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, log_loss # 예측 정확도 평가를 위해 불러오는 모듈
from sklearn.model_selection import StratifiedKFold #불균형한 분포도를 가진 레이블 데이터를 위한 방식, 특정 레이블 값이 너무 많거나 적은 경우
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer # HashingVectorizer: hashing trick을 사용하여 빠르게 Bow 벡터를 만듦.
import seaborn as sns
import warnings

In [73]:
# Plot and display options.
rcParams['figure.figsize'] = (16, 8)
plt.style.use('fivethirtyeight')
# Use the fully qualified option name: the bare 'max_columns' pattern is
# ambiguous in pandas versions that also define 'styler.render.max_columns',
# and pandas rejects a pattern that matches more than one option.
pd.set_option('display.max_columns', 100)
pd.set_option("display.precision", 4)
# Suppress all warnings for the rest of the session.
warnings.simplefilter('ignore')

## 학습데이터 로드

In [74]:
# Input data locations.
trn_file, tst_file = 'train.csv', 'test_x.csv'
sample_file = 'sample_submission.csv'

# Modelling settings: label column, number of CV folds, number of classes
# (width of the probability matrices), and the random seed.
target_col = 'author'
n_fold, n_class = 5, 5
seed = 42

In [58]:
# Identifiers of this experiment: algorithm and feature set.
algo_name, feature_name = 'lr', 'tfidf'
model_name = f'{algo_name}_{feature_name}'

# Output file names derived from the identifiers above.
feature_file = f'{feature_name}.csv'
p_val_file, p_tst_file = f'{model_name}.val.csv', f'{model_name}.tst.csv'
sub_file = f'{model_name}.csv'

In [75]:
import pandas as pd

# Load the test set from the configured path instead of repeating the literal.
test_x = pd.read_csv(tst_file)
# Preview the first 10 rows of the frame that was just loaded.
test_x.head(10)

Unnamed: 0,index,text
0,0,“Not at all. I think she is one of the most ch...
1,1,"""No,"" replied he, with sudden consciousness, ""..."
2,2,As the lady had stated her intention of scream...
3,3,“And then suddenly in the silence I heard a so...
4,4,His conviction remained unchanged. So far as I...
5,5,odin crossed the square. In that corner there ...
6,6,In his panic he for some reason shut up the um...
7,7,His nephew left the room without an angry word...
8,8,"My narrative finished, and their questions exh..."
9,9,"The woman, who had hesitated at first, walked ..."


In [76]:
# Load the training set from the configured path instead of repeating the literal.
train = pd.read_csv(trn_file)
# Preview the first 10 rows.
train.head(10)

Unnamed: 0,index,text,author
0,0,"He was almost choking. There was so much, so m...",3
1,1,"“Your sister asked for it, I suppose?”",2
2,2,"She was engaged one day as she walked, in per...",1
3,3,"The captain was in the porch, keeping himself ...",4
4,4,"“Have mercy, gentlemen!” odin flung up his han...",3
5,5,"""It was well fought,"" he said, ""and, by my soo...",4
6,6,"Not to pay him was impossible, considering his...",3
7,7,"“A proper figure of a man at-arms,” said the l...",2
8,8,"'You were not here last Sunday night,' he said.",0
9,9,“You must not ask me that!” I cried. “Hell may...,4


# Nltk

In [27]:
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer 
from nltk.stem.snowball import SnowballStemmer # 어근을 추출
from nltk.tokenize import RegexpTokenizer

In [28]:
# Take the text at row label 4 as the running example for the NLTK demos.
s = train.loc[4, 'text']
print(s)

“Have mercy, gentlemen!” odin flung up his hands. “Don’t write that, anyway; have some shame. Here I’ve torn my heart asunder before you, and you seize the opportunity and are fingering the wounds in both halves.... Oh, my God!”


#### 토큰화

In [29]:
# Split the example sentence into tokens with NLTK's word tokenizer and show them.
tokens = word_tokenize(s)
print(tokens)

['“', 'Have', 'mercy', ',', 'gentlemen', '!', '”', 'odin', 'flung', 'up', 'his', 'hands', '.', '“', 'Don', '’', 't', 'write', 'that', ',', 'anyway', ';', 'have', 'some', 'shame', '.', 'Here', 'I', '’', 've', 'torn', 'my', 'heart', 'asunder', 'before', 'you', ',', 'and', 'you', 'seize', 'the', 'opportunity', 'and', 'are', 'fingering', 'the', 'wounds', 'in', 'both', 'halves', '....', 'Oh', ',', 'my', 'God', '!', '”']


#### 정규화

In [30]:
# Lemmatize every token with WordNet and print the list without a trailing newline.
lemmatizer = WordNetLemmatizer()
print(list(map(lemmatizer.lemmatize, tokens)), end='')

['“', 'Have', 'mercy', ',', 'gentleman', '!', '”', 'odin', 'flung', 'up', 'his', 'hand', '.', '“', 'Don', '’', 't', 'write', 'that', ',', 'anyway', ';', 'have', 'some', 'shame', '.', 'Here', 'I', '’', 've', 'torn', 'my', 'heart', 'asunder', 'before', 'you', ',', 'and', 'you', 'seize', 'the', 'opportunity', 'and', 'are', 'fingering', 'the', 'wound', 'in', 'both', 'half', '....', 'Oh', ',', 'my', 'God', '!', '”']

In [40]:
# Same lemmatization as above, shown as the cell's value instead of printed.
lemmatizer = WordNetLemmatizer()
list(map(lemmatizer.lemmatize, tokens))

['“',
 'Have',
 'mercy',
 ',',
 'gentleman',
 '!',
 '”',
 'odin',
 'flung',
 'up',
 'his',
 'hand',
 '.',
 '“',
 'Don',
 '’',
 't',
 'write',
 'that',
 ',',
 'anyway',
 ';',
 'have',
 'some',
 'shame',
 '.',
 'Here',
 'I',
 '’',
 've',
 'torn',
 'my',
 'heart',
 'asunder',
 'before',
 'you',
 ',',
 'and',
 'you',
 'seize',
 'the',
 'opportunity',
 'and',
 'are',
 'fingering',
 'the',
 'wound',
 'in',
 'both',
 'half',
 '....',
 'Oh',
 ',',
 'my',
 'God',
 '!',
 '”']

In [41]:
# Stem every token with the English Snowball stemmer and show the result.
stemmer = SnowballStemmer("english")
list(map(stemmer.stem, tokens))

['“',
 'have',
 'merci',
 ',',
 'gentlemen',
 '!',
 '”',
 'odin',
 'flung',
 'up',
 'his',
 'hand',
 '.',
 '“',
 'don',
 '’',
 't',
 'write',
 'that',
 ',',
 'anyway',
 ';',
 'have',
 'some',
 'shame',
 '.',
 'here',
 'i',
 '’',
 've',
 'torn',
 'my',
 'heart',
 'asund',
 'befor',
 'you',
 ',',
 'and',
 'you',
 'seiz',
 'the',
 'opportun',
 'and',
 'are',
 'finger',
 'the',
 'wound',
 'in',
 'both',
 'halv',
 '....',
 'oh',
 ',',
 'my',
 'god',
 '!',
 '”']

## Bag-of-Words 피처 생성

In [43]:
# Bag-of-words counts over unigrams and bigrams, tokenized with NLTK, with
# English stop words removed and a minimum document frequency of 100.
vec = CountVectorizer(
    tokenizer=word_tokenize,
    stop_words=stopwords.words('english'),
    ngram_range=(1, 2),
    min_df=100,
)
X_cnt = vec.fit_transform(train['text'])
print(X_cnt.shape)

(54879, 2685)


In [44]:
# Convert the first 50 columns of row 0 to a dense matrix so the values can be displayed.
X_cnt[0, :50].todense()


matrix([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0]], dtype=int64)

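*dense한 matrix란?*
    - 희소 행렬(sparse matrix)은 0이 아닌 값과 그 위치만 저장하는 형식이고, 밀집 행렬(dense matrix)은 0을 포함한 모든 원소를 그대로 저장하는 형식이다.
      즉, `todense()`는 값을 바꾸는 것이 아니라 저장 형식만 sparse에서 dense로 바꾸어 행렬의 내용을 직접 확인할 수 있게 해 준다.
      Word2Vec에서 embedding한다고 표현하는 것(단어를 0이 아닌 유의한 수치 값들의 벡터로 나타내는 것)은 이와 다른 개념이다.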

참고, https://www.buzzvil.com/2016/06/16/word2vec_content_clustering/
<br>
[출처] (4)비정형 데이터 분석 예제(영화 리뷰 분석하기)|작성자 데이터 SH터디



In [50]:
# TF-IDF features over 1- to 3-grams, tokenized with NLTK, with English stop
# words removed and a minimum document frequency of 50. The vocabulary is
# fitted on the training text only and then applied to the test text.
vec = TfidfVectorizer(
    tokenizer=word_tokenize,
    stop_words=stopwords.words('english'),
    ngram_range=(1, 3),
    min_df=50,
)
X = vec.fit_transform(train['text'])
X_tst = vec.transform(test_x['text'])
print(X.shape, X_tst.shape)

(54879, 5897) (19617, 5897)


In [51]:
# Convert the first 50 columns of row 0 of the TF-IDF matrix to a dense matrix for display.
X[0, :50].todense()

matrix([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0.]])

## 로지스틱회귀 모델 학습

In [77]:
# Stratified K-fold splitter: n_fold shuffled splits, seeded for reproducibility.
cv = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=seed)

*StratifiedKFold?<br>
불균형한 분포도를 가진 레이블 데이터를 위한 방식으로, 특정 레이블 값이 너무 많거나 적은 경우에 사용한다.<br>
각 폴드의 레이블 비율이 전체 데이터의 레이블 비율과 같도록 나누기 때문에, 왜곡된 레이블 데이터 세트에서는 반드시 StratifiedKFold를 이용해 교차 검증해야 한다.<br>
일반적으로 Classification(분류)에서는 교차 검증에 StratifiedKFold를 이용한다.*

[출처] [교차검증] StratifiedKFold vs KFold|작성자 유디니


In [53]:
# Extract the labels through the configured target column name rather than a
# hard-coded attribute, so changing `target_col` is enough to retarget the model.
y = train[target_col].values
y.shape

(54879,)

In [54]:
# p:     out-of-fold class probabilities for the training rows.
# p_tst: class probabilities for the test rows, averaged over the fold models.
p = np.zeros((X.shape[0], n_class))
p_tst = np.zeros((X_tst.shape[0], n_class))
for i_trn, i_val in cv.split(X, y):
    clf = LogisticRegression()
    clf.fit(X[i_trn], y[i_trn])
    # Validation rows are scored by the model that was not trained on them.
    p[i_val, :] = clf.predict_proba(X[i_val])
    # One model is fitted per fold, so the average divides by the number of
    # folds; dividing by the number of classes is only right when the two
    # happen to be equal.
    p_tst += clf.predict_proba(X_tst) / n_fold

In [55]:
# Accuracy of the arg-max class of the out-of-fold predictions, in percent.
cv_accuracy = accuracy_score(y, p.argmax(axis=1)) * 100
print(f'Accuracy (CV): {cv_accuracy:8.4f}%')
# Log loss of the out-of-fold probabilities against one-hot encoded labels.
cv_log_loss = log_loss(pd.get_dummies(y), p)
print(f'Log Loss (CV): {cv_log_loss:8.4f}')

Accuracy (CV):  76.6158%
Log Loss (CV):   0.6800


***pd.get_dummies()?***<br>
*one hot encoding의 기능을 하는 것.
즉, 범주형 값(여기서는 작가 레이블 0~4)을 범주마다 하나의 열을 갖는 0/1 지시 변수로 변환해주는 과정*

In [59]:
# Persist the out-of-fold and the test predictions as comma-separated text
# with six decimal places.
for out_path, probs in ((p_val_file, p), (p_tst_file, p_tst)):
    np.savetxt(out_path, probs, fmt='%.6f', delimiter=',')

## 제출 파일 생성

In [66]:
# Load the sample submission as a template, using its first column as the index.
sub = pd.read_csv(sample_file, index_col=0)
print(sub.shape)
sub.head()

(19617, 5)


Unnamed: 0_level_0,0,1,2,3,4
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0,0,0,0,0
1,0,0,0,0,0
2,0,0,0,0,0
3,0,0,0,0,0
4,0,0,0,0,0


In [67]:
# Overwrite every column of the template with the averaged test probabilities.
# NOTE(review): assumes the template's column order matches the class order of
# predict_proba and that its rows are in the same order as the test set — confirm.
sub[sub.columns] = p_tst
sub.head()

Unnamed: 0_level_0,0,1,2,3,4
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0.0631,0.5302,0.3155,0.0659,0.0253
1,0.0815,0.8202,0.0032,0.0269,0.0682
2,0.7208,0.0319,0.1174,0.0381,0.0918
3,0.0392,0.0036,0.8465,0.0058,0.1049
4,0.3044,0.244,0.145,0.1905,0.1161


In [68]:
# Write the filled-in submission to the file named by sub_file.
sub.to_csv(sub_file)