In [2]:
#%reload_ext autoreload  # (disabled) enable automatic module reloading
#%autoreload 2  # (disabled) reload all modules before executing Python code
%matplotlib inline 

In [3]:
from matplotlib import pyplot as plt
from matplotlib import rcParams
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import numpy as np
from pathlib import Path
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, log_loss
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
import seaborn as sns
import warnings

In [4]:
# Notebook-wide plotting and display configuration.
rcParams['figure.figsize'] = (16, 8)
plt.style.use('fivethirtyeight')
# Use the fully-qualified option key: the bare 'max_columns' pattern is
# ambiguous on newer pandas (it also matches 'styler.render.max_columns')
# and raises OptionError.
pd.set_option('display.max_columns', 100)  # maximum number of columns to display
pd.set_option('display.precision', 4)  # show four decimal places
warnings.simplefilter('ignore')  # hide warning messages

In [5]:
# Experiment-wide constants.
target_col = 'author'  # name of the target column
n_fold = 5  # number of cross-validation folds (used as n_splits)
n_class = 5  # number of target classes
seed = 42  # used as random_state

# Read the raw CSV files into DataFrames.
trn_file = pd.read_csv('../train.csv')
tst_file = pd.read_csv('../test_x.csv')
sample_file = pd.read_csv('../sample_submission.csv')

In [6]:
# Name this experiment after its algorithm and feature set.
algo_name, feature_name = 'lr', 'tfidf'
model_name = '_'.join([algo_name, feature_name])

model_name

'lr_tfidf'

In [7]:
# Load the training data (first CSV column becomes the index), then show
# its shape and first rows.
trn = pd.read_csv('../train.csv', index_col=0)
print(trn.shape)
trn.head()

(54879, 2)


Unnamed: 0_level_0,text,author
index,Unnamed: 1_level_1,Unnamed: 2_level_1
0,"He was almost choking. There was so much, so m...",3
1,"“Your sister asked for it, I suppose?”",2
2,"She was engaged one day as she walked, in per...",1
3,"The captain was in the porch, keeping himself ...",4
4,"“Have mercy, gentlemen!” odin flung up his han...",3


In [8]:
# Load the test data (first CSV column becomes the index), then show its
# shape and first rows.
tst = pd.read_csv('../test_x.csv', index_col=0)
print(tst.shape)
tst.head()

(19617, 1)


Unnamed: 0_level_0,text
index,Unnamed: 1_level_1
0,“Not at all. I think she is one of the most ch...
1,"""No,"" replied he, with sudden consciousness, ""..."
2,As the lady had stated her intention of scream...
3,“And then suddenly in the silence I heard a so...
4,His conviction remained unchanged. So far as I...


In [9]:
from nltk.tokenize import word_tokenize  # word-level tokenization
from nltk.stem import WordNetLemmatizer  # lemmatization
from nltk.stem.snowball import SnowballStemmer  # stemming; Snowball also covers languages other than English

In [10]:
# Use the training text with index label 4 as the sample string `s`.
s = trn.text[4] 
print(s)

“Have mercy, gentlemen!” odin flung up his hands. “Don’t write that, anyway; have some shame. Here I’ve torn my heart asunder before you, and you seize the opportunity and are fingering the wounds in both halves.... Oh, my God!”


In [11]:
# Tokenize the sample text and show the resulting token list.
tokens = word_tokenize(s)
print(tokens)

['“', 'Have', 'mercy', ',', 'gentlemen', '!', '”', 'odin', 'flung', 'up', 'his', 'hands', '.', '“', 'Don', '’', 't', 'write', 'that', ',', 'anyway', ';', 'have', 'some', 'shame', '.', 'Here', 'I', '’', 've', 'torn', 'my', 'heart', 'asunder', 'before', 'you', ',', 'and', 'you', 'seize', 'the', 'opportunity', 'and', 'are', 'fingering', 'the', 'wounds', 'in', 'both', 'halves', '...', '.', 'Oh', ',', 'my', 'God', '!', '”']


In [12]:
# Lemmatize every token of the sample text; the resulting list is the
# cell's displayed value.
lemmatizer = WordNetLemmatizer()
list(map(lemmatizer.lemmatize, tokens))

['“',
 'Have',
 'mercy',
 ',',
 'gentleman',
 '!',
 '”',
 'odin',
 'flung',
 'up',
 'his',
 'hand',
 '.',
 '“',
 'Don',
 '’',
 't',
 'write',
 'that',
 ',',
 'anyway',
 ';',
 'have',
 'some',
 'shame',
 '.',
 'Here',
 'I',
 '’',
 've',
 'torn',
 'my',
 'heart',
 'asunder',
 'before',
 'you',
 ',',
 'and',
 'you',
 'seize',
 'the',
 'opportunity',
 'and',
 'are',
 'fingering',
 'the',
 'wound',
 'in',
 'both',
 'half',
 '...',
 '.',
 'Oh',
 ',',
 'my',
 'God',
 '!',
 '”']

In [13]:
# Stem every token of the sample text with the English Snowball stemmer;
# the resulting list is the cell's displayed value.
stemmer = SnowballStemmer("english")
list(map(stemmer.stem, tokens))

['“',
 'have',
 'merci',
 ',',
 'gentlemen',
 '!',
 '”',
 'odin',
 'flung',
 'up',
 'his',
 'hand',
 '.',
 '“',
 'don',
 '’',
 't',
 'write',
 'that',
 ',',
 'anyway',
 ';',
 'have',
 'some',
 'shame',
 '.',
 'here',
 'i',
 '’',
 've',
 'torn',
 'my',
 'heart',
 'asund',
 'befor',
 'you',
 ',',
 'and',
 'you',
 'seiz',
 'the',
 'opportun',
 'and',
 'are',
 'finger',
 'the',
 'wound',
 'in',
 'both',
 'halv',
 '...',
 '.',
 'oh',
 ',',
 'my',
 'god',
 '!',
 '”']

In [14]:
# Bag-of-words counts for the training text: NLTK word tokenizer, NLTK
# English stop-word list, unigrams and bigrams, min_df of 100.
vec = CountVectorizer(
    tokenizer=word_tokenize,
    stop_words=stopwords.words('english'),
    ngram_range=(1, 2),
    min_df=100,
)
X_cnt = vec.fit_transform(trn['text'])
print(X_cnt.shape)  # check the shape of the count matrix

(54879, 2683)


In [15]:
X_cnt[0, :50].todense()  # densify the first 50 columns of row 0 for display

matrix([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0]], dtype=int64)

In [16]:
# TF-IDF features: same tokenizer and stop words as the count vectorizer,
# but with n-grams extended to trigrams and min_df lowered to 50.
# The vectorizer is fitted on the training text only and then applied to
# the test text.
vec = TfidfVectorizer(
    tokenizer=word_tokenize,
    stop_words=stopwords.words('english'),
    ngram_range=(1, 3),
    min_df=50,
)
X = vec.fit_transform(trn['text'])
X_tst = vec.transform(tst['text'])
print(X.shape, X_tst.shape)

(54879, 5899) (19617, 5899)


In [17]:
# Densify the first 50 columns of row 0 of the TF-IDF matrix for display.
X[0, :50].todense()

matrix([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0.]])

In [18]:
from sklearn.model_selection import StratifiedKFold

# Stratified K-fold splitter: n_fold splits, rows shuffled before
# splitting, with the shared seed as random_state.
cv = StratifiedKFold(
    n_splits=n_fold,
    shuffle=True,
    random_state=seed,
)

In [19]:
# Target labels: values of the training frame's author column.
y = trn['author'].values
print(y[:50])  # preview the first 50 labels

[3 2 1 4 3 4 3 2 0 4 2 4 0 1 4 2 1 3 4 0 0 3 2 0 1 0 4 1 2 2 3 0 1 2 2 4 0
 4 0 3 2 2 2 0 2 0 4 3 4 4]


In [30]:
# Cross-validated logistic regression.
# p     : out-of-fold class probabilities, one row per training sample.
# p_tst : class probabilities for the test set, averaged over the folds.
p = np.zeros((X.shape[0], n_class))
p_tst = np.zeros((X_tst.shape[0], n_class))
for i_cv, (i_trn, i_val) in enumerate(cv.split(X, y), 1):  # fold counter starts at 1
    clf = LogisticRegression()
    clf.fit(X[i_trn], y[i_trn])  # fit on this fold's training rows
    p[i_val, :] = clf.predict_proba(X[i_val])  # predict the held-out rows
    # Average the test predictions over the fold models: the divisor must
    # be the number of folds, not the number of classes. The two are both 5
    # here, which hid the mistake.
    p_tst += clf.predict_proba(X_tst) / n_fold


In [31]:
# Score the out-of-fold predictions against the true labels.
cv_accuracy = accuracy_score(y, np.argmax(p, axis=1)) * 100
print(f'Accuracy(CV):{cv_accuracy:8.4f}%')  # accuracy of the arg-max class
cv_log_loss = log_loss(pd.get_dummies(y), p)
print(f'Log Loss (CV): {cv_log_loss:8.4f}')  # log loss of the predicted probabilities

Accuracy(CV): 76.6687%
Log Loss (CV):   0.6771


In [22]:
# Output file names derived from the feature and model names.
feature_file = f'{feature_name}.csv'
p_val_file = f'{model_name}.val.csv'
p_tst_file = f'{model_name}.tst.csv'
sub_file = f'{model_name}.csv'

# Save the out-of-fold and test probabilities as comma-separated text
# with six decimal places.
for out_path, probs in ((p_val_file, p), (p_tst_file, p_tst)):
    np.savetxt(out_path, probs, fmt='%.6f', delimiter=',')

In [23]:
# Load the sample submission (first CSV column becomes the index) as the
# template for the output, then show its shape and first rows.
sub = pd.read_csv('../sample_submission.csv', index_col=0) 
print(sub.shape)
sub.head()

(19617, 5)


Unnamed: 0_level_0,0,1,2,3,4
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0,0,0,0,0
1,0,0,0,0,0
2,0,0,0,0,0
3,0,0,0,0,0
4,0,0,0,0,0


In [32]:
# Overwrite every column of the submission frame with the test-set
# probabilities in p_tst and preview the result.
sub[sub.columns] = p_tst
sub.head()

Unnamed: 0_level_0,0,1,2,3,4
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0.0599,0.5343,0.3188,0.0622,0.0248
1,0.0819,0.8188,0.0031,0.0271,0.0691
2,0.72,0.0314,0.1173,0.0373,0.0941
3,0.0361,0.0035,0.8533,0.0055,0.1017
4,0.3029,0.2413,0.1468,0.1896,0.1193


In [25]:
# Write the submission frame (including its index) to sub_file.
sub.to_csv(sub_file)