## 201712989 조소연
# 빅데이터 분석 및 응용 개별 프로젝트 과제
#### 데이터 읽어오기

In [231]:
import pandas as pd

df = pd.read_csv('train.csv')


In [232]:
df.head(10) #데이터 샘플

Unnamed: 0,index,text,author
0,0,"He was almost choking. There was so much, so m...",3
1,1,"“Your sister asked for it, I suppose?”",2
2,2,"She was engaged one day as she walked, in per...",1
3,3,"The captain was in the porch, keeping himself ...",4
4,4,"“Have mercy, gentlemen!” odin flung up his han...",3
5,5,"""It was well fought,"" he said, ""and, by my soo...",4
6,6,"Not to pay him was impossible, considering his...",3
7,7,"“A proper figure of a man at-arms,” said the l...",2
8,8,"'You were not here last Sunday night,' he said.",0
9,9,“You must not ask me that!” I cried. “Hell may...,4


In [233]:
df.shape

(54879, 3)

In [222]:
df.author.value_counts()

3    15063
0    13235
2    11554
4     7805
1     7222
Name: author, dtype: int64

### Train, Test 분류
train set과 test set을 3:1로 분리


In [285]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df.text,df.author,test_size=0.1,random_state=10)


In [286]:
print(X_train.shape)
print(y_train.shape)

(49391,)
(49391,)


In [287]:
print(X_test.shape)
print(y_test.shape)

(5488,)
(5488,)


### 카운트 기반 특성 추출

In [288]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(max_features=2000, min_df=5, max_df=0.5).fit(X_train)

X_train_cv = cv.transform(X_train) 
print('Train set dimension:', X_train_cv.shape) 
X_test_cv = cv.transform(X_test)
print('Test set dimension:', X_test_cv.shape)

Train set dimension: (49391, 2000)
Test set dimension: (5488, 2000)


**max_features, min_df, max_df** 확인하기

In [289]:
for word, count in zip(cv.get_feature_names()[:100], X_train_cv[0].toarray()[0,:100]):
    print(word, ':', count, end=', ')    

_i_ : 0, _that_ : 0, _you_ : 0, able : 0, about : 0, above : 0, abroad : 0, absence : 0, absolutely : 0, absurd : 0, accept : 0, accepted : 0, accident : 0, according : 0, account : 0, acquaintance : 0, acquainted : 0, across : 0, act : 0, action : 0, actually : 0, add : 0, added : 0, address : 0, addressed : 0, addressing : 0, admiration : 0, admit : 0, admitted : 0, advance : 0, advanced : 0, advantage : 0, adventure : 0, advice : 0, affair : 0, affairs : 0, affected : 0, affection : 0, afraid : 0, after : 0, afternoon : 0, afterwards : 0, again : 0, against : 0, age : 0, agitated : 0, agitation : 0, ago : 0, agree : 0, agreeable : 0, agreed : 0, ah : 0, ain : 0, air : 0, alarm : 0, alarmed : 0, alas : 0, alive : 0, all : 0, allow : 0, allowed : 0, almost : 0, alone : 0, along : 0, aloud : 0, already : 0, also : 0, although : 0, altogether : 0, always : 0, am : 0, amazement : 0, america : 0, amiable : 0, among : 0, an : 0, angel : 0, anger : 0, angrily : 0, angry : 0, anne : 0, anoth

## 나이브베이즈를 이용한 문서분류

In [290]:
from sklearn.naive_bayes import MultinomialNB
NB_clf = MultinomialNB()

NB_clf.fit(X_train_cv, y_train)


print('Train set score: {:.3f}'.format(NB_clf.score(X_train_cv, y_train))) #train set에 대한 예측정확도를 확인
print('Test set score: {:.3f}'.format(NB_clf.score(X_test_cv, y_test))) #test set에 대한 예측정확도를 확인

Train set score: 0.667
Test set score: 0.644



#### 예측하기

In [295]:
print('실제작가, 예측한 작가, text')
for content in zip(y_test[:10], NB_clf.predict(X_test_cv[:10]), X_test[:10]):
    print(content)


실제작가, 예측한 작가, text
(3, 3, '“But I thought at the time that you quite guessed,” odin parried with the simplest air.')
(2, 0, '“The treasure is lost,” said Miss odin, calmly.')
(0, 0, "“You don't hear much about them now?” said the spy.")
(0, 2, "'odin,' said odin, abruptly breaking the stillness that prevailed; 'is it worth fifty shiners extra, if it's safely done from the outside?'")
(1, 1, 'The subject was continued no farther; and odin remained thoughtfully silent, till a new object suddenly engaged her attention. She was sitting by odin, and in taking his tea from Mrs. odin, his hand passed so directly before her, as to make a ring, with a plait of hair in the centre, very conspicuous on one of his fingers.')
(3, 3, 'The prince’s tone was so natural and respectful that the general could not possibly suspect him of any insincerity.')
(2, 3, '“Here are three more,” said odin.')
(3, 3, 'She sank helplessly on the bed with her face in the pillows, but a moment later she got up, moved qu

### CountVectorizer 대신 TfidfVectorizer 사용해보기

In [260]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(max_features=2000, min_df=5, max_df=0.5).fit(X_train) 
X_train_tfidf = tfidf.transform(X_train) 
X_test_tfidf = tfidf.transform(X_test) 

NB_clf.fit(X_train_tfidf, y_train) 
print('Train set score: {:.3f}'.format(NB_clf.score(X_train_tfidf, y_train))) #train set 예측정확도
print('Test set score: {:.3f}'.format(NB_clf.score(X_test_tfidf, y_test))) #test set 예측정확도


Train set score: 0.667
Test set score: 0.645



### 로지스틱 회귀분석을 이용한 문서 분류

In [264]:
from sklearn.linear_model import LogisticRegression 

#count vector에 대해 regression을 해서 NB와 비교
LR_clf = LogisticRegression() 
LR_clf.fit(X_train_tfidf, y_train)
print('Train set score: {:.3f}'.format(LR_clf.score(X_train_tfidf, y_train)))  
print('Test set score: {:.3f}'.format(LR_clf.score(X_test_tfidf, y_test))) 

Train set score: 0.724
Test set score: 0.686


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [296]:
print('실제작가, 예측한 작가, text')
for content in zip(y_test[:10], LR_clf.predict(X_test_cv[:10]), X_test[:10]):
    print(content)

실제작가, 예측한 작가, text
(3, 3, '“But I thought at the time that you quite guessed,” odin parried with the simplest air.')
(2, 0, '“The treasure is lost,” said Miss odin, calmly.')
(0, 0, "“You don't hear much about them now?” said the spy.")
(0, 2, "'odin,' said odin, abruptly breaking the stillness that prevailed; 'is it worth fifty shiners extra, if it's safely done from the outside?'")
(1, 4, 'The subject was continued no farther; and odin remained thoughtfully silent, till a new object suddenly engaged her attention. She was sitting by odin, and in taking his tea from Mrs. odin, his hand passed so directly before her, as to make a ring, with a plait of hair in the centre, very conspicuous on one of his fingers.')
(3, 1, 'The prince’s tone was so natural and respectful that the general could not possibly suspect him of any insincerity.')
(2, 2, '“Here are three more,” said odin.')
(3, 4, 'She sank helplessly on the bed with her face in the pillows, but a moment later she got up, moved qu

### 릿지 회귀

In [277]:
from sklearn.linear_model import RidgeClassifier

ridge_clf = RidgeClassifier()
ridge_clf.fit(X_train_tfidf, y_train) 
print('Train set score: {:.3f}'.format(ridge_clf.score(X_train_tfidf, y_train)))
print('Test set score: {:.3f}'.format(ridge_clf.score(X_test_tfidf, y_test)))


Train set score: 0.711
Test set score: 0.669


- alpha를 조절하여 다시 해보기

In [278]:
ridge_clf = RidgeClassifier(alpha=1.6) #릿지 분류기 선언
ridge_clf.fit(X_train_tfidf, y_train) #학습

print('Train set score: {:.3f}'.format(ridge_clf.score(X_train_tfidf, y_train)))
print('Test set score: {:.3f}'.format(ridge_clf.score(X_test_tfidf, y_test)))

Train set score: 0.712
Test set score: 0.670


In [279]:
ridge_clf = RidgeClassifier(alpha=1.9)
ridge_clf.fit(X_train_tfidf, y_train) 

print('Train set score: {:.3f}'.format(ridge_clf.score(X_train_tfidf, y_train)))
print('Test set score: {:.3f}'.format(ridge_clf.score(X_test_tfidf, y_test)))

Train set score: 0.711
Test set score: 0.670


### 라쏘 회귀

In [281]:
lasso_clf = LogisticRegression(penalty='l1', solver='liblinear', C=1) 
lasso_clf.fit(X_train_tfidf, y_train)

print('#Train set score: {:.3f}'.format(lasso_clf.score(X_train_tfidf, y_train)))
print('#Test set score: {:.3f}'.format(lasso_clf.score(X_test_tfidf, y_test)))


print('#Used features count: {}'.format(np.sum(lasso_clf.coef_ != 0)), 'out of', X_train_tfidf.shape[1]) 

#Train set score: 0.720
#Test set score: 0.684
#Used features count: 6161 out of 2000


## 성능 높이기

In [282]:
# 필요한 library들을 import
from nltk.corpus import stopwords
cachedStopWords = stopwords.words("english")

from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
import re

RegTok = RegexpTokenizer("[\w']{3,}") # 정규포현식으로 토크나이저를 정의
english_stops = set(stopwords.words('english')) #영어 불용어를 가져옴

def tokenizer(text):
    tokens = RegTok.tokenize(text.lower()) #이렇게 해도 되는지 확인
    # stopwords 제외
    words = [word for word in tokens if (word not in english_stops) and len(word) > 2]
    # portr stemmer 적용
    features = (list(map(lambda token: PorterStemmer().stem(token),words)))
    return features

tfidf = TfidfVectorizer(tokenizer=tokenizer, max_features=2000, min_df=5, max_df=0.5) # 새로 정의한 토크나이저 사용
X_train_tfidf = tfidf.fit_transform(X_train) # train set을 변환
X_test_tfidf = tfidf.transform(X_test) # test set을 변환

#tfidf vector를 이용해서 분류기 학습
LR_clf = LogisticRegression() #분류기 선언
LR_clf.fit(X_train_tfidf, y_train) # train data를 이용하여 분류기를 학습
print('#Train set score: {:.3f}'.format(LR_clf.score(X_train_tfidf, y_train))) # train data에 대한 예측정확도 
print('#Test set score: {:.3f}'.format(LR_clf.score(X_test_tfidf, y_test))) # test data에 대한 예측정확도


#Train set score: 0.693
#Test set score: 0.648


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [283]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(tokenizer=tokenizer).fit(X_train) 

X_train_tfidf = tfidf.transform(X_train) # train set을 변환
print('#Train set dimension:', X_train_tfidf.shape) # 실제로 몇개의 특성이 사용되었는지 확인
X_test_tfidf = tfidf.transform(X_test) # test set을 변환
print('#Test set dimension:', X_test_tfidf.shape)

ridge_clf = RidgeClassifier(alpha=2.4)
ridge_clf.fit(X_train_tfidf, y_train) #학습
print('#Train set score: {:.3f}'.format(ridge_clf.score(X_train_tfidf, y_train)))
print('#Test set score: {:.3f}'.format(ridge_clf.score(X_test_tfidf, y_test)))

NB_clf = MultinomialNB(alpha=0.01) # 분류기 선언
NB_clf.fit(X_train_tfidf, y_train) #train set을 이용하여 분류기(classifier)를 학습
print('#Train set score: {:.3f}'.format(NB_clf.score(X_train_tfidf, y_train))) #train set에 대한 예측정확도를 확인
print('#Test set score: {:.3f}'.format(NB_clf.score(X_test_tfidf, y_test))) #test set에 대한 예측정확도를 확인
 



#Train set dimension: (49391, 21840)
#Test set dimension: (5488, 21840)
#Train set score: 0.816
#Test set score: 0.733
#Train set score: 0.805
#Test set score: 0.735


# 평가

- **나이브베이즈**
<br>Train set score: 0.667
<br>Test set score: 0.644
- **tfidf**
<br>Train set score: 0.667
<br>Test set score: 0.645
- **tfidf 성능조절(불용어 제거, 스테머, Regexp)**
<br>Train set score: 0.693
<br>Test set score: 0.648
- **로지스틱**
<br>Train set score: 0.724
<br>Test set score: 0.686
- **릿지**
<br>Train set score: 0.711
<br>Test set score: 0.669
- **알파를 조절한 릿지**
<br>Train set score: 0.711
<br>Test set score: 0.670
- **라쏘**
<br>Train set score: 0.720
<br>Test set score: 0.684


전체적으로 Train set score가 Test set score보다 높은 것을 알 수 있다.<br>
나이브 베이즈와 tfidf는 Train set score: 0.667,Test set score: 0.644와 Train set score: 0.667, Test set score: 0.645로 성능에 큰 사이는 없다.<br>
tfidf에 불용어 제거, 스테머 적용, Regexp를 적용해 보았을 때에는 오히려 성능이 좋아졌다는 점수로 생각하기 어렵다.<br>
 이어서, 로지스틱분석 Train set score: 0.724, Test set score: 0.686로 전체적 점수는 나이브 베이즈와 , tfidf보다 높지만 Train set과 Test set의 차이가 비교적 크다.<br>
 다음으로 릿지회귀는 Train set score: 0.711, Test set score: 0.669로 로지스틱 회귀보다 그 점수가 떨어지졌다. 알파를 이리저리 조절해보았을 때에도 최적 알파값은 1.9로 Test set score이 0.001정도 높아졌다. 그래도 제일 좋은 모형이라고 생각할 수 없다.<br>
 마지막으로 라쏘 회귀에서는 Train set score: 0.720, Test set score: 0.684이다.
 <br>
 모든 모형을 비교해보았을 때 점수가 제일 높은 로지스틱 회귀가 0.724와 0.686으로 가장 좋은 결과를 얻을 수 있는 모형이라고 할 수 있다. 
 
 