## 원래 코드: MultinomialNB & TfidfVetorizer

In [11]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
import numpy as np

categories = ['rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey']
newsgroups_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'), categories=categories)
newsgroups_test = fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes'), categories=categories)
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(newsgroups_train.data)
vectors_test = vectorizer.transform(newsgroups_test.data)

# model: MultimomialNB
clf = MultinomialNB(alpha=.01)
clf.fit(vectors, newsgroups_train.target)
pred = clf.predict(vectors_test)
#f-1 score
metrics.f1_score(newsgroups_test.target, pred, average='macro')

0.8512083976713682

In [12]:
def show_top10(classifier, vectorizer, categories):
  feature_names = np.asarray(vectorizer.get_feature_names())
  for i, category in enumerate(categories):
    top10 = np.argsort(classifier.coef_[i])[-10:]
    print("%s: %s" % (category, " ".join(feature_names[top10])))

show_top10(clf, vectorizer, newsgroups_train.target_names)

rec.autos: that is in car you it of and to the
rec.motorcycles: for is that in you of it and to the
rec.sport.baseball: for was is that in of he and to the
rec.sport.hockey: you it is he that of and in to the


F-1 score 높이기!

## Try1: Logistic Regresion & TfidfVectorizer

In [9]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
import numpy as np

categories = ['rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey']
newsgroups_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'), categories=categories)
newsgroups_test = fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes'), categories=categories)
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(newsgroups_train.data)
vectors_test = vectorizer.transform(newsgroups_test.data)

# model
clf = LogisticRegression()
clf.fit(vectors, newsgroups_train.target)
pred = clf.predict(vectors_test)
#f-1 score
metrics.f1_score(newsgroups_test.target, pred, average='macro')

0.8400782291758823

#### 0.8512083976713682 -> 0.8400782291758823
#### -> Logistic Regression보다 multinomialNB 사용이 더 좋음

## Try2: 전처리 + MultinomailMB & TfidfVectorizer

In [53]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
import numpy as np
import re

categories = ['rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey']
newsgroups_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'), categories=categories)
newsgroups_test = fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes'), categories=categories)

In [66]:
# 데이터 특수문자 정제 함수 정의하기
def clean_text(data):
  clean_text = re.sub('(http|ftp|https)://(?:[-\w.]|(?:%[\da-fA-F{2}]))+','',data)
  clean_text = re.sub('<[^>]*>','',clean_text)
  clean_text = re.sub('[-=+_,.#/\?;:^\\n$@*\"※\n~&%ㆍ!』\\‘’“”{}|\(\)\[\]\<\>`\'》❤♥♡☆★▲◇△▶◆■⊙▦ⓒ●◈·]',' ',clean_text)
  clean_text = clean_text.replace("\t"," ")
  clean_text = clean_text.replace("  "," ")
  return clean_text

In [67]:
# 데이터 정제하기
clean_train = []
for i in newsgroups_train.data:
    clean_train.append(clean_text(i))

clean_test = []
for i in newsgroups_test.data:
    clean_test.append(clean_text(i))

정제된 데이터 확인

In [77]:
clean_train[:5]

['My friend brought a subaru SVX recently  I had drove it for couples times and I think its a great car esp on snow  However when she took it to a local Subaru dealer for a oil change the bill came out to be about 80 dollars  The dealer told us it is because to change the oil filter on a SVX it is necessary to disassemble a metal cover under the engine and that took an hour of labour At first we think we are being ripped off so she phone to a dealer in Toronto but found out the they are charging roughly the same price  So is there any SVX owner out there that has the same problem  And if the oil change story is true then the engineer of Subaru looks pretty stubid to me By the way the car looks great ',
 'Scoring stats for the Swedish NHL players April 5                             Mats Sundin watch   Most points during a season    131 Kent Nilsson Calgary Flames    1980 81  49 82   110 Mats Naslund Montreal Canadiens  1985 86  43 67   109 Mats Sundin Quebec Nordiques   1992 93  43 66  

In [78]:
clean_test[:5]

['  Is there a Chicago Cubs mailing list  If so I d like to join Any help appreciated    ',
 ' We cannot isolate completely Roger but we can make a pretty good estimate I won t claim to split hairs and say that we can really measure who was better Robby Alomar or Carlos Baerga last year the difference is too close to call But Larkin and Lee Clemens and Morris The differences are too great there  In your measure of the game why should a team that has just won it all ever replace a single player Since they are now clearly best  how can they do better Yet every team can always find someplace where they beleive they can improve the team they can always find a player a little better than one they already have  BTW by my definitions the best player is the one who does the most things to help his team win I will allow that this could vary depending on who else is on the team by having aptitudes one team needs more than others  Baseball is a team game but it is made of individual talents It is

In [73]:
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(clean_train)
vectors_test = vectorizer.transform(clean_test)

# model
clf = MultinomialNB(alpha=0.1) #알파 값 조정
clf.fit(vectors, newsgroups_train.target)
pred = clf.predict(vectors_test)
#f-1 score
metrics.f1_score(newsgroups_test.target, pred, average='macro')

0.8735907881440508

#### 0.8512083976713682 -> 0.8735907881440508
#### 전처리 후 macro f-1 scroe 약 0.022 증가함.