> Text 분류 과제

강의 실습 교안의 기계학습을 이용한 newsgroup 분류 중 분류 카테고리를 아래의 4종류로 바꾸어서 colab에서 실습해보고,   
  
'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey'  
  
다른 알고리즘이나 tf vector, 전처리등을 이용하여 macro F1-score를 높여 보세요.

교안에 나온대로 header, footer, quote은 제외하고 실험하세요.

> Baseline

In [67]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer

# 카테고리 변경
categories = ['rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey']
# header, footer, quote 필터링
newsgroups_train = fetch_20newsgroups(subset = 'train',
                                      remove = ('headers','footers','quotes'),
                                      categories = categories)

vectorizer = TfidfVectorizer(max_features = 1000)
vectors = vectorizer.fit_transform(newsgroups_train.data)

In [68]:
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

# header, footer, quote 필터링
newsgroups_test = fetch_20newsgroups(subset = 'test',
                                     remove = ('headers','footers','quotes'),
                                     categories = categories)
vectors_test = vectorizer.transform(newsgroups_test.data)

In [69]:
# Naive Bayes
clf = MultinomialNB(alpha = .01)
clf.fit(vectors,newsgroups_train.target)
pred = clf.predict(vectors_test)

metrics.f1_score(newsgroups_test.target,pred,average = 'macro')

0.7966588763644664

> 전처리, Tf vector, 다른 알고리즘

In [15]:
import nltk
nltk.download('all',quiet = True)
from nltk.corpus import names
from nltk.stem import WordNetLemmatizer

all_names = set(names.words())
lemmatizer = WordNetLemmatizer()

def clean_text(docs):
  cleaned_docs = []
  for doc in docs:
    lemmatized_list = [ lemmatizer.lemmatize(word.lower())
    for word in doc.split()
    if word.isalpha() and word not in all_names]
    cleaned_docs.append(' '.join(lemmatized_list))
  return cleaned_docs

In [70]:
# 전처리
cleaned_train = clean_text(newsgroups_train.data)
cleaned_test = clean_text(newsgroups_test.data)

# Tf vector
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(cleaned_train)
vectors_test = vectorizer.transform(cleaned_test)

In [73]:
# SVC
from sklearn.svm import SVC
import itertools

parameters = {'C':(100,1e3,1e4,1e5),
              'gamma':(1e-8,1e-7,1e-6,1e-5)}
results = []

for c,gam in itertools.product(parameters['C'],parameters['gamma']):
  clf = svc = SVC(kernel = 'rbf',C = c,gamma = gam)
  clf.fit(vectors,newsgroups_train.target)
  pred = clf.predict(vectors_test)
  score = metrics.f1_score(newsgroups_test.target,pred,average = 'macro')
  results.append((c,gam,score))

results = pd.DataFrame(results)
results.columns = ['C','gamma','F1-score']
results.sort_values(by = ['F1-score'],ascending = False,axis = 0,inplace = True)

In [74]:
results

Unnamed: 0,C,gamma,F1-score
15,100000.0,1e-05,0.753743
11,10000.0,1e-05,0.751939
14,100000.0,1e-06,0.749392
0,100.0,1e-08,0.100302
1,100.0,1e-07,0.100302
2,100.0,1e-06,0.100302
3,100.0,1e-05,0.100302
4,1000.0,1e-08,0.100302
5,1000.0,1e-07,0.100302
6,1000.0,1e-06,0.100302


In [89]:
# Random Forest
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.model_selection import KFold,StratifiedKFold

param_grid = {
    'n_estimators':[200,500,800],
    'max_depth':[2,4,6],
    'min_samples_split':[4,12,20],
    'min_samples_leaf':[2,6,10]
}

rf_grid = GridSearchCV(estimator = RFC(random_state = 123), param_grid = param_grid,
                       cv = KFold(n_splits = 5, random_state = 42), 
                       n_jobs = -1, verbose = 2)
rf_grid.fit(vectors, newsgroups_train.target)

Fitting 5 folds for each of 108 candidates, totalling 540 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:   35.3s
[Parallel(n_jobs=-1)]: Done 158 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done 361 tasks      | elapsed:  6.5min
[Parallel(n_jobs=-1)]: Done 540 out of 540 | elapsed: 10.6min finished


GridSearchCV(cv=KFold(n_splits=5, random_state=42, shuffle=False),
             error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_sco

In [90]:
rf_grid.best_params_

{'max_depth': 6,
 'min_samples_leaf': 6,
 'min_samples_split': 20,
 'n_estimators': 800}

In [91]:
rf_grid.best_score_

0.7316623246756665

In [97]:
clf = RFC(n_estimators = 800,max_depth = 6,min_samples_split = 20,min_samples_leaf = 6,random_state = 123,n_jobs = -1)
clf.fit(vectors,newsgroups_train.target)
pred = clf.predict(vectors_test)
metrics.f1_score(newsgroups_test.target,pred,average = 'macro')

0.7575398900983039

In [112]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression

C = [0.001,0.01,0.1,0.5,1,10]

results = []

for c in C:
  clf = LogisticRegression(solver = 'liblinear',C = c,random_state = 123)
  clf.fit(vectors,newsgroups_train.target)
  pred = clf.predict(vectors_test)
  score = metrics.f1_score(newsgroups_test.target,pred,average = 'macro')
  results.append((a,score))

results = pd.DataFrame(results)
results.columns = ['C','F1-score']
results.sort_values(by = ['F1-score'],ascending = False,axis = 0,inplace = True)

In [113]:
results

Unnamed: 0,C,F1-score
4,1,0.783642
5,1,0.77834
3,1,0.774504
2,1,0.750545
1,1,0.705249
0,1,0.492251


In [98]:
import pandas as pd

# Naive Bayes
alpha = [0.001,0.01,0.1,0.5,1]
results = []

for a in alpha:
  clf = MultinomialNB(alpha = a)
  clf.fit(vectors,newsgroups_train.target)
  pred = clf.predict(vectors_test)
  score = metrics.f1_score(newsgroups_test.target,pred,average = 'macro')
  results.append((a,score))

results = pd.DataFrame(results)
results.columns = ['alpha','F1-score']
results.sort_values(by = ['F1-score'],ascending = False,axis = 0,inplace = True)

In [99]:
results

Unnamed: 0,alpha,F1-score
3,0.5,0.818373
2,0.1,0.812895
4,1.0,0.812784
1,0.01,0.799572
0,0.001,0.785508


Naive Bayes with alpha = 0.5 모델에서의 성능이 가장 좋았다.