# Text Classification

## I. Example using Training set

In [1]:
'''
Loading Train Dataset

Walks DATA_DIR, which is expected to contain one sub-directory per author,
and collects the text of every ``.txt`` file together with the name of the
directory (the author) it was found in.

Results:
    articles -- list of raw article texts
    authors  -- NumPy array of author names, aligned with ``articles``
'''
import os
import numpy as np

DATA_DIR = 'data/C50/C50train'

articles = []
authors = []

# NOTE: os.listdir() returns entries in arbitrary order, so the position of a
# given article in ``articles`` may differ between machines.
for authorname in os.listdir(DATA_DIR):
    # Skip hidden entries (names starting with a dot).
    if authorname.startswith('.'):
        continue
    author_dir = os.path.join(DATA_DIR, authorname)
    for filename in os.listdir(author_dir):
        if not filename.endswith('.txt'):
            continue
        filepath = os.path.join(author_dir, filename)
        # The context manager closes each handle as soon as it has been read,
        # instead of leaving one open file per article behind.
        with open(filepath, 'r') as file:
            text = file.read()
        articles.append(text)
        authors.append(authorname)

# Converted to a NumPy array so it can be indexed with an array of positions.
authors = np.array(authors)

In [2]:
'''
Document -> Tf-idf

Turns the raw training articles into an n-gram count matrix and then
re-weights those counts with tf-idf. The fitted ``vectorizer`` and
``transformer`` are kept so the same mapping can be applied to other text.
'''

import nltk

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# Earlier configuration, kept for reference: no n-gram range given, English
# stop words removed, vocabulary capped at 3000 entries.
vectorizer = CountVectorizer(
    tokenizer=nltk.word_tokenize,
    ngram_range=(1, 3),
    min_df=2,
    max_features=80000,
)
count_features = vectorizer.fit_transform(articles)

# Earlier configuration used TfidfTransformer() with its defaults; the current
# one enables sublinear term-frequency scaling.
transformer = TfidfTransformer(sublinear_tf=True)
tfidf_features = transformer.fit_transform(count_features)

In [3]:
'''
Document similarity example

Uses cosine similarity between tf-idf rows to (1) compare two individual
training articles and (2) list the training articles most similar to a
chosen query article, together with their authors.
'''
from sklearn.metrics.pairwise import cosine_similarity

# Positions refer to rows of ``tfidf_features`` (and entries of ``authors``).
QUERY_INDEX = 1195   # article used as the query
OTHER_INDEX = 1      # article compared against the query in the pairwise demo
TOP_K = 4            # how many nearest articles to list (the query included)

query_vector = tfidf_features[QUERY_INDEX]

# Similarity between two single documents: QUERY_INDEX versus OTHER_INDEX.
print(cosine_similarity(query_vector, tfidf_features[OTHER_INDEX]))

# Similarity of the query against every training document (one row of scores).
cosine_similarities = cosine_similarity(query_vector, tfidf_features)
# argsort is ascending, so the most similar documents sit at the end.
related_docs_indices = cosine_similarities.argsort()[0]

# Slice is begin:end:step -- walk backwards from the end to take the TOP_K
# highest-scoring positions, best first.
top_indices = related_docs_indices[:-(TOP_K + 1):-1]
print(top_indices)
# The query itself is expected first, followed by the authors of the most
# similar articles.
print(authors[top_indices])

[[0.04217813]]
[1195 1159 1189 1199]
['RobinSidel' 'RobinSidel' 'RobinSidel' 'RobinSidel']


## II. Text Classification

In [4]:
'''
Loading Test Dataset

Walks DATA_DIR, which is expected to contain one sub-directory per author,
and collects the text of every ``.txt`` file together with the name of the
directory (the author) it was found in.

Results:
    test_corpus  -- list of raw article texts
    test_authors -- NumPy array of author names, aligned with ``test_corpus``
'''

DATA_DIR = 'data/C50/C50test'

test_corpus = []
test_authors = []

for authorname in os.listdir(DATA_DIR):
    # Skip hidden entries (names starting with a dot).
    if authorname.startswith('.'):
        continue
    author_dir = os.path.join(DATA_DIR, authorname)
    for filename in os.listdir(author_dir):
        if not filename.endswith('.txt'):
            continue
        filepath = os.path.join(author_dir, filename)
        # The context manager closes each handle as soon as it has been read,
        # instead of leaving one open file per article behind.
        with open(filepath, 'r') as file:
            text = file.read()
        test_corpus.append(text)
        test_authors.append(authorname)

test_authors = np.array(test_authors)

In [5]:
'''
Label Encoding

Maps author names to integer class ids. The mapping is learned from the
training authors and then applied to both the training and the test authors.
'''

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

# Learn the name -> id mapping from the training split only.
label_encoder = LabelEncoder().fit(authors)

labels = label_encoder.transform(authors)
test_labels = label_encoder.transform(test_authors)

In [6]:
'''
Train Naive Bayesian Classifier

Fits a multinomial Naive Bayes model on the training tf-idf matrix and the
encoded author labels.
'''
from sklearn.naive_bayes import MultinomialNB

# alpha is the additive smoothing strength passed to the model.
nb_model = MultinomialNB(alpha=0.01)
classifier = nb_model.fit(tfidf_features, labels)

In [7]:
'''
Test Classifier with All Test Dataset

Pushes the test articles through the already-fitted vectorizer and tf-idf
transformer, then reports the classifier's score on them.
'''
# transform() only: the vocabulary and idf weights come from the training data.
test_counts = vectorizer.transform(test_corpus)
test_tfidf = transformer.transform(test_counts)

accuracy = classifier.score(test_tfidf, test_labels)
accuracy

0.7268

In [8]:
'''
Important Features

Prints the n highest- and n lowest-weighted vocabulary entries for row 0 of
the classifier's per-class feature log-probabilities, i.e. for the first
class only (presumably label_encoder.classes_[0] -- confirm).
'''
n = 20

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer its replacement and fall back on older installs. list() keeps
# ``feature_names`` a plain list, as it was before.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = list(vectorizer.get_feature_names_out())
else:
    feature_names = vectorizer.get_feature_names()

# MultinomialNB.coef_ was deprecated in scikit-learn 0.24 and removed in 1.1.
# For a model with more than two classes it mirrored feature_log_prob_, which
# is available in every version, so row 0 yields the same numbers.
class_log_probs = classifier.feature_log_prob_[0]

# Sort once (ascending by weight, ties broken by feature name) and slice both
# ends instead of sorting the whole vocabulary twice.
ranked = sorted(zip(class_log_probs, feature_names))
topn = ranked[-n:]
bottomn = ranked[:n]

print("Top")
for coef, feature in topn:
    print(feature, coef)

print("\nBottom")
for coef, feature in bottomn:
    print(feature, coef)

Top
that -7.589123179817156
association -7.550885470559749
'' -7.513471363975897
`` -7.508719965946974
said -7.498146462752036
in -7.473058024567271
federal -7.468570594133816
and -7.422707727197643
banks -7.39160737414375
online -7.3431551980342284
a -7.340998695467735
congress -7.287753464976623
to -7.271270185164254
of -7.247128806142091
encryption -7.236134611632388
. -7.136179974898932
, -7.079580129827989
the -7.016485992773483
the internet -6.990050008164698
internet -6.80142025904969

Bottom
! -12.147437463428275
! service -12.147437463428275
$ 0.07 -12.147437463428275
$ 0.08 -12.147437463428275
$ 0.10 -12.147437463428275
$ 0.10 to -12.147437463428275
$ 0.14 -12.147437463428275
$ 0.15 -12.147437463428275
$ 0.17 -12.147437463428275
$ 0.18 -12.147437463428275
$ 0.19 -12.147437463428275
$ 0.20 -12.147437463428275
$ 0.21 -12.147437463428275
$ 0.22 -12.147437463428275
$ 0.29 -12.147437463428275
$ 0.30 -12.147437463428275
$ 0.33 -12.147437463428275
$ 0.35 -12.147437463428275
$ 0.36 -

In [9]:
'''
Confusion Matrix

Predicts a label for every test article and prints the confusion matrix of
true labels against predicted labels.
'''
from sklearn.metrics import confusion_matrix

predicted_labels = classifier.predict(test_tfidf)

# First argument is the true labels, second the predictions.
test_confusion = confusion_matrix(test_labels, predicted_labels)
print(test_confusion)

[[47  0  0 ...  0  0  0]
 [ 0 30  0 ...  0  0  0]
 [ 0  0 24 ...  0  0  0]
 ...
 [ 0  0  0 ... 46  0  0]
 [ 0  0  0 ...  0 25  0]
 [ 0  0  0 ...  0  0 21]]


### Toy Example

In [10]:
'''
Toy Example

Classifies two hand-written sentences with the trained pipeline: prints the
predicted class ids and the matching author names, then displays the
per-class probabilities.
'''
sample_doc = [
    'Strong business fundamentals',
    'Solid growth in the Commonwealth Bank',
]

# Same fitted vectorizer and tf-idf transformer as used for the training data.
sample_count = vectorizer.transform(sample_doc)
sample_tfidf = transformer.transform(sample_count)

predicted = classifier.predict(sample_tfidf)
print(predicted)

# Translate the integer class ids back into author names.
predicted_authors = label_encoder.inverse_transform(predicted)
print(predicted_authors)

classifier.predict_proba(sample_tfidf)

[11  4]
['GrahamEarnshaw' 'BernardHickey']


array([[6.30545512e-03, 2.08021440e-02, 2.74904530e-02, 3.42058319e-03,
        4.97121099e-02, 6.66911245e-02, 6.71803535e-03, 4.02381960e-03,
        6.77594610e-03, 6.85772783e-02, 2.18180202e-02, 1.03771736e-01,
        1.59297965e-02, 4.95393677e-03, 3.35593147e-03, 5.31474840e-03,
        7.11464870e-03, 7.66082238e-03, 3.12675117e-02, 9.14398145e-03,
        1.85950499e-03, 2.72297626e-02, 6.35627096e-02, 6.72767580e-03,
        5.77898393e-03, 3.02125576e-02, 3.56480992e-02, 2.21146442e-02,
        2.86874946e-03, 2.90749496e-03, 7.69847728e-03, 3.10817611e-02,
        4.06042380e-03, 3.13534211e-02, 1.64056376e-02, 3.01760157e-02,
        6.84779345e-03, 5.36348828e-03, 5.07956987e-03, 2.30718438e-02,
        4.28481157e-03, 4.77377001e-02, 2.11853692e-02, 2.77124247e-03,
        7.96276706e-03, 4.74721141e-03, 5.82312416e-02, 1.18517477e-02,
        5.85237864e-03, 4.47883376e-03],
       [1.93298861e-03, 1.71263229e-02, 4.99039506e-03, 4.73034748e-04,
        5.92312020e-01,