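This project aims to classify the sentiment (positive or negative) of the NLTK `movie_reviews` corpus using a bag-of-words model. Each review is represented as a feature vector; TF-IDF values are compared with raw term counts and hashed features.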

In [1]:
import numpy as np
import nltk
import string
import numpy as np
from nltk.corpus import movie_reviews
import matplotlib.pyplot as plt
%matplotlib inline

# 1.Fetch Data

In [2]:
# Peek at the first 400 characters of the first file in the corpus.
first = movie_reviews.fileids()[0]
movie_reviews.open(first).read()[:400]

'plot : two teen couples go to a church party , drink and then drive . \nthey get into an accident . \none of the guys dies , but his girlfriend continues to see him in her life , and has nightmares . \nwhat\'s the deal ? \nwatch the movie and " sorta " find out . . . \ncritique : a mind-fuck movie for the teen generation that touches on a very cool idea , but presents it in a very bad package . \nwhich i'

In [3]:
# Report how many files are listed under the 'pos' and 'neg' categories.
print('积极评论数量：', len(movie_reviews.fileids('pos')))
print('负面评论数量：', len(movie_reviews.fileids('neg')))

积极评论数量： 1000
负面评论数量： 1000


In [4]:
# Read every review together with its category label,
# producing a list of (raw_text, category) tuples.
document_label_pairs = [(movie_reviews.raw(fileid), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

In [5]:
# Shuffle the (review, label) pairs with a fixed seed so every run sees the
# same order. A new list is built instead of shuffling in place, so re-running
# this cell is idempotent and `document_label_pairs` keeps its original order.
shuffle_order = np.random.RandomState(111).permutation(len(document_label_pairs))
document_label_pairs_shuffled = [document_label_pairs[i] for i in shuffle_order]
# Split the pairs into two parallel tuples: the texts and their labels.
documents, labels = zip(*document_label_pairs_shuffled)

## Divide data into training and testing sets

In [6]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the documents for testing; random_state pins train_test_split's own shuffling.
doc_train, doc_test, label_train, label_test = train_test_split(documents, labels, test_size=0.3, random_state=111)

## Preprocess data

Remove punctuation, lemmatize words and extract features.

In [50]:
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import re
punctuations = string.punctuation
class textVectorizer:
    '''
    Clean raw texts and turn them into feature vectors.

    Args:
    train_data: a list of texts(strings) used to fit the vectorizer
    vectorizer: an object exposing fit_transform(texts) and transform(texts)
    is_lemmatize: if True, lemmatize every token before vectorizing
    '''
    # Every run of non-letters (punctuation, digits, line ends, non-ASCII
    # characters, repeated blanks) collapses into a single space in one pass.
    _NON_LETTERS = re.compile('[^a-zA-Z]+')

    def __init__(self, train_data, vectorizer, is_lemmatize=True):
        self._train_data = train_data
        self._vectorizer = vectorizer
        self._is_lemmatize = is_lemmatize

    def proceed(self):
        '''
        Clean the training texts and fit the vectorizer on them.

        The raw training texts are left untouched, so calling this method
        again starts from the same input.

        Returns:
        whatever the vectorizer's fit_transform returns for the cleaned texts
        '''
        X_train = self._vectorizer.fit_transform(self._clean(self._train_data))
        return X_train

    def vectorize(self, texts):
        '''
        Vectorize input texts with the already fitted vectorizer
        Args:
        texts: list of texts(strings)
        Returns:
        whatever the vectorizer's transform returns for the cleaned texts
        '''
        # Same cleaning path as proceed(); this also works when lemmatization
        # is disabled (previously that case hit an unbound local variable).
        return self._vectorizer.transform(self._clean(texts))

    def _clean(self, texts):
        '''Apply character cleanup and, if enabled, lemmatization'''
        cleaned = self._preprocess(texts)
        if self._is_lemmatize:
            cleaned = self._lemmatize(cleaned)
        return cleaned

    def _preProcessor(self, s):
        '''Keep only ASCII letters, lower-cased and separated by single spaces'''
        s = self._NON_LETTERS.sub(' ', s)
        s = s.lower()
        # Only trailing blanks are removed; a leading blank (text that starts
        # with a non-letter) is kept, as before.
        return s.rstrip()

    def _preprocess(self, texts):
        '''Run the character-level cleanup on every text'''
        return [self._preProcessor(item) for item in texts]

    def _lemmatize(self, texts):
        '''Lemmatize words into original forms'''
        lemmatizer = WordNetLemmatizer()
        return [
            ' '.join(lemmatizer.lemmatize(word) for word in word_tokenize(text))
            for text in texts
        ]
   

In [45]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
# Import from the public package: the `univariate_selection` submodule is an
# implementation detail and is not importable in newer scikit-learn releases.
from sklearn.feature_selection import chi2, SelectKBest

# Keep the 10000 features with the highest chi-squared score (k passed by
# keyword, which every scikit-learn version accepts).
selectKBest = SelectKBest(chi2, k=10000)

# Vectorizers 1-3 use unigrams and bigrams; 4-6 use unigrams only.
vectorizer1 = CountVectorizer(ngram_range=(1, 2), stop_words='english')
vectorizer2 = TfidfVectorizer(ngram_range=(1, 2), stop_words='english')
# alternate_sign=False replaces the removed `non_negative=True` option and keeps
# all hashed feature values non-negative, as chi2 and MultinomialNB require.
vectorizer3 = HashingVectorizer(ngram_range=(1, 2), alternate_sign=False, stop_words='english')
vectorizer4 = CountVectorizer(stop_words='english')
vectorizer5 = TfidfVectorizer(stop_words='english')
vectorizer6 = HashingVectorizer(alternate_sign=False, stop_words='english')

In [42]:
#Specify the classification models to compare
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
mlp = MLPClassifier(hidden_layer_sizes=(100,), max_iter=300, alpha=1e-4,
                    solver='adam', verbose=False, tol=1e-5, random_state=1,
                    learning_rate_init=.17)
# alpha is passed by keyword: recent scikit-learn releases reject it positionally.
nb = MultinomialNB(alpha=0.01)
lr = LogisticRegression(C=0.5)
# Seed the forest so repeated runs of the notebook report the same scores.
rf = RandomForestClassifier(n_estimators=100, n_jobs=4, random_state=1)

In [48]:
import time
from sklearn.metrics import f1_score, accuracy_score
def train_test(clf, X_train=None, X_test=None, y_train=None, y_test=None):
    """Fit ``clf`` and print its fit time plus training and testing accuracy.

    Args:
        clf: estimator exposing ``fit(X, y)`` and ``predict(X)``.
        X_train: training feature matrix (required despite the ``None`` default).
        X_test: testing feature matrix (required despite the ``None`` default).
        y_train: training labels; defaults to the notebook-level ``label_train``.
        y_test: testing labels; defaults to the notebook-level ``label_test``.

    Raises:
        ValueError: if ``X_train`` or ``X_test`` is not supplied.
    """
    if X_train is None or X_test is None:
        raise ValueError('train_test() needs both X_train and X_test feature matrices')
    # Fall back to the notebook-level label splits when none are passed in.
    if y_train is None:
        y_train = label_train
    if y_test is None:
        y_test = label_test

    # Train, timing only the fit call
    start = time.perf_counter()
    clf.fit(X_train, y_train)
    end = time.perf_counter()
    print('训练时间：{:.3f}'.format(end-start))

    # Accuracy on the data the model was fitted on
    preds = clf.predict(X_train)
    print('Training accuracy: {:.3f}'.format(accuracy_score(y_train, preds)))

    # Accuracy on the held-out data
    preds = clf.predict(X_test)
    print('Testing Accuracy: {:.3f}'.format(accuracy_score(y_test, preds)))

In [49]:
# Every vectorizer is paired with every classifier below.
vectorizers = [vectorizer1, vectorizer2, vectorizer3, vectorizer4, vectorizer5, vectorizer6]
#Train the model with several classifiers
# The same estimator objects are refitted for each vectorizer.
models = [lr, rf, nb]
for vectorizer in vectorizers:
    print('*'*30)
    print(vectorizer.__class__)
    # Fit the vectorizer on the training documents, then reuse it for the test documents.
    tv = textVectorizer(doc_train, vectorizer)
    X_train = tv.proceed()
    X_test = tv.vectorize(doc_test)
    # Feature selection is fitted on the training split only and then applied to the test split.
    X_train = selectKBest.fit_transform(X_train, label_train)
    X_test = selectKBest.transform(X_test)
    # NOTE: X_train / X_test stay bound to the last vectorizer's features after the loop ends.
    for model in models:      
        print(model.__class__)
        train_test(model, X_train, X_test)

******************************
<class 'sklearn.feature_extraction.text.CountVectorizer'>
<class 'sklearn.linear_model.logistic.LogisticRegression'>
训练时间：0.076
Training accuracy: 1.000
Testing Accuracy: 0.835
<class 'sklearn.ensemble.forest.RandomForestClassifier'>
训练时间：0.416
Training accuracy: 1.000
Testing Accuracy: 0.795
<class 'sklearn.naive_bayes.MultinomialNB'>
训练时间：0.007
Training accuracy: 0.996
Testing Accuracy: 0.762
******************************
<class 'sklearn.feature_extraction.text.TfidfVectorizer'>
<class 'sklearn.linear_model.logistic.LogisticRegression'>
训练时间：0.010
Training accuracy: 0.939
Testing Accuracy: 0.777
<class 'sklearn.ensemble.forest.RandomForestClassifier'>
训练时间：0.416
Training accuracy: 1.000
Testing Accuracy: 0.807
<class 'sklearn.naive_bayes.MultinomialNB'>
训练时间：0.004
Training accuracy: 0.996
Testing Accuracy: 0.765
******************************
<class 'sklearn.feature_extraction.text.HashingVectorizer'>




<class 'sklearn.linear_model.logistic.LogisticRegression'>
训练时间：0.019
Training accuracy: 0.854
Testing Accuracy: 0.752
<class 'sklearn.ensemble.forest.RandomForestClassifier'>
训练时间：0.432
Training accuracy: 1.000
Testing Accuracy: 0.798
<class 'sklearn.naive_bayes.MultinomialNB'>
训练时间：0.006
Training accuracy: 0.994
Testing Accuracy: 0.763
******************************
<class 'sklearn.feature_extraction.text.CountVectorizer'>
<class 'sklearn.linear_model.logistic.LogisticRegression'>
训练时间：0.087
Training accuracy: 1.000
Testing Accuracy: 0.837
<class 'sklearn.ensemble.forest.RandomForestClassifier'>
训练时间：0.417
Training accuracy: 1.000
Testing Accuracy: 0.780
<class 'sklearn.naive_bayes.MultinomialNB'>
训练时间：0.008
Training accuracy: 0.986
Testing Accuracy: 0.748
******************************
<class 'sklearn.feature_extraction.text.TfidfVectorizer'>
<class 'sklearn.linear_model.logistic.LogisticRegression'>
训练时间：0.012
Training accuracy: 0.942
Testing Accuracy: 0.777
<class 'sklearn.ensembl



<class 'sklearn.linear_model.logistic.LogisticRegression'>
训练时间：0.018
Training accuracy: 0.870
Testing Accuracy: 0.778
<class 'sklearn.ensemble.forest.RandomForestClassifier'>
训练时间：0.420
Training accuracy: 1.000
Testing Accuracy: 0.802
<class 'sklearn.naive_bayes.MultinomialNB'>
训练时间：0.006
Training accuracy: 0.978
Testing Accuracy: 0.777


### Tune Parameters

We select logistic regression as the final model and use grid search with 5-fold cross-validation on the training set to find the regularization parameter `C` that performs best; the tuned model is then evaluated on the testing set.

In [40]:
from sklearn.model_selection import GridSearchCV
# Candidate values of C to try for logistic regression.
parameters = {'C': [2, 1, 0.5, 0.2, 0.1, 0.05]}
lr = LogisticRegression()
gs = GridSearchCV(estimator=lr, param_grid=parameters, cv=5)
# train_test needs the feature matrices passed explicitly. X_train / X_test are
# the ones left in the namespace by the last iteration of the comparison loop.
train_test(gs, X_train, X_test)

训练时间：13.833
Training accuracy: 0.960
Testing Accuracy: 0.817
