# TF-IDF text classification with scikit-learn

### Author: qhduan@memect.co

In [1]:
import pickle
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, f1_score, make_scorer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier

In [2]:
# Load the pickled dataset. The `with` block guarantees the file handle is
# closed as soon as loading finishes, instead of leaking it to the GC.
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only load this file if it comes from a trusted source.
with open('诗句.dat', 'rb') as data_file:
    data = pickle.load(data_file)

In [3]:
# Hold out 20% of the samples for evaluation; random_state is pinned so the
# same split is produced on every run.
features, labels = data['X'], data['y']
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.2, random_state=0
)

In [4]:
# Character-level TF-IDF over 1- and 2-character n-grams. Case is left as-is
# (lowercase=False) and n-grams that occur in fewer than 5 training documents
# are dropped (min_df=5).
vectorizer = TfidfVectorizer(analyzer='char', lowercase=False, min_df=5, ngram_range=(1, 2))
# Learn the vocabulary and IDF weights from the training split only.
X_train_vec = vectorizer.fit_transform(X_train)
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# The fitted vocabulary_ mapping has one entry per feature, so its length is
# the same number and is available on both old and new releases.
print('TFIDF dim: {}'.format(len(vectorizer.vocabulary_)))

TFIDF dim: 4941


In [5]:
# Vectorize the held-out split with the already-fitted vectorizer
# (transform only, no refit), so the test data does not influence the
# vocabulary learned from the training split.
X_test_vec = vectorizer.transform(X_test)

In [6]:
# Sanity check: print the shapes of both feature matrices and both label
# arrays so mismatched row counts or feature dimensions are easy to spot.
shapes = [arr.shape for arr in (X_train_vec, X_test_vec, y_train, y_test)]
print(*shapes)

(12126, 4941) (3032, 4941) (12126,) (3032,)


In [7]:
def _report_scores(split, y_true, y_pred):
    """Print accuracy, recall and F1 for one data split.

    Args:
        split: Label prefixed to every printed line (e.g. 'train' or 'test').
        y_true: Ground-truth labels for the split.
        y_pred: Labels predicted by the classifier for the same samples.
    """
    # recall_score / f1_score are called with their default arguments, exactly
    # as the metrics were computed before this helper was factored out.
    print('{} accuracy: {}'.format(split, accuracy_score(y_true, y_pred)))
    print('{} recall: {}'.format(split, recall_score(y_true, y_pred)))
    print('{} f1: {}'.format(split, f1_score(y_true, y_pred)))


def fit(clf, name=None):
    """Train a classifier on the TF-IDF features and print train/test scores.

    Reads the notebook-level globals X_train_vec, y_train, X_test_vec and
    y_test. The sparse matrices are converted to dense arrays before being
    handed to the estimator.

    Args:
        clf: Estimator exposing fit(X, y) and predict(X); it is fitted in place.
        name: Optional heading printed before the scores.

    Returns:
        None. Results are printed only, so calling this as the last statement
        of a cell produces no Out[] value.
    """
    # Densify the training matrix once and reuse it for both fitting and
    # predicting; previously toarray() was called twice on the same matrix.
    train_dense = X_train_vec.toarray()
    clf.fit(train_dense, y_train)
    train_pred = clf.predict(train_dense)

    if name is not None:
        print(name)
    _report_scores('train', y_train, train_pred)

    # The test matrix is only needed for this single prediction.
    test_pred = clf.predict(X_test_vec.toarray())
    _report_scores('test', y_test, test_pred)

In [8]:
# Linear SVM run; random_state is pinned to 0.
svc_model = LinearSVC(random_state=0)
fit(svc_model, name='LinearSVC')

LinearSVC
train accuracy: 0.9352630710869206
train recall: 0.9563470066518847
train f1: 0.9461849592102557
test accuracy: 0.8037598944591029
test recall: 0.8417344173441734
test f1: 0.8392326398270737


In [9]:
# Random forest run; random_state is pinned to 0 and n_jobs=-1 is passed
# through to the estimator unchanged.
forest_model = RandomForestClassifier(random_state=0, n_jobs=-1)
fit(forest_model, name='RandomForestClassifier')

RandomForestClassifier
train accuracy: 0.9904337786574303
train recall: 0.9898835920177383
train f1: 0.9919455631162339
test accuracy: 0.7097625329815304
test recall: 0.7457994579945799
test f1: 0.7577092511013215


In [10]:
# k-nearest-neighbours run with the estimator's default hyperparameters.
knn_model = KNeighborsClassifier()
fit(knn_model, name='KNeighborsClassifier')

KNeighborsClassifier
train accuracy: 0.8329209962064984
train recall: 0.8562915742793792
train f1: 0.8591490545050056
test accuracy: 0.7226253298153035
test recall: 0.7642276422764228
test f1: 0.7702813438951105


In [11]:
# Gaussian naive Bayes run with the estimator's default hyperparameters.
nb_model = GaussianNB()
fit(nb_model, name='GaussianNB')

GaussianNB
train accuracy: 0.7291769750948376
train recall: 0.5507206208425721
train f1: 0.7076210826210826
test accuracy: 0.6220316622691293
test recall: 0.44119241192411923
test f1: 0.5868781542898341
