# Hashing-vectorizer text classification baselines with scikit-learn

### Author: qhduan@memect.co

In [1]:
import pickle
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, make_scorer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier

In [2]:
# NOTE(security): pickle.load can execute arbitrary code while unpickling,
# so '诗句.dat' must come from a trusted source.
# The context manager closes the file handle as soon as loading finishes
# instead of leaving it open for the lifetime of the kernel.
with open('诗句.dat', 'rb') as data_file:
    data = pickle.load(data_file)

In [3]:
# Hold out 20% of the samples for evaluation; the seed is pinned so the
# split is the same on every run.
(X_train, X_test,
 y_train, y_test) = train_test_split(
    data['X'],
    data['y'],
    random_state=0,
    test_size=0.2,
)

In [4]:
# Hash character 1-grams and 2-grams into a fixed 256-column feature space.
# Case is left untouched (lowercase=False).
vectorizer = HashingVectorizer(
    analyzer='char',
    ngram_range=(1, 2),
    lowercase=False,
    n_features=256,
)
X_train_vec = vectorizer.fit_transform(X_train)

In [5]:
# Vectorize the held-out texts with the same vectorizer object, so train and
# test features are produced with identical settings.
X_test_vec = vectorizer.transform(X_test)

In [6]:
# Sanity check: show the shapes of both feature matrices and both label arrays.
print(*(part.shape for part in (X_train_vec, X_test_vec, y_train, y_test)))

(12126, 256) (3032, 256) (12126,) (3032,)


In [7]:
def fit(clf, name=None):
    """Fit ``clf`` on the training split and print train/test metrics.

    The function reads ``X_train_vec``, ``X_test_vec``, ``y_train`` and
    ``y_test`` from the notebook's global namespace, so the cells that
    define them must have been run first. Both feature matrices are
    converted with ``.toarray()`` before being handed to the estimator.

    Parameters
    ----------
    clf : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``. It is fitted in
        place.
    name : str, optional
        Label printed before the metrics. Nothing is printed for it when
        ``None``.

    Returns
    -------
    None
        Precision, recall and F1 for each split are written to stdout.
    """
    # Densify the training matrix once and reuse it for both fitting and
    # predicting, rather than converting the same matrix twice.
    train_dense = X_train_vec.toarray()
    clf.fit(train_dense, y_train)
    pred_train = clf.predict(train_dense)

    if name is not None:
        print(name)

    # (label, scorer) pairs in the order they are reported.
    metrics = (
        ('precision', precision_score),
        ('recall', recall_score),
        ('f1', f1_score),
    )
    for label, scorer in metrics:
        print('train {}: {}'.format(label, scorer(y_train, pred_train)))

    pred_test = clf.predict(X_test_vec.toarray())
    for label, scorer in metrics:
        print('test {}: {}'.format(label, scorer(y_test, pred_test)))

In [8]:
# Linear SVM with the seed pinned to 0.
fit(
    clf=LinearSVC(random_state=0),
    name='LinearSVC',
)

LinearSVC
train precision: 0.6686092121483805
train recall: 0.8267738359201774
train f1: 0.7393270958547618
test precision: 0.6665188470066519
test recall: 0.8146341463414634
test f1: 0.7331707317073172


In [9]:
# Random forest with the seed pinned to 0; n_jobs=-1 requests all CPU cores.
fit(
    clf=RandomForestClassifier(n_jobs=-1, random_state=0),
    name='RandomForestClassifier',
)

RandomForestClassifier
train precision: 0.9920745272525028
train recall: 0.9887749445676275
train f1: 0.9904219877845641
test precision: 0.6688668866886689
test recall: 0.6590785907859079
test f1: 0.6639366639366638


In [10]:
# k-nearest-neighbours classifier with its default hyperparameters.
fit(
    clf=KNeighborsClassifier(),
    name='KNeighborsClassifier',
)

KNeighborsClassifier
train precision: 0.8077030033370411
train recall: 0.8050166297117517
train f1: 0.8063575791227097
test precision: 0.6836065573770492
test recall: 0.6780487804878049
test f1: 0.6808163265306123


In [11]:
# Gaussian naive Bayes with its default hyperparameters.
fit(
    clf=GaussianNB(),
    name='GaussianNB',
)

GaussianNB
train precision: 0.715600415615259
train recall: 0.6680986696230599
train f1: 0.6910341861965169
test precision: 0.7080856123662307
test recall: 0.6455284552845528
test f1: 0.6753614970229657
