In [34]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
import pandas as pd

In [2]:
# Load the full 20 Newsgroups corpus (subset='all') to inspect it as a whole.
news_data = fetch_20newsgroups(
    subset='all',
    random_state=156,
)

In [11]:
# Number of documents per target class id, listed in class-id order.
target_labels = pd.Series(news_data.target)
target_labels.value_counts().sort_index()

0     799
1     973
2     985
3     982
4     963
5     988
6     975
7     990
8     996
9     994
10    999
11    991
12    984
13    990
14    987
15    997
16    910
17    940
18    775
19    628
dtype: int64

In [16]:
# Both subsets are fetched with identical options: headers, footers and
# quoted replies are removed from every post, and the same seed is used.
fetch_options = dict(remove=('headers', 'footers', 'quotes'), random_state=156)

train_news = fetch_20newsgroups(subset='train', **fetch_options)
test_news = fetch_20newsgroups(subset='test', **fetch_options)

# X_* are the raw document texts, Y_* the integer class ids.
X_train, Y_train = train_news.data, train_news.target
X_test, Y_test = test_news.data, test_news.target

print(len(X_train), len(X_test))

11314 7532


In [22]:
# Fit the vocabulary on the training documents only; the test documents are
# then encoded with that same fitted vocabulary.
cnt_vect = CountVectorizer()
X_train_cnt_vect, X_test_cnt_vect = (
    cnt_vect.fit_transform(X_train),
    cnt_vect.transform(X_test),
)

In [23]:
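# Kept disabled: .toarray() densifies the entire training matrix just to view one row.
# X_train_cnt_vect.toarray()[0].tolist()
# Cheaper way to look at the first document only: X_train_cnt_vect[0].toarray()[0].tolist()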

In [24]:
# (documents, vocabulary size) for the train and test count matrices.
(X_train_cnt_vect.shape, X_test_cnt_vect.shape)

((11314, 101631), (7532, 101631))

In [26]:
# Logistic regression trained on the raw term-count features.
lr_clf = LogisticRegression(solver='liblinear')
lr_clf.fit(X_train_cnt_vect, Y_train)

pred = lr_clf.predict(X_test_cnt_vect)
count_accuracy = accuracy_score(Y_test, pred)
print(count_accuracy)

0.6167020711630377


In [28]:
# Same classifier as before, now on TF-IDF features from a vectorizer with
# default settings. Vocabulary and IDF weights come from the training set only.
tfidf_vect = TfidfVectorizer()
X_train_tfidf_vect, X_test_tfidf_vect = (
    tfidf_vect.fit_transform(X_train),
    tfidf_vect.transform(X_test),
)

lr_clf = LogisticRegression(solver='liblinear')
lr_clf.fit(X_train_tfidf_vect, Y_train)

pred = lr_clf.predict(X_test_tfidf_vect)
print(accuracy_score(y_true=Y_test, y_pred=pred))

0.6775092936802974


In [29]:
# TF-IDF with English stop words removed, unigrams + bigrams, and an integer
# max_df (an absolute document count: terms in more than 300 documents are dropped).
tfidf_vect = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 2),
    max_df=300,
)
X_train_tfidf_vect, X_test_tfidf_vect = (
    tfidf_vect.fit_transform(X_train),
    tfidf_vect.transform(X_test),
)

lr_clf = LogisticRegression(solver='liblinear')
lr_clf.fit(X_train_tfidf_vect, Y_train)

pred = lr_clf.predict(X_test_tfidf_vect)
print(accuracy_score(y_true=Y_test, y_pred=pred))

0.6901221455124801


In [31]:
# 3-fold grid search over the logistic-regression C parameter.
# NOTE(review): X_train_tfidf_vect was produced by a vectorizer fitted on the
# whole training set, so each validation fold already influenced the
# vocabulary and IDF weights used here.
params = {'C': [0.01, 0.1, 1, 5, 10]}
grid_cv_lr = GridSearchCV(
    estimator=lr_clf,
    param_grid=params,
    cv=3,
    scoring='accuracy',
    verbose=1,
)
grid_cv_lr.fit(X_train_tfidf_vect, Y_train)
grid_cv_lr.best_params_

Fitting 3 folds for each of 5 candidates, totalling 15 fits


{'C': 10}

In [32]:
# Score the grid-search result on the held-out test features.
pred = grid_cv_lr.predict(X_test_tfidf_vect)
accuracy_score(y_true=Y_test, y_pred=pred)

0.7039298990971854

In [36]:
# Bundle vectorisation and classification into one estimator so raw text can
# be passed to fit/predict directly. Settings mirror the earlier tuned cells
# (same TF-IDF options, C=10).
pipeline = Pipeline(steps=[
    (
        'tfidf_vect',
        TfidfVectorizer(stop_words='english', ngram_range=(1, 2), max_df=300),
    ),
    (
        'lr_clf',
        LogisticRegression(solver='liblinear', C=10),
    ),
])
pipeline.fit(X_train, Y_train)

pred = pipeline.predict(X_test)
accuracy_score(y_true=Y_test, y_pred=pred)

0.7039298990971854

In [39]:
# Grid-search the vectorizer settings through the pipeline, so the TF-IDF step
# is re-fitted inside every cross-validation split.
pipeline = Pipeline(steps=[
    ('tfidf_vect', TfidfVectorizer(stop_words='english')),
    ('lr_clf', LogisticRegression(solver='liblinear', C=10)),
])

# Keys use the '<step name>__<parameter>' form to address a pipeline step.
# C is not searched here and stays at the value set on the classifier above;
# to tune it as well, add: 'lr_clf__C': [1, 5, 10]
params = {
    'tfidf_vect__ngram_range': [(1, 1), (1, 2)],
    'tfidf_vect__max_df': [300, 700],
}
grid_cv_pipe = GridSearchCV(
    estimator=pipeline,
    param_grid=params,
    cv=3,
    scoring='accuracy',
    verbose=1,
)
grid_cv_pipe.fit(X_train, Y_train)

pred = grid_cv_pipe.predict(X_test)
accuracy_score(y_true=Y_test, y_pred=pred)

Fitting 3 folds for each of 4 candidates, totalling 12 fits


0.7019383961763144

In [40]:
# Winning parameter combination and its cross-validated accuracy
# (a score on the training folds, not on the test set).
(grid_cv_pipe.best_params_, grid_cv_pipe.best_score_)

({'tfidf_vect__max_df': 700, 'tfidf_vect__ngram_range': (1, 2)},
 0.7550828826229531)