In [58]:
from sklearn.datasets import load_files
from sklearn.feature_extraction.text import CountVectorizer, ENGLISH_STOP_WORDS, \
                                            TfidfVectorizer
from sklearn.model_selection import cross_val_score, GridSearchCV, StratifiedShuffleSplit
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import LatentDirichletAllocation

import numpy as np
import mglearn

import spacy
import nltk

import re

import matplotlib.pyplot as plt

## Новый раздел

In [6]:
reviews_train = load_files('train/')

text_train, y_train = reviews_train.data, reviews_train.target

print(type(text_train))
print(len(text_train))
print(text_train[1])

<class 'list'>
75000
b"Amount of disappointment I am getting these days seeing movies like Partner, Jhoom Barabar and now, Heyy Babyy is gonna end my habit of seeing first day shows.<br /><br />The movie is an utter disappointment because it had the potential to become a laugh riot only if the d\xc3\xa9butant director, Sajid Khan hadn't tried too many things. Only saving grace in the movie were the last thirty minutes, which were seriously funny elsewhere the movie fails miserably. First half was desperately been tried to look funny but wasn't. Next 45 minutes were emotional and looked totally artificial and illogical.<br /><br />OK, when you are out for a movie like this you don't expect much logic but all the flaws tend to appear when you don't enjoy the movie and thats the case with Heyy Babyy. Acting is good but thats not enough to keep one interested.<br /><br />For the positives, you can take hot actresses, last 30 minutes, some comic scenes, good acting by the lead cast and the 

In [10]:
text_train = [doc.replace(b'<br />', b' ') for doc in text_train]

np.bincount(y_train)

array([12500, 12500, 50000])

In [11]:
reviews_test = load_files('test/')

text_test, y_test = reviews_test.data, reviews_test.target

print(len(text_test))
print(np.bincount(y_test))

text_test = [doc.replace(b'<br />', b' ') for doc in text_test]

25000
[12500 12500]


## Новый раздел

In [13]:
bards_words = [
    'The fool doth think he is wise',
    'but the wise man knows himself to be a fool'
]

In [15]:
vect = CountVectorizer()
vect.fit(bards_words)

print(len(vect.vocabulary_))
print(vect.vocabulary)

13
None


In [16]:
bag_of_words = vect.transform(bards_words)
repr(bag_of_words)

"<2x13 sparse matrix of type '<class 'numpy.int64'>'\n\twith 16 stored elements in Compressed Sparse Row format>"

In [17]:
bag_of_words.toarray()

array([[0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1],
       [1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1]])

## Новый раздел

In [18]:
vect = CountVectorizer()
vect.fit(text_train)

X_train = vect.transform(text_train)

repr(X_train)

"<75000x124255 sparse matrix of type '<class 'numpy.int64'>'\n\twith 10315542 stored elements in Compressed Sparse Row format>"

In [20]:
feature_names = vect.get_feature_names()

print(len(feature_names))
print(feature_names[:5])
print(feature_names[20010:20015])
print(feature_names)

124255
['00', '000', '0000', '0000000000000000000000000000000001', '0000000000001']
['cheapen', 'cheapened', 'cheapening', 'cheapens', 'cheaper']


In [27]:
scores = cross_val_score(LogisticRegression(), X_train, y_train, cv=5)

np.mean(scores)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

0.6729999999999999

In [30]:
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10]
}

grid = GridSearchCV(LogisticRegression(max_iter=100000), param_grid, cv=5)
grid.fit(X_train, y_train)

print(grid.best_score_)
print(grid.best_params_)

In [31]:
vect = CountVectorizer(min_df=5)
vect.fit(text_train)

repr(X_train)

In [32]:
feature_names = vect.get_feature_names()

print(feature_names[:50])
print(feature_names[20010:20030])
print(feature_names[::700])

In [33]:
grid = GridSearchCV(LogisticRegression(), param_grid, cv=5)
grid.fit(X_train, y_train)

print(grid.best_score_)

## Стоп-слова

In [None]:
print(len(ENGLISH_STOP_WORDS))
print(list(ENGLISH_STOP_WORDS)[::10])

In [None]:
vect = CountVectorizer(min_df=5, stop_words='english')
vect.fit(text_train)

print(repr(X_train))

In [None]:
grid = GridSearchCV(LogisticRegression(), param_grid, cv=5)
grid.fit(X_train, y_train)

grid.best_score_

## Новый пункт

In [36]:
pipe = make_pipeline(TfidfVectorizer(min_df=5, norm=None), LogisticRegression())

param_grid = {
    'logisticregression__C': [0.001, 0.01, 0.1, 1, 10]
}

grid = GridSearchCV(pipe, param_grid, cv=5)
grid.fit(text_train, y_train)

grid.best_score_

In [38]:
vectorizer = grid.best_estimator_.named_steps['tfidfvectorizer']
X_train = vectorizer.transform(text_train)

max_value = X_train.max(axis=0).toarray().ravel()
sorted_by_tfidf = max_value.argsort()

feature_names = np.array(vectorizer.get_feature_names())

print(feature_names[sorted_by_tfidf[:20]])
print(feature_names[sorted_by_tfidf[-20:]])

In [None]:
sorted_by_idf = np.argsort(vectorizer.idf_)

print(feature_names[sorted_by_idf[:100]])

## Новый пункт


In [41]:
mglearn.tools.visualize_coefficients(
    grid.best_estimator_.name_steps['logisticregression'].coef_,
    feature_names,
    n_top_features=40
)

## Новый пункт

In [42]:
print(bards_words)

['The fool doth think he is wise', 'but the wise man knows himself to be a fool']


In [44]:
cv = CountVectorizer(ngram_range=(1, 1))
cv.fit(bards_words)

print(len(cv.vocabulary_))
print(cv.get_feature_names())

13
['be', 'but', 'doth', 'fool', 'he', 'himself', 'is', 'knows', 'man', 'the', 'think', 'to', 'wise']


In [45]:
cv = CountVectorizer(ngram_range=(2, 2))
cv.fit(bards_words)

print(len(cv.vocabulary_))
print(cv.get_feature_names())

14
['be fool', 'but the', 'doth think', 'fool doth', 'he is', 'himself to', 'is wise', 'knows himself', 'man knows', 'the fool', 'the wise', 'think he', 'to be', 'wise man']


In [46]:
print(cv.transform(bards_words).toarray())

[[0 0 1 1 1 0 1 0 0 1 0 1 0 0]
 [1 1 0 0 0 1 0 1 1 0 1 0 1 1]]


In [47]:
cv = CountVectorizer(ngram_range=(1, 3))
cv.fit(bards_words)

print(len(cv.vocabulary_))
print(cv.get_feature_names())

39
['be', 'be fool', 'but', 'but the', 'but the wise', 'doth', 'doth think', 'doth think he', 'fool', 'fool doth', 'fool doth think', 'he', 'he is', 'he is wise', 'himself', 'himself to', 'himself to be', 'is', 'is wise', 'knows', 'knows himself', 'knows himself to', 'man', 'man knows', 'man knows himself', 'the', 'the fool', 'the fool doth', 'the wise', 'the wise man', 'think', 'think he', 'think he is', 'to', 'to be', 'to be fool', 'wise', 'wise man', 'wise man knows']


In [None]:
pipe = make_pipeline(TfidfVectorizer(min_df=5), LogisticRegression())

param_grid = {
    'logisticregression__C': [0.001, 0.01, 0.1, 1, 10, 100],
    'tfidfvectorizer__ngram_range': [(1, 1), (1, 2), (1, 3)]
}

grid = GridSearchCV(pipe, param_grid, cv=5)
grid.fit(text_train, y_train)

print(grid.best_score_)
print(grid.best_params_)

In [50]:
scores = grid.cv_results_['mean_test_score'].reshape(-1, 3).T

heatmap = mglearn.tools.heatmap(
    scores,
    xlabel='C',
    ylabel='ngram_range',
    cmap='viridis',
    fmt='%.3f',
    xticklabels=param_grid['logisticregression__C'],
    yticklabels=param_grid['tfidfvectorizer__ngram_range']
)

plt.colorbar(heatmap)

In [None]:
vect = grid.best_estimator_.named_steps['tfidfvectorizer']
feature_names = np.array(vect.get_feature_names())
coef = grid.best_estimator_.named_steps['logisticregression'].coef_

mglearn.tools.visualize_coefficients(coef, feature_names, n_top_features=40)

In [None]:
mask = np.array([len(feature.split(' ')) for feature in feature_names]) == 3
mglearn.tools.visualize_coefficients(
    coef.ravel()[mask],
    feature_names[mask], n_top_features=40
)

## Новый пункт

In [53]:
en_nlp = spacy.load('en')
stemmer = nltk.stem.PorterStemmer()

def compare_normalization(doc):
    doc_spacy = en_nlp(doc)
    print([token.lemma_ for token in doc_spacy])
    print([stemmer.stem(token.norm_.lower()) for token in doc_spacy])

In [55]:
regexp = re.compile('(?u)\\b\\w\\w+\\b')

en_nlp = spacy.load('en')
old_tokenizer = en_nlp.tokenizer

en_nlp.tokenizer = lambda string: old_tokenizer.tokens_from_list(regexp.findall(string))

def custom_tokenizer(document):
    doc_spacy = en_nlp(document, entity=False, parse=False)
    return [token.lemma_ for token in doc_spacy]

lemma_vect = CountVectorizer(tokenizer=custom_tokenizer, min_df=5)

In [None]:
X_train_lemma = lemma_vect.fit_transform(text_train)

vect = CountVectorizer(min_df=5)
vect.fit(text_train)

X_train = vect.transform(text_train)

In [None]:
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10]
}

cv = StratifiedShuffleSplit(
    n_iter=5,
    test_size=0.99,
    train_size=0.01,
    random_state=0
)

grid = GridSearchCV(LogisticRegression(), param_grid, cv=cv)
grid.fit(X_train, y_train)

print(grid.best_score_)

grid.fit(X_train_lemma, y_train)

print(grid.best_score_)

## Новый


In [None]:
vect = CountVectorizer(max_features=10000, max_df=.15)
X = vect.fit_transform(text_train)

In [None]:
lda = LatentDirichletAllocation(
    n_topics=10,
    learning_method='batch',
    max_iter=25,
    random_state=0
)

document_topics = lda.fit_transform(X)

In [None]:
lda.components_.shape

In [None]:
sorting = np.argsort(lda.components_, axis=1)[:, ::-1]

feature_names = np.array(vect.get_feature_names())

mglearn.tools.print_topics(
    topics=range(10),
    feature_names=feature_names,
    sorting=sorting,
    topics_per_chunk=5,
    n_words=10
)

In [None]:
lda100 = LatentDirichletAllocation(
    n_topics=100,
    learning_method='batch',
    max_iter=25,
    random_state=0
)

document_topics100 = lda100.fit_transform(X)
topics = np.array([7, 16, 24, 25, 28, 36, 37, 45, 51, 53, 54, 63, 89, 97])

sorting = np.argsort(lda100.components_, axis=1)[:, ::-1]
feature_names = np.array(vect.get_feature_names())

mglearn.tools.print_topics(
    topics=topics,
    feature_names=feature_names,
    sorting=sorting,
    topics_per_chunk=7,
    n_words=20
)

In [None]:
music = np.argsort(document_topics100[:, 45])[::-1]

for i in music[:10]:
    print(b'.'.join(text_train[i].split(b'.')[:2]) + b".\n")

In [None]:
fig, ax = plt.subplots(1, 2, figsize=(10, 10))

topic_names = [
    f'{i}' + ' '.join(words)
    for i, words in enumerate(feature_names[sorting[:, :2]])
]

# две столбиковые диаграммы:
for col in [0, 1]:
    start = col * 50
    end = (col + 1) * 50
    
    ax[col].barh(np.arange(50), np.sum(document_topics100, axis=0)[start:end])
    ax[col].set_yticks(np.arange(50))
    ax[col].set_yticklabels(topic_names[start:end], ha='left', va='top')
    ax[col].invert_yaxis()
    ax[col].set_xlim(0, 2000)

    yax = ax[col].get_yaxis()
    yax.set_tick_params(pad=130)

plt.tight_layout()