In [2]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import movie_reviews

nltk.download("movie_reviews")

[nltk_data] Downloading package movie_reviews to
[nltk_data]     C:\Users\BobchenkovAV\AppData\Roaming\nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


True

In [5]:
#Q1 Список всех имеющихся отзывов
negids = movie_reviews.fileids('neg')
posids = movie_reviews.fileids('pos')

neg_texts = [" ".join(movie_reviews.words(fileids=[f])) for f in negids]
pos_texts = [" ".join(movie_reviews.words(fileids=[f])) for f in posids]

texts = neg_texts + pos_texts
labels = [0] * len(neg_texts) + [1] * len(pos_texts)

len(texts)

2000

In [7]:
#Q2 доля класса 1 в выборке

len(pos_texts)/len(texts)

0.5

In [8]:
#Q3 Работа с CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
vectorizer.fit_transform(texts)
#39659

<2000x39659 sparse matrix of type '<class 'numpy.int64'>'
	with 666842 stored elements in Compressed Sparse Row format>

In [9]:
#all_words = []
#for w in movie_reviews.words():
#    all_words.append(w.lower())
    
#all_words = nltk.FreqDist(all_words)
#print(all_words.most_common(15))

[(',', 77717), ('the', 76529), ('.', 65876), ('a', 38106), ('and', 35576), ('of', 34123), ('to', 31937), ("'", 30585), ('is', 25195), ('in', 21822), ('s', 18513), ('"', 17612), ('it', 16107), ('that', 15924), ('-', 15595)]


In [11]:
#Q4 Пайплайн CountVectorizer -> LogisticRegression
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score

vectorizer = CountVectorizer()
classifier = LogisticRegression()

pipeline = Pipeline([("vectorizer", vectorizer), ("classifier", classifier)])
cross_val_score(pipeline, texts, labels).mean()




0.8360216503929078

In [13]:
#Q5 оценить по ROC-AUC
cross_val_score(pipeline, texts, labels, scoring="roc_auc").mean()



0.9107764937833774

In [15]:
#Q6 - самые важные слова
pipeline.fit(texts, labels)



{'memory': None,
 'steps': [('vectorizer',
   CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                   dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                   lowercase=True, max_df=1.0, max_features=None, min_df=1,
                   ngram_range=(1, 1), preprocessor=None, stop_words=None,
                   strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                   tokenizer=None, vocabulary=None)),
  ('classifier',
   LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                      intercept_scaling=1, l1_ratio=None, max_iter=100,
                      multi_class='warn', n_jobs=None, penalty='l2',
                      random_state=None, solver='warn', tol=0.0001, verbose=0,
                      warm_start=False))],
 'verbose': False,
 'vectorizer': CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                 dtype=<class 'numpy.int64'>, encoding='ut

In [17]:
#Неделя 2
#Q1
from sklearn.feature_extraction.text import TfidfVectorizer

pipeline_a = Pipeline([("vectorizer", CountVectorizer()), ("classifier", LogisticRegression())])
pipeline_b = Pipeline([("vectorizer", TfidfVectorizer()), ("classifier", LogisticRegression())])

cvs_a = cross_val_score(pipeline_a, texts, labels, cv=5)
cvs_b = cross_val_score(pipeline_b, texts, labels, cv=5)

cvs_a.mean(), cvs_a.std(), cvs_b.mean(), cvs_b.std()



(0.841, 0.01677796173556255, 0.8210000000000001, 0.004062019202317978)

In [19]:
#Q2
pipeline_2a = Pipeline([("vectorizer", CountVectorizer(min_df=10)), ("classifier", LogisticRegression())])
pipeline_2b = Pipeline([("vectorizer", CountVectorizer(min_df=50)), ("classifier", LogisticRegression())])

cross_val_score(pipeline_2a, texts, labels, cv=5).mean(), cross_val_score(pipeline_2b, texts, labels, cv=5).mean()



(0.8390000000000001, 0.813)

In [25]:
#Q3 худшее качество - хз какой random_state выбрать....
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier

pipeline_3a = Pipeline([("vectorizer", CountVectorizer()), ("classifier", LogisticRegression())])
pipeline_3b = Pipeline([("vectorizer", CountVectorizer()), ("classifier", LinearSVC())])
pipeline_3c = Pipeline([("vectorizer", CountVectorizer()), ("classifier", SGDClassifier(random_state=42))])

cross_val_score(pipeline_3a, texts, labels, cv=5).mean(), cross_val_score(pipeline_3b, texts, labels, cv=5).mean(), cross_val_score(pipeline_3c, texts, labels, cv=5).mean()



(0.841, 0.8325000000000001, 0.8350000000000002)

In [27]:
#Q4
nltk.download("stopwords")
nltk_stop_words = nltk.corpus.stopwords.words('english')

pipeline_4a = Pipeline([("vectorizer", CountVectorizer(stop_words=nltk_stop_words)), ("classifier", LogisticRegression())])
pipeline_4b = Pipeline([("vectorizer", CountVectorizer(stop_words="english")), ("classifier", LogisticRegression())])

cross_val_score(pipeline_4a, texts, labels, cv=5).mean(), cross_val_score(pipeline_4b, texts, labels, cv=5).mean()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\BobchenkovAV\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


(0.8414999999999999, 0.8385)

In [28]:
#Q5 n-граммы
pipeline_5a = Pipeline([("vectorizer", CountVectorizer(ngram_range=(1,2))), ("classifier", LogisticRegression())])
pipeline_5b = Pipeline([("vectorizer", CountVectorizer(ngram_range=(3,5), analyzer="char_wb")), ("classifier", LogisticRegression())])

cross_val_score(pipeline_5a, texts, labels, cv=5).mean(), cross_val_score(pipeline_5b, texts, labels, cv=5).mean()



(0.8525, 0.82)