In [97]:
import nltk
nltk.download('movie_reviews')
nltk.download('stopwords')

[nltk_data] Downloading package movie_reviews to
[nltk_data]     C:\Users\Батыр\AppData\Roaming\nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Батыр\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [98]:
import numpy as np
from nltk.corpus import movie_reviews
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score

ans = []

In [99]:
negids = movie_reviews.fileids('neg')
posids = movie_reviews.fileids('pos')


In [100]:
negfeats = [' '.join(movie_reviews.words(fileids=[f])) for f in negids]
posfeats = [' '.join(movie_reviews.words(fileids=[f])) for f in posids]


In [101]:
data = negfeats + posfeats
labels = np.array([0] * len(negfeats) + [1] * len(posfeats))

In [102]:
pipe_count = Pipeline([('vectorizer', CountVectorizer()), ('lr', LogisticRegression(max_iter=1000))])
pipe_tfidf = Pipeline([('vectorizer', TfidfVectorizer()),
                ('lr', LogisticRegression(max_iter=1000))])

cross_score_count = cross_val_score(
    pipe_count, data, labels, scoring='accuracy', n_jobs=-1, cv=5)
cross_score_tfidf = cross_val_score(
    pipe_tfidf, data, labels, scoring='accuracy', n_jobs=-1, cv=5)

ans.append(' '.join(tuple(map(str, (cross_score_count.mean(), cross_score_count.std(),
           cross_score_tfidf.mean(), cross_score_tfidf.std())))))


In [103]:
pipe_count_10 = Pipeline([('vectorizer', CountVectorizer(min_df=10)),
                      ('lr', LogisticRegression(max_iter=1000))])
pipe_count_50 = Pipeline([('vectorizer', CountVectorizer(min_df=50)),
                          ('lr', LogisticRegression(max_iter=1000))])

cross_score_count_10 = cross_val_score(
    pipe_count_10, data, labels, scoring='accuracy', n_jobs=-1, cv=5)
cross_score_count_50 = cross_val_score(
    pipe_count_50, data, labels, scoring='accuracy', n_jobs=-1, cv=5)

ans.append(' '.join(tuple(map(str, (cross_score_count_10.mean(),
           cross_score_count_50.mean())))))

In [104]:
pipe_count_lr = Pipeline([('vectorizer', CountVectorizer(min_df=10)),
                          ('lr', LogisticRegression(max_iter=1000))])
pipe_count_svc = Pipeline([('vectorizer', CountVectorizer(min_df=50)),
                          ('svc', LinearSVC(max_iter=1000))])
pipe_count_sgd = Pipeline([('vectorizer', CountVectorizer(min_df=50)),
                          ('sgd', SGDClassifier(max_iter=1000))])

cross_score_count_lr = cross_val_score(
    pipe_count_lr, data, labels, scoring='accuracy', n_jobs=-1, cv=5)
cross_score_count_svc = cross_val_score(
    pipe_count_svc, data, labels, scoring='accuracy', n_jobs=-1, cv=5)
cross_score_count_sgd = cross_val_score(
    pipe_count_sgd, data, labels, scoring='accuracy', n_jobs=-1, cv=5)

ans.append(min(cross_score_count_lr.mean(),
           cross_score_count_svc.mean(), cross_score_count_sgd.mean()))

In [105]:
stop_words = nltk.corpus.stopwords.words('english')
print(stop_words[:10] + stop_words[-10:-1])


['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn']


In [106]:
pipe_count_nltk = Pipeline([('vectorizer', CountVectorizer(stop_words=stop_words, min_df=10)),
                          ('lr', LogisticRegression(max_iter=1000))])
pipe_count_sklearn = Pipeline([('vectorizer', CountVectorizer(stop_words='english', min_df=10)),
                            ('lr', LogisticRegression(max_iter=1000))])

cross_score_nltk = cross_val_score(
    pipe_count_nltk, data, labels, scoring='accuracy', n_jobs=-1, cv=5)
cross_score_sklearn = cross_val_score(
    pipe_count_sklearn, data, labels, scoring='accuracy', n_jobs=-1, cv=5)

ans.append(' '.join(tuple(map(str, (cross_score_nltk.mean(), cross_score_sklearn.mean())))))

In [107]:
pipe_count_bigrams = Pipeline([('vectorizer', CountVectorizer(ngram_range=(1, 2), min_df=10)),
                            ('lr', LogisticRegression(max_iter=1000))])
pipe_count_chgrams = Pipeline([('vectorizer', CountVectorizer(ngram_range=(3, 5), min_df=10, analyzer='char_wb')),
                               ('lr', LogisticRegression(max_iter=1000))])

cross_score_bigrams = cross_val_score(
    pipe_count_bigrams, data, labels, scoring='accuracy', n_jobs=-1, cv=5)
cross_score_chgrams = cross_val_score(
    pipe_count_chgrams, data, labels, scoring='accuracy', n_jobs=-1, cv=5)

ans.append(' '.join(
    tuple(map(str, (cross_score_bigrams.mean(), cross_score_chgrams.mean())))))


In [109]:
for i in range(5):
    with open('ans' + str(i) + '.txt', 'w') as f:
        f.write(str(ans[i]))