In [16]:
import nltk
from nltk.corpus import movie_reviews
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score


In [6]:
negids = movie_reviews.fileids('neg')
posids = movie_reviews.fileids('pos')

negfeats = [movie_reviews.words(fileids=[f]) for f in negids]
posfeats = [movie_reviews.words(fileids=[f]) for f in posids]

df = pd.DataFrame({
    'words': negfeats+posfeats,
    'response': len(negfeats)*[0]+len(posfeats)*[1]
})

X = list(map(" ".join, df.words))
y = df.response



In [4]:
pipe = Pipeline(
    [
        ('vec', CountVectorizer()),
        ('clf', LogisticRegression())
    ]
)

scores = cross_val_score(pipe, X, y, cv=5)
print(f"avg={scores.mean()} std={scores.std()}")



avg=0.841 std=0.01677796173556255


In [6]:
pipe = Pipeline(
    [
        ('vec', TfidfVectorizer()),
        ('clf', LogisticRegression())
    ]
)

scores = cross_val_score(pipe, X, y, cv=5)
print(f"avg={scores.mean()} std={scores.std()}")



avg=0.8210000000000001 std=0.004062019202317978


In [7]:
with open('answ01.txt','w',encoding="utf8") as f:
    f.write("0.841 0.01677796173556255 0.8210000000000001 0.004062019202317978")

In [8]:
pipe1 = Pipeline(
    [
        ('vec', CountVectorizer(min_df=10)),
        ('clf', LogisticRegression())
    ]
)

pipe2 = Pipeline(
    [
        ('vec', CountVectorizer(min_df=50)),
        ('clf', LogisticRegression())
    ]
)

scores1 = cross_val_score(pipe1, X, y, cv=5)
scores2 = cross_val_score(pipe2, X, y, cv=5)
print(f"score[min_df=10]={scores1.mean()}\nscore[min_df=50]={scores2.mean()}")



score[min_df=10]=0.8390000000000001
score[min_df=50]=0.813


In [9]:
with open('answ02.txt','w',encoding="utf8") as f:
    f.write("0.8390000000000001 0.813")

In [12]:
pipe1 = Pipeline(
    [
        ('vec', CountVectorizer()),
        ('clf', LogisticRegression())
    ]
)

pipe2 = Pipeline(
    [
        ('vec', CountVectorizer()),
        ('clf', LinearSVC())
    ]
)

pipe3 = Pipeline(
    [
        ('vec', CountVectorizer()),
        ('clf', SGDClassifier())
    ]
)

scores1 = cross_val_score(pipe1, X, y, cv=5)
scores2 = cross_val_score(pipe2, X, y, cv=5)
scores3 = cross_val_score(pipe3, X, y, cv=5)
print(f"lg={scores1.mean()}\nsvc={scores2.mean()}\nsgd={scores3.mean()}")



lg=0.841
svc=0.8325000000000001
sgd=0.7515000000000001


In [14]:
with open('answ03.txt','w',encoding="utf8") as f:
    f.write("0.7515000000000001")

In [20]:
stopwords = nltk.corpus.stopwords.words('english')

In [22]:
pipe1 = Pipeline(
    [
        ('vec', CountVectorizer(stop_words=stopwords)),
        ('clf', LogisticRegression())
    ]
)

pipe2 = Pipeline(
    [
        ('vec', CountVectorizer(stop_words="english")),
        ('clf', LogisticRegression())
    ]
)

scores1 = cross_val_score(pipe1, X, y, cv=5)
scores2 = cross_val_score(pipe2, X, y, cv=5)
print(f"nltk={scores1.mean()} sklearn={scores2.mean()}")



nltk=0.841 sklearn=0.8385


In [23]:
with open('answ04.txt','w',encoding="utf8") as f:
    f.write("0.841 0.8385")

In [34]:
pipe1 = Pipeline(
    [
        ('vec', CountVectorizer(ngram_range=(1,2))),
        ('clf', LogisticRegression())
    ]
)

pipe2 = Pipeline(
    [
        ('vec', CountVectorizer(ngram_range=(3,5), analyzer='char_wb')),
        ('clf', LogisticRegression())
    ]
)

scores1 = cross_val_score(pipe1, X, y, cv=5)
scores2 = cross_val_score(pipe2, X, y, cv=5)
print(f"1-2-gram={scores1.mean()} 3-5-gram={scores2.mean()}")



1-2-gram=0.8525 3-5-gram=0.82


In [35]:
with open('answ05.txt','w',encoding="utf8") as f:
    f.write("0.8525 0.82")