In [None]:
import numpy as np
import pandas as pd
import random
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import os


In [None]:
# Show the entries of the ../input directory that the later read_csv calls load from.
print(os.listdir("../input"))

In [None]:
# Tokens to ignore when counting words and building features: NLTK's English
# stopword list plus punctuation and tokenizer residue ('br' is presumably what
# is left of <br /> tags in the reviews -- confirm against the raw text).
# Kept as a set because it is only ever probed with `in` / `not in`, once per
# token of every review; a list would turn each probe into a linear scan.
stop = set(stopwords.words("english"))
stop.update([',', '.', "'", '"', '-', '(', ')', ':', '?', '/', '>', '<', "''", 'br', '\\', '...'])

In [None]:
# Load the labelled training reviews and the unlabelled test reviews; both files
# are tab-separated.
train, test = (
    pd.read_csv(tsv_path, sep='\t')
    for tsv_path in ('../input/labeledTrainData.tsv', '../input/testData.tsv')
)

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Preview the first rows of the test frame.
test.head()

In [None]:
# Number of rows in the training frame.
len(train)

In [None]:
# Token count of the training review at index label 0 -- a quick look at what
# the tokenizer produces for one review.
len(nltk.word_tokenize(train['review'][0]))

In [None]:
# Pair every training review's token list with its sentiment label, in index
# order. Each entry is a two-element list: [tokens, sentiment].
review_sentiment = [
    [word_tokenize(train['review'][row_label]), train['sentiment'][row_label]]
    for row_label in train.index
]

In [None]:
# Keep only the first 5000 [tokens, sentiment] pairs. This runs before the
# shuffle in the following cell, so the subset is always the first 5000 rows of
# the training frame rather than a random sample of it.
review_sentiment = review_sentiment[:5000]

In [None]:
# Shuffle the retained pairs in place so the later train/validation slices are
# not in file order. A dedicated, seeded generator makes the split (and hence
# the reported accuracy) repeatable across kernel restarts, without touching
# the global `random` state.
random.Random(42).shuffle(review_sentiment)

In [None]:
# Flatten all retained reviews into one list of tokens, skipping anything in
# the stop collection. Duplicates are kept so frequencies can be counted later.
bag_of_words = [
    token
    for tokens, _label in review_sentiment
    for token in tokens
    if token not in stop
]

In [None]:
# Total number of kept tokens (with repeats) across the retained reviews.
len(bag_of_words)

In [None]:
# Frequency distribution over the kept tokens (token -> occurrence count).
word_FD = nltk.FreqDist(bag_of_words)

In [None]:
# The ten most frequent kept tokens with their counts.
word_FD.most_common(10)

In [None]:
# Number of distinct kept tokens, i.e. the size of the full vocabulary.
len(word_FD)

In [None]:
# Vocabulary used as classifier features: the 10000 most frequent kept tokens.
# most_common() orders by descending count; slicing word_FD.keys() instead would
# select tokens by first appearance in the corpus, which can drop frequent words
# that happen to show up late and keep rare ones that show up early.
word_FD_cut = [token for token, _count in word_FD.most_common(10000)]

In [None]:
# Turn each retained review into a [features, sentiment] pair, where features
# maps every vocabulary word to True/False depending on whether the review
# contains it (after dropping stop tokens).
featuresets = []
for tokens, label in review_sentiment:
    present = {token for token in tokens if token not in stop}
    presence_by_word = {vocab_word: (vocab_word in present) for vocab_word in word_FD_cut}
    featuresets.append([presence_by_word, label])

In [None]:
# Number of [features, sentiment] pairs built from the retained reviews.
len(featuresets)

In [None]:
# Hold out everything after the first 4000 feature pairs for validation.
# NOTE: this rebinds `train`, which until now was the DataFrame loaded from the
# labelled TSV; cells above that index train['review'] will fail if re-run
# after this one unless the data is reloaded first.
train, valid = featuresets[:4000], featuresets[4000:]

In [None]:
# Fit NLTK's Naive Bayes classifier on the training feature pairs.
clf = nltk.NaiveBayesClassifier.train(train)

In [None]:
# Accuracy of the fitted classifier on the held-out validation pairs.
nltk.classify.accuracy(clf, valid)

In [None]:
# Print the ten features the classifier ranks as most informative.
clf.show_most_informative_features(10)

In [None]:
# First row of the test frame, as a reminder of its columns before tokenizing.
test.head(1)

In [None]:
# Tokenize every test review, keeping the frame's row order.
test_reviews = [word_tokenize(review_text) for review_text in test['review']]

In [None]:
# Number of tokenized test reviews.
len(test_reviews)

In [None]:
# Build one feature dict per test review, using the same vocabulary and the
# same True/False "word present" encoding as the training features.
test_sets = []
for tokens in test_reviews:
    present = {token for token in tokens if token not in stop}
    test_sets.append({vocab_word: (vocab_word in present) for vocab_word in word_FD_cut})

In [None]:
# Predict a sentiment label for every test feature dict, in order.
labels = clf.classify_many(test_sets)

In [None]:
# Number of predicted labels.
len(labels)

In [None]:
# Assemble the submission table (test ids alongside the predicted labels) and
# write it to submission.csv without the index column.
submission = test[['id']].assign(sentiment=labels)
submission.to_csv('submission.csv', index=False)