### Read Data 

In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import pandas as pd

# Both input files are tab-separated. The labeled file provides the
# `sentiment` column used for training; the test file is read the same way.
train = pd.read_csv('../input/labeledTrainData.tsv', sep="\t")
test = pd.read_csv('../input/testData.tsv', sep="\t")

# Peek at the first few labeled reviews.
train.head()

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...


In [2]:
# Report (rows, columns) for each split, one per line.
# The test data does not have results (no `sentiment` column).
print(train.shape, test.shape, sep="\n")

(25000, 3)
(25000, 2)


### Clean Data 

In [3]:
import re  

def review_to_wordlist(review):
    """Normalize one raw review into lowercase, letters-only text.

    Despite the name, the result is a single string (not a list of words):
    every character that is not an English letter is replaced by a space and
    the remainder is lowercased. HTML tags are removed beforehand so that
    markup such as ``<br />`` does not leak into the text as a fake word.

    Parameters
    ----------
    review : str
        Raw review text, possibly containing HTML tags.

    Returns
    -------
    str
        The cleaned, lowercased text. Runs of spaces are not collapsed.
    """
    # Drop HTML tags first; otherwise "<br />" would survive the letters-only
    # filter below as the bogus word "br". Only "<" immediately followed by a
    # letter (optionally after "/") is treated as a tag, so text like
    # "a < b" or "<3" is not swallowed by this step.
    without_tags = re.sub(r"</?[a-zA-Z][^<>]*>", " ", review)

    # Keep English letters only.
    letters_only = re.sub("[^a-zA-Z]", " ", without_tags)

    # Turn words into lowercase.
    return letters_only.lower()

# Separate labels and text

# Labels
y_train = train['sentiment']

# Text: clean every training review.
# dtype=object keeps each cleaned review as an ordinary Python string.
# Without it NumPy picks a fixed-width unicode dtype and pads every entry to
# the length of the longest review, which wastes a lot of memory.
train_data = np.array(
    [review_to_wordlist(review) for review in train['review']],
    dtype=object,
)

# Same for testing data
test_data = np.array(
    [review_to_wordlist(review) for review in test['review']],
    dtype=object,
)

# One cleaned string per review in each split.
print(train_data.shape)
print(test_data.shape)

(25000,)
(25000,)


### Vectorize Data 
* [ngram_range](https://www.kaggle.com/c/avito-demand-prediction/discussion/58819)
* [fit_transform vs transform](https://datascience.stackexchange.com/questions/12321/difference-between-fit-and-fit-transform-in-scikit-learn-models)

In [4]:
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.feature_extraction.text import TfidfVectorizer

# Alternative (disabled): raw term counts instead of tf-idf weights
# vectorizer = CountVectorizer()
# data_train_count = vectorizer.fit_transform(train_data)
# data_test_count  = vectorizer.transform(test_data)

# tf-idf
# use_idf / smooth_idf are boolean flags. Pass real booleans: recent
# scikit-learn releases validate parameter types and reject ints here.
tfidf = TfidfVectorizer(
           ngram_range=(1, 3),     # unigrams, bigrams and trigrams
           use_idf=True,
           smooth_idf=True,
           stop_words='english')   # Remove stop words

# Learn the vocabulary and idf weights from the training reviews only, then
# apply that same fitted vectorizer to the test reviews.
# NOTE: despite the "_count" suffix these matrices hold tf-idf weights; the
# names are kept because the following cells refer to them.
data_train_count = tfidf.fit_transform(train_data)
data_test_count  = tfidf.transform(test_data)

print("Let's go!")

Let's go!


### Multinomial Naive Bayes

In [5]:
from sklearn.naive_bayes import MultinomialNB 

# Fit a multinomial Naive Bayes classifier on the training features
# (fit() returns the estimator itself), then predict a label for every
# test review and show the predictions.
clf = MultinomialNB().fit(data_train_count, y_train)
pred = clf.predict(data_test_count)
print(pred)

[1 0 1 ... 1 1 0]


### Save Results

In [6]:
# Pair each test review id with its predicted label and write the result
# as a CSV file with a header row and without the DataFrame index.
df = pd.DataFrame({"id": test['id'], "sentiment": pred})
df.to_csv('submission.csv', index=False, header=True)

### Credit

Data Credit: [Kaggle](https://www.kaggle.com/c/word2vec-nlp-tutorial/)