**Bag of Words**

In [None]:
import pandas as pd
# Load the labeled training reviews from a tab-separated file whose first row is
# the header. quoting=3 is the numeric value of csv.QUOTE_NONE, so quote
# characters in the file are kept as ordinary text.
train = pd.read_csv("../input/labeledTrainData.tsv", header=0, delimiter="\t", quoting=3)

In [None]:
# Show the (rows, columns) dimensions of the training frame.
train.shape

In [None]:
# Show the column names of the training frame as an array.
train.columns.values

In [None]:
# Print the raw text of the review stored under index label 0, before any cleaning.
print(train["review"][0])

In [None]:
from bs4 import BeautifulSoup

In [None]:
# Parse the review under index label 0 as HTML. The parser is named explicitly so
# the result does not depend on which optional parsers (lxml, html5lib) happen to
# be installed, and so bs4 does not warn about having to guess one.
example1 = BeautifulSoup(train["review"][0], "html.parser")

In [None]:
# Print the raw review once more, for comparison with its tag-stripped text.
print(train["review"][0])

In [None]:
# Print the text content of the parsed review, i.e. with the markup removed.
print(example1.get_text())

In [None]:
import re
# Replace every character that is not an ASCII letter (a-z, A-Z) with a space,
# which removes digits and punctuation from the tag-stripped text.
letters_only = re.sub("[^a-zA-Z]", " ", example1.get_text() )
print(letters_only)

In [None]:
# Lower-case the text, then split on runs of whitespace to get a list of words.
lower_case = letters_only.lower()
words = lower_case.split()

In [None]:
import nltk

In [None]:
from nltk.corpus import stopwords
# Print NLTK's English stop-word list.
# NOTE(review): presumably requires the NLTK "stopwords" corpus to be available
# locally — confirm for the target environment.
print(stopwords.words("english"))

In [None]:
# Build the stop-word set once. The original re-read the whole stop-word list for
# every token and searched it linearly; a set makes each membership test a hash
# lookup and the list is loaded a single time.
english_stops = set(stopwords.words("english"))
words = [w for w in words if w not in english_stops]
print(words)

In [None]:
def review_to_words(raw_review):
    """Convert one raw movie review into a cleaned, space-joined string of words.

    The review is stripped of HTML markup, every non-letter character is replaced
    with a space, the text is lower-cased and split on whitespace, and English
    stop words are dropped.

    Args:
        raw_review: The review text, possibly containing HTML markup.

    Returns:
        The remaining lower-case words joined by single spaces (an empty string
        when nothing is left).
    """
    # Name the parser explicitly so the extracted text does not depend on which
    # optional parsers are installed and bs4 does not warn about guessing one.
    review_text = BeautifulSoup(raw_review, "html.parser").get_text()

    # Keep ASCII letters only; everything else becomes a word separator.
    letters_only = re.sub("[^a-zA-Z]", " ", review_text)

    words = letters_only.lower().split()

    # A set makes the per-word membership test a hash lookup.
    stops = set(stopwords.words("english"))

    meaningful_words = [w for w in words if w not in stops]

    return " ".join(meaningful_words)

In [None]:
# Try the cleaning function on the review under index label 0 and show the result.
clean_review = review_to_words(train["review"][0])
print(clean_review)

In [None]:
# Number of training reviews; kept as a name because later cells read it.
num_reviews = train["review"].size

# Clean every training review. Iterating the column directly visits the rows in
# order whatever the index labels are, whereas train["review"][i] is a label
# lookup that only works while the index is exactly 0..n-1.
clean_train_reviews = [review_to_words(raw_review) for raw_review in train["review"]]

In [None]:
print("Cleaning and parsing the training set movie reviews...\n")
clean_train_reviews = []
for i in range(num_reviews):
    review_number = i + 1
    # Report progress once every 1000 reviews.
    if review_number % 1000 == 0:
        print("Review %d of %d \n" % (review_number, num_reviews))
    cleaned = review_to_words(train["review"][i])
    clean_train_reviews.append(cleaned)

In [None]:
print("Creating the bag of words...\n")
from sklearn.feature_extraction.text import CountVectorizer

# Word-level count vectorizer capped at 5000 features. No tokenizer,
# preprocessor or stop-word list is supplied here.
vectorizer = CountVectorizer(
    analyzer="word",
    tokenizer=None,
    preprocessor=None,
    stop_words=None,
    max_features=5000,
)

# Fit the vocabulary and transform the cleaned reviews in one call, then convert
# the result to a dense array.
train_data_features = vectorizer.fit_transform(clean_train_reviews).toarray()

In [None]:
# Print the dimensions of the training feature array.
print(train_data_features.shape)

In [None]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer its replacement when the installed release has it, and fall back to the
# old method on older releases.
if hasattr(vectorizer, "get_feature_names_out"):
    # The replacement returns a NumPy array; convert so vocab stays a list of str.
    vocab = vectorizer.get_feature_names_out().tolist()
else:
    vocab = vectorizer.get_feature_names()
print(vocab)

In [None]:
import numpy as np

# Sum the feature array down axis 0, giving one total per vocabulary entry.
dist = np.sum(train_data_features, axis=0)

# Print each vocabulary entry next to its total, count first.
for count, tag in zip(dist, vocab):
    print(count, tag)

In [None]:
print("Training the random forest...")
from sklearn.ensemble import RandomForestClassifier

# 100 trees. The seed is fixed so that re-running the notebook grows the same
# forest and yields the same predictions; without it every run differs.
forest = RandomForestClassifier(n_estimators=100, random_state=0)

# Fit on the bag-of-words features with the "sentiment" column as the target.
forest = forest.fit(train_data_features, train["sentiment"])

In [None]:
# Read the test reviews with the same options used for the training file.
test = pd.read_csv("../input/testData.tsv", header=0, delimiter='\t', quoting=3)

print(test.shape)

# Clean every test review with the same function used for the training data.
num_reviews = len(test["review"])
clean_test_reviews = []

print("Cleaning and parsing the test set movie reviews...\n")
for i in range(num_reviews):
    # Report progress once every 1000 reviews.
    if (i + 1) % 1000 == 0:
        print("Review %d of %d\n" % (i + 1, num_reviews))
    clean_review = review_to_words(test["review"][i])
    clean_test_reviews.append(clean_review)

# transform() only: the vectorizer fitted on the training reviews is reused, so
# the test features are produced from that same fitted vocabulary.
test_data_features = vectorizer.transform(clean_test_reviews).toarray()

# Predict with the trained forest and write the id/sentiment pairs to CSV
# without an index column.
result = forest.predict(test_data_features)
output = pd.DataFrame(data={"id": test["id"], "sentiment": result})
output.to_csv("Bag_of_Words_model.csv", index=False, quoting=3)

**Introducing Distributed Word Vectors**

In [None]:
import pandas as pd

# Read the labeled training set, the test set and the unlabeled training set,
# all tab-separated with a header row and quoting=3.
train = pd.read_csv("../input/labeledTrainData.tsv", header=0, delimiter="\t", quoting=3)
test = pd.read_csv("../input/testData.tsv", header=0, delimiter="\t", quoting=3)
unlabeled_train = pd.read_csv("../input/unlabeledTrainData.tsv", header=0, delimiter="\t", quoting=3)

# Report how many reviews each file contributed.
review_counts = (
    train["review"].size,
    test["review"].size,
    unlabeled_train["review"].size,
)
print("Read %d labeled train reviews, %d labeled test reviews and %d unlabeled reviews\n" % review_counts)

In [None]:
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords

def review_to_wordlist(review, remove_stopwords=False):
    """Split a piece of review text into a list of lower-case words.

    HTML markup is stripped, every non-letter character is replaced with a space,
    and the text is lower-cased and split on whitespace.

    Args:
        review: The text to convert, possibly containing HTML markup.
        remove_stopwords: When true, English stop words are dropped from the
            result. Defaults to False, which keeps every word.

    Returns:
        A list of lower-case words, in their original order.
    """
    # Name the parser explicitly so the extracted text does not depend on which
    # optional parsers are installed and bs4 does not warn about guessing one.
    review_text = BeautifulSoup(review, "html.parser").get_text()
    review_text = re.sub("[^a-zA-Z]", " ", review_text)
    words = review_text.lower().split()
    if remove_stopwords:
        # A set makes the per-word membership test a hash lookup.
        stops = set(stopwords.words("english"))
        words = [w for w in words if w not in stops]
    return words

In [None]:
import nltk.data

# Load the English Punkt tokenizer resource from NLTK's data files.
# NOTE(review): this path points at a pickled resource that presumably has to be
# downloaded beforehand; confirm the installed NLTK release still provides it.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

def review_to_sentences(review, tokenizer, remove_stopwords=False):
    """Split a review into sentences, each returned as a list of words.

    Args:
        review: The raw review text; surrounding whitespace is stripped before
            it is handed to the tokenizer.
        tokenizer: Object whose ``tokenize(text)`` method returns the pieces
            the text is split into.
        remove_stopwords: Forwarded unchanged to ``review_to_wordlist``.

    Returns:
        A list with one word list per non-empty piece, in tokenizer order.
    """
    pieces = tokenizer.tokenize(review.strip())
    # Empty pieces are skipped; every other piece becomes one word list.
    return [
        review_to_wordlist(piece, remove_stopwords)
        for piece in pieces
        if len(piece) > 0
    ]

In [None]:
# Collect the word lists of every sentence from both review sets into one flat list.
sentences = []

print("Parsing sentences from training set")
for review in train["review"]:
    sentences.extend(review_to_sentences(review, tokenizer))

print("Parsing sentence from unlabeled set")
for review in unlabeled_train["review"]:
    sentences.extend(review_to_sentences(review, tokenizer))

In [None]:
# Print how many sentences were collected in total.
print(len(sentences))

In [None]:
# Print the first two collected sentences (each is a list of words).
print(sentences[0])

print(sentences[1])

In [None]:
import gensim, logging
# Configure root logging at INFO level with a "time : level : message" layout.
logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s',
    level=logging.INFO,
)

In [None]:
# Training parameters, named after the Word2Vec argument each one feeds.
num_features = 300     # passed as size
min_word_count = 40    # passed as min_count
num_workers = 4        # passed as workers
context = 10           # passed as window
downsampling = 1e-3    # passed as sample

print("Training model...")
# The original read `model = model = ...`; the duplicated assignment target was
# redundant. Line continuations are also unnecessary inside the parentheses.
model = gensim.models.Word2Vec(
    sentences,
    workers=num_workers,
    size=num_features,
    min_count=min_word_count,
    window=context,
    sample=downsampling,
)

# NOTE(review): `size=` and `init_sims(replace=True)` are the gensim 3.x
# spellings; gensim 4 renamed the former to `vector_size` and deprecated the
# latter — confirm which release this notebook is pinned to.
model.init_sims(replace=True)

# File name the trained model is stored under.
model_name = "300features_40minwords_10context"

In [None]:
# Save the trained model under model_name so it can be loaded again later.
model.save(model_name)

In [None]:
# Query through model.wv: the same-named shortcut on the Word2Vec object is
# deprecated and was removed in gensim 4.
model.wv.doesnt_match("man woman child kitchen".split())

In [None]:
# Query through model.wv: the same-named shortcut on the Word2Vec object is
# deprecated and was removed in gensim 4.
model.wv.doesnt_match("france england germany berlin".split())

In [None]:
# Query through model.wv: the same-named shortcut on the Word2Vec object is
# deprecated and was removed in gensim 4.
model.wv.doesnt_match("paris berlin london austria".split())

In [None]:
# Query through model.wv: the same-named shortcut on the Word2Vec object is
# deprecated and was removed in gensim 4.
model.wv.most_similar("man")

In [None]:
# Query through model.wv: the same-named shortcut on the Word2Vec object is
# deprecated and was removed in gensim 4.
model.wv.most_similar("queen")

In [None]:
# Query through model.wv: the same-named shortcut on the Word2Vec object is
# deprecated and was removed in gensim 4.
model.wv.most_similar("awful")

**Numeric Representations of Words**

In [None]:
from gensim.models import Word2Vec
# Load the model from the file name it was saved under earlier in this notebook.
model = Word2Vec.load("300features_40minwords_10context")

In [None]:
# Show the Python type of the loaded model's cum_table attribute.
type(model.cum_table)

In [None]:
# Show the dimensions of the cum_table attribute.
model.cum_table.shape

In [None]:
# Look the word up through model.wv: indexing the Word2Vec object directly is
# deprecated and was removed in gensim 4.
model.wv["flower"]