In [1]:
import re
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns
import string
import nltk
import warnings 
warnings.filterwarnings("ignore", category=DeprecationWarning)
%matplotlib inline

In [3]:
# Load the unlabeled reviews from the tab-separated file. The first row is the
# header; quoting=3 is csv.QUOTE_NONE, so quote characters are kept verbatim.
data = pd.read_csv(
    "word2vec-nlp-tutorial/unlabeledTrainData.tsv",
    sep="\t",
    header=0,
    quoting=3,
)
data.head()

Unnamed: 0,id,review
0,"""9999_0""","""Watching Time Chasers, it obvious that it was..."
1,"""45057_0""","""I saw this film about 20 years ago and rememb..."
2,"""15561_0""","""Minor Spoilers<br /><br />In New York, Joan B..."
3,"""7161_0""","""I went to see this film with a great deal of ..."
4,"""43971_0""","""Yes, I agree with everyone on this site this ..."


In [4]:
# Replace every non-letter character in the raw review with a single space.
# regex=True is passed explicitly: pandas >= 2.0 treats the pattern as a
# literal string by default, which would leave the reviews unchanged.
# NOTE(review): HTML tags such as "<br />" survive as "br" tokens after this
# step - consider stripping tags beforehand if that is not intended.
data['tidy_review'] = data['review'].str.replace("[^a-zA-Z]", " ", regex=True)
data.head()

Unnamed: 0,id,review,tidy_review
0,"""9999_0""","""Watching Time Chasers, it obvious that it was...",Watching Time Chasers it obvious that it was...
1,"""45057_0""","""I saw this film about 20 years ago and rememb...",I saw this film about years ago and rememb...
2,"""15561_0""","""Minor Spoilers<br /><br />In New York, Joan B...",Minor Spoilers br br In New York Joan B...
3,"""7161_0""","""I went to see this film with a great deal of ...",I went to see this film with a great deal of ...
4,"""43971_0""","""Yes, I agree with everyone on this site this ...",Yes I agree with everyone on this site this ...


In [5]:
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
# English stop words as a set for O(1) membership checks.
stop_words = set(stopwords.words('english'))

# Tokenize each cleaned review, drop tokens whose lowercase form is a stop
# word, and glue the survivors back together with single spaces.
data['tidy_review'] = data['tidy_review'].apply(
    lambda review: ' '.join(
        token
        for token in word_tokenize(review)
        if token.lower() not in stop_words
    )
)

data.head()

Unnamed: 0,id,review,tidy_review
0,"""9999_0""","""Watching Time Chasers, it obvious that it was...",Watching Time Chasers obvious made bunch frien...
1,"""45057_0""","""I saw this film about 20 years ago and rememb...",saw film years ago remember particularly nasty...
2,"""15561_0""","""Minor Spoilers<br /><br />In New York, Joan B...",Minor Spoilers br br New York Joan Barnard Elv...
3,"""7161_0""","""I went to see this film with a great deal of ...",went see film great deal excitement school dir...
4,"""43971_0""","""Yes, I agree with everyone on this site this ...",Yes agree everyone site movie bad even call mo...


In [6]:
from nltk.stem.porter import *
stemmer = PorterStemmer()

# Stem every whitespace-separated token of each review and re-join the stems
# with single spaces in one pass. This replaces the former per-index loop
# (`tokenized_review[i] = ...`), which relied on the Series having a default
# 0..n-1 index and performed one label-based assignment per row.
tokenized_review = data['tidy_review'].apply(
    lambda review: ' '.join(stemmer.stem(token) for token in review.split())
)

data['tidy_review'] = tokenized_review



In [7]:
# Preview the first rows after stemming has been written to 'tidy_review'.
data.head()

Unnamed: 0,id,review,tidy_review
0,"""9999_0""","""Watching Time Chasers, it obvious that it was...",watch time chaser obviou made bunch friend may...
1,"""45057_0""","""I saw this film about 20 years ago and rememb...",saw film year ago rememb particularli nasti be...
2,"""15561_0""","""Minor Spoilers<br /><br />In New York, Joan B...",minor spoiler br br new york joan barnard elvi...
3,"""7161_0""","""I went to see this film with a great deal of ...",went see film great deal excit school director...
4,"""43971_0""","""Yes, I agree with everyone on this site this ...",ye agre everyon site movi bad even call movi i...


In [21]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet as wn
from nltk.corpus import sentiwordnet as swn
from nltk import sent_tokenize, word_tokenize, pos_tag
 
 
# Module-level WordNet lemmatizer instance, created once for reuse.
lemmatizer = WordNetLemmatizer()
 
 
def penn_to_wn(tag):
    """
    Map a Penn Treebank POS tag onto the matching WordNet POS constant.

    Tags beginning with 'J', 'N', 'R' or 'V' map to wn.ADJ, wn.NOUN, wn.ADV
    and wn.VERB respectively; any other tag yields None.
    """
    # (tag prefix, attribute name on `wn`) pairs, checked in order. The
    # attribute is only looked up once a prefix actually matches.
    prefix_to_attr = (
        ('J', 'ADJ'),
        ('N', 'NOUN'),
        ('R', 'ADV'),
        ('V', 'VERB'),
    )
    for prefix, attr_name in prefix_to_attr:
        if tag.startswith(prefix):
            return getattr(wn, attr_name)
    return None

In [22]:
def swn_polarity(text):
    """
    Accumulate SentiWordNet positive/negative scores for a piece of text.

    The text is split into sentences; each sentence is tokenized and
    POS-tagged. Every token tagged as a noun, adjective or adverb is
    lemmatized and looked up in WordNet. For each token that has at least one
    synset, the positive and negative SentiWordNet scores of its first synset
    are added to the running totals.

    Parameters
    ----------
    text : str
        The text to score.

    Returns
    -------
    list
        ``[positive_score, negative_score, tokens_count]`` where
        ``tokens_count`` is the number of tokens that contributed a score.
        No 0/1 label is computed here; callers derive it from the two totals.
    """
    positive_score = 0
    negative_score = 0
    tokens_count = 0

    for raw_sentence in sent_tokenize(text):
        tagged_sentence = pos_tag(word_tokenize(raw_sentence))

        for word, tag in tagged_sentence:
            wn_tag = penn_to_wn(tag)
            # Verbs and untagged tokens are not scored.
            if wn_tag not in (wn.NOUN, wn.ADJ, wn.ADV):
                continue

            # Reduce the token to its WordNet lemma before the synset lookup.
            # This step was missing, leaving `lemma` undefined (NameError on a
            # fresh kernel).
            lemma = lemmatizer.lemmatize(word, pos=wn_tag)
            if not lemma:
                continue

            synsets = wn.synsets(lemma, pos=wn_tag)
            if not synsets:
                continue

            # Take the first sense, the most common
            synset = synsets[0]
            swn_synset = swn.senti_synset(synset.name())

            positive_score += swn_synset.pos_score()
            negative_score += swn_synset.neg_score()
            tokens_count += 1

    return [positive_score, negative_score, tokens_count]

In [49]:
# Score every processed review; each cell of 'info' holds the list returned
# by swn_polarity for that review.
# NOTE(review): 'tidy_review' contains Porter stems at this point (e.g.
# "obviou", "movi"), which may not match WordNet entries - confirm intended.
data['info'] = data['tidy_review'].apply(swn_polarity)

In [38]:
# First element of each 'info' list: the accumulated positive score.
data['positive_score'] = data['info'].apply(lambda scores: scores[0])

In [39]:
# Second element of each 'info' list: the accumulated negative score.
data['negative_score'] = data['info'].apply(lambda scores: scores[1])

In [44]:
# Label a review 1 when positive minus negative is non-negative (ties count
# as positive), otherwise 0.
data['sentiment'] = data['info'].apply(
    lambda scores: int((scores[0] - scores[1]) >= 0)
)

In [50]:
# Preview the first rows with the score, label and raw 'info' columns added.
data.head()

Unnamed: 0,id,review,tidy_review,positive_score,negative_score,sentiment,info
0,"""9999_0""","""Watching Time Chasers, it obvious that it was...",watch time chaser obviou made bunch friend may...,0.25,3.0,0,"[0.25, 3.0, 26]"
1,"""45057_0""","""I saw this film about 20 years ago and rememb...",saw film year ago rememb particularli nasti be...,0.5,1.75,0,"[0.5, 1.75, 24]"
2,"""15561_0""","""Minor Spoilers<br /><br />In New York, Joan B...",minor spoiler br br new york joan barnard elvi...,3.0,4.5,0,"[3.0, 4.5, 66]"
3,"""7161_0""","""I went to see this film with a great deal of ...",went see film great deal excit school director...,1.75,1.5,1,"[1.75, 1.5, 55]"
4,"""43971_0""","""Yes, I agree with everyone on this site this ...",ye agre everyon site movi bad even call movi i...,2.25,1.875,1,"[2.25, 1.875, 49]"
