In [1]:
#Import needed libraries and get data
%matplotlib inline

import pandas as pd
import numpy as np
import re
import nltk
import urllib
from matplotlib import pyplot as plt
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from nltk.tag.perceptron import PerceptronTagger

train = pd.read_table('train.tsv')
train["Length"] = train['Phrase'].apply(lambda x: len(x.split()))
ordered = train.sort(['SentenceId', 'Length'], ascending=[1, 0])
train_sentences = train.groupby('SentenceId').first().reset_index()


max_length = max(ordered['Length'])
bins = [0, 1, max_length/15, max_length/4, max_length/2, max_length]
group_names = ['SingleWord', 'SmallPhrase', 'Phrase', 'LongPhrase', 'Sentences']
categories = pd.cut(ordered['Length'], bins, labels=group_names)
ordered['categories'] = pd.cut(ordered['Length'], bins, labels=group_names)


sentences = ordered[ordered['categories'] == 'Sentences'].reset_index()
longphrase = ordered[ordered['categories'] == 'LongPhrase'].reset_index()
phrase = ordered[ordered['categories'] == 'Phrase'].reset_index()
smallphrase = ordered[ordered['categories'] == 'SmallPhrase'].reset_index()
singleword = ordered[ordered['categories'] == 'SingleWord'].reset_index()

tagger = PerceptronTagger()

singleword['POS'] = 'N/A'
print bins[::-1]
print len(sentences), len(longphrase), len(phrase), len(smallphrase), len(singleword)
ordered



[52, 26, 13, 3, 1, 0]
4193 20141 68215 46979 16531


Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment,Length,categories
0,1,1,A series of escapades demonstrating the adage ...,1,37,Sentences
27,28,1,"is also good for the gander , some of which oc...",2,23,LongPhrase
28,29,1,"is also good for the gander , some of which oc...",2,22,LongPhrase
31,32,1,"good for the gander , some of which occasional...",2,20,LongPhrase
32,33,1,"for the gander , some of which occasionally am...",2,19,LongPhrase
33,34,1,"the gander , some of which occasionally amuses...",1,18,LongPhrase
38,39,1,some of which occasionally amuses but none of ...,2,15,LongPhrase
1,2,1,A series of escapades demonstrating the adage ...,2,14,LongPhrase
5,6,1,of escapades demonstrating the adage that what...,2,12,Phrase
43,44,1,occasionally amuses but none of which amounts ...,2,12,Phrase


In [4]:
def review_to_words( raw_review ):
    """Return ``raw_review`` lower-cased with every non-letter turned into a space.

    Each character outside ``a-z``/``A-Z`` is replaced by exactly one space, so
    the result may contain leading, trailing or repeated spaces.
    """
    # Disabled variant that also POS-tagged the tokens and returned both:
    #tagset = None
    #token = nltk.word_tokenize(word)
    #pos = nltk.tag._pos_tag(token, tagset, tagger)
    #return word, pos
    return re.sub("[^a-zA-Z]", " ", raw_review).lower()

In [5]:
num_reviews_single = singleword["Phrase"].size
clean_single_reviews = []

for i in xrange(0, num_reviews_single):
    cleaned = review_to_words(singleword['Phrase'][i])
    #cleaned, pos = review_to_words(singleword["Phrase"][i])
    #singleword['POS'][i] = str([x[1] for x in pos])
    clean_single_reviews.append(cleaned)
    print 'cleaning ', i, cleaned
print clean_single_reviews
singleword

cleaning  0 a
cleaning  1 series
cleaning  2 of
cleaning  3 escapades
cleaning  4 demonstrating
cleaning  5 the
cleaning  6 adage
cleaning  7 that
cleaning  8 what
cleaning  9 is
cleaning  10 good
cleaning  11 for
cleaning  12 goose
cleaning  13 also
cleaning  14 gander
cleaning  15  
cleaning  16 some
cleaning  17 which
cleaning  18 occasionally
cleaning  19 amuses
cleaning  20 but
cleaning  21 none
cleaning  22 amounts
cleaning  23 to
cleaning  24 much
cleaning  25 story
cleaning  26  
cleaning  27 this
cleaning  28 quiet
cleaning  29 introspective
cleaning  30 and
cleaning  31 entertaining
cleaning  32 independent
cleaning  33 worth
cleaning  34 seeking
cleaning  35 even
cleaning  36 fans
cleaning  37 ismail
cleaning  38 merchant
cleaning  39  s
cleaning  40 work
cleaning  41 i
cleaning  42 suspect
cleaning  43 would
cleaning  44 have
cleaning  45 hard
cleaning  46 time
cleaning  47 sitting
cleaning  48 through
cleaning  49 one
cleaning  50 positively
cleaning  51 thrilling
cleaning

Unnamed: 0,index,PhraseId,SentenceId,Phrase,Sentiment,Length,categories,POS
0,3,4,1,A,2,1,SingleWord,
1,4,5,1,series,2,1,SingleWord,
2,6,7,1,of,2,1,SingleWord,
3,8,9,1,escapades,2,1,SingleWord,
4,11,12,1,demonstrating,2,1,SingleWord,
5,13,14,1,the,2,1,SingleWord,
6,14,15,1,adage,2,1,SingleWord,
7,16,17,1,that,2,1,SingleWord,
8,18,19,1,what,2,1,SingleWord,
9,20,21,1,is,2,1,SingleWord,


In [30]:
#singleword.to_csv('singleword.csv', header=True, index=True)

In [29]:
# Replace the in-memory single-word frame with the copy cached in
# 'singleword.csv'. The cached copy shown in this cell's output has the POS
# column populated and carries an extra 'Unnamed: 0' column.
singleword = pd.read_csv('singleword.csv')
singleword

Unnamed: 0.1,Unnamed: 0,index,PhraseId,SentenceId,Phrase,Sentiment,Length,categories,POS
0,0,3,4,1,A,2,1,SingleWord,['DT']
1,1,4,5,1,series,2,1,SingleWord,['NN']
2,2,6,7,1,of,2,1,SingleWord,['IN']
3,3,8,9,1,escapades,2,1,SingleWord,['NNS']
4,4,11,12,1,demonstrating,2,1,SingleWord,['VBG']
5,5,13,14,1,the,2,1,SingleWord,['DT']
6,6,14,15,1,adage,2,1,SingleWord,['NN']
7,7,16,17,1,that,2,1,SingleWord,['IN']
8,8,18,19,1,what,2,1,SingleWord,['WP']
9,9,20,21,1,is,2,1,SingleWord,['VBZ']


In [7]:
# Bag-of-words encoder: word-level analysis, no custom tokenizer or
# preprocessor, no stop-word list, max_features set to 9000.
vectorizer = CountVectorizer(
    analyzer="word",
    tokenizer=None,
    preprocessor=None,
    stop_words=None,
    max_features=9000,
)
# Learn the vocabulary from the cleaned single words and densify the counts.
single_data_features = vectorizer.fit_transform(clean_single_reviews).toarray()

In [8]:
# Inspect what the vectorizer learned: the vocabulary size, then every term.
vocab = vectorizer.get_feature_names()
print len(vocab), vocab



In [8]:
# NOTE(review): training is disabled in this cell (both statements below are
# commented out), so running it only prints 'done' and does not define
# `forest_single`.
# Initialize a Random Forest classifier with 20 trees
#forest_single = RandomForestClassifier(n_estimators = 20) 

# Fit the forest to the training set, using the bag of words as 
# features and the sentiment labels as the response variable
#
# This may take a few minutes to run
#forest_single = forest_single.fit(single_data_features, singleword["Sentiment"])
print 'done'

done


In [10]:
import cPickle

#with open('forest.cpickle', 'wb') as f:
#    cPickle.dump(forest_single, f)

In [11]:
# Load `forest_single` from 'forest.cpickle' instead of fitting it here.
# NOTE(review): unpickling can execute arbitrary code; only load this file if
# it comes from a trusted source.
with open('forest.cpickle', 'rb') as f:
    forest_single = cPickle.load(f)

In [64]:
test = pd.read_table('test.tsv')
test["Length"] = test['Phrase'].apply(lambda x: len(x.split()))
ordered = test.sort(['SentenceId', 'Length'], ascending=[1, 0])

max_length = max(ordered['Length'])
bins = [0, 1, max_length/15, max_length/4, max_length/2, max_length]
group_names = ['SingleWord', 'SmallPhrase', 'Phrase', 'LongPhrase', 'Sentences']
categories = pd.cut(ordered['Length'], bins, labels=group_names)
ordered['categories'] = pd.cut(ordered['Length'], bins, labels=group_names)


sentences_test = ordered[ordered['categories'] == 'Sentences'].reset_index()
longphrase_test = ordered[ordered['categories'] == 'LongPhrase'].reset_index()
phrase_test = ordered[ordered['categories'] == 'Phrase'].reset_index()
smallphrase_test = ordered[ordered['categories'] == 'SmallPhrase'].reset_index()
stest = ordered[ordered['categories'] == 'SingleWord'].reset_index()
#singleword_test['POS'] = 'N/A'
print len(stest)
num_reviews_single = singleword_test["Phrase"].size
clean_single_reviews = []

10014


  app.launch_new_instance()


In [22]:
# for i in xrange(0, num_reviews_single):
#     cleaned, pos = review_to_words(singleword_test["Phrase"][i])
#     singleword_test['POS'][i] = str([x[1] for x in pos])
#     singleword_test['Phrase'][i] = cleaned
#     clean_single_reviews.append(cleaned)
#     print 'cleaning ', i, cleaned

#singleword_test.to_csv('singleword_test.csv', header=True, index=True)

In [48]:
# Load the single-word test frame cached in 'singleword_test.csv'. The copy
# shown in this cell's output already has POS and Sentiment columns, plus
# several 'Unnamed: ...' columns.
singleword_test = pd.read_csv('singleword_test.csv')
singleword_test

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,index,PhraseId,SentenceId,Phrase,Length,categories,POS,Sentiment
0,0,0,0,2,156063,8545,an,1,SingleWord,['DT'],3
1,1,1,1,7,156068,8545,intermittently,1,SingleWord,['RB'],2
2,2,2,2,8,156069,8545,pleasing,1,SingleWord,['VBG'],2
3,3,3,3,9,156070,8545,but,1,SingleWord,['CC'],2
4,4,4,4,11,156072,8545,mostly,1,SingleWord,['RB'],3
5,5,5,5,12,156073,8545,routine,1,SingleWord,['NN'],2
6,6,6,6,13,156074,8545,effort,1,SingleWord,['NN'],2
7,7,7,7,14,156075,8545,,1,SingleWord,[],2
8,8,8,8,16,156077,8546,kidman,1,SingleWord,['NN'],2
9,9,9,9,20,156081,8546,is,1,SingleWord,['VBZ'],2


In [4]:
for i in xrange(0,singleword_test['Sentiment'].size):
    print singleword_test['Phrase'][i], singleword_test['Sentiment'][i]

an 3
intermittently 2
pleasing 2
but 2
mostly 3
routine 2
effort 2
  2
kidman 2
is 2
really 2
the 2
only 2
thing 2
that 2
 s 2
worth 3
watching 2
in 3
birthday 2
girl 2
  2
a 2
film 2
by 2
stage trained 1
jez 2
butterworth 2
 lrb  2
mojo 2
 rrb  2
serves 2
as 2
yet 2
another 2
example 1
of 2
sad 1
decline 2
british 1
comedies 3
post full 2
monty 2
world 2
once 2
you 2
get 2
into 2
its 2
rhythm 2
    2
movie 3
becomes 2
heady 2
experience 2
i 2
kept 2
wishing 3
was 3
documentary 2
about 2
wartime 2
navajos 3
and 1
what 2
they 2
accomplished 2
instead 2
all 3
this 1
specious 2
hollywood 2
hoo ha 1
kinnear 2
does 1
n t 2
aim 2
for 2
our 2
sympathy 2
rather 2
delivers 3
performance 2
striking 2
skill 2
depth 1
ends 2
well 3
sort 2
frenzied 2
comic 2
moments 2
never 3
click 3
it 3
hoot 2
half 2
great 2
way 3
american 2
people 2
to 3
see 4
candidate 2
like 2
when 2
he 2
not 2
giving 2
same 2
   cent 1
stump 3
speech 2
weight 3
piece 2
unerring 2
professionalism 1
chilly 1
production 2
fascin

In [15]:
print "Applying tree to single words...\n" 
single_data_features = vectorizer.fit_transform(clean_single_reviews)
single_data_features = single_data_features.toarray()
result_single = forest_single.predict(single_data_features)

Applying tree to single words...



ValueError: empty vocabulary; perhaps the documents only contain stop words

In [14]:
# Store the forest's predictions in the Sentiment column, replacing any values
# that column already held.
singleword_test['Sentiment'] = result_single.tolist()
singleword_test

NameError: name 'result_single' is not defined

In [17]:
# Peek at the PhraseId values of the single-word frame.
print singleword.PhraseId.values 

[     4      5      7 ..., 156047 156059 156060]


In [74]:
def basic_clean( raw_review ):
    """Split ``raw_review`` into a list of lower-case alphabetic tokens.

    Every character outside ``a-z``/``A-Z`` acts as a separator; an input
    with no letters yields an empty list.
    """
    return re.sub("[^a-zA-Z]", " ", raw_review).lower().split()

test_sentences = ordered.groupby('SentenceId').first().reset_index()

#remaining = ordered[~ordered['PhraseId'].isin(test_sentences['PhraseId'])].reset_index()
#test_phrases = remaining[remaining.categories != 'SingleWord'].reset_index()

print len(remaining), len(test_phrases), len(ordered), len(test_sentences), max(ordered.SentenceId)
#test_phrases['Sentiment'] = '2'

num_reviews_phrases = test_phrases["Phrase"].size
clean_phrases_reviews = []

#for i in xrange(0, num_reviews_phrases):
#    print 'cleaning ', i, 'of ', num_reviews_phrases
#    test_phrases['Phrase'][i] = basic_clean(test_phrases['Phrase'][i])
test_sentences

13324 52968 66292 3310 11855


Unnamed: 0,SentenceId,PhraseId,Phrase,Length,categories
0,8545,156061,An intermittently pleasing but mostly routine ...,8,Phrase
1,8546,156076,Kidman is really the only thing that 's worth ...,43,Sentences
2,8547,156154,Once you get into its rhythm ... the movie bec...,14,Phrase
3,8548,156178,I kept wishing I was watching a documentary ab...,24,LongPhrase
4,8549,156219,"Kinnear does n't aim for our sympathy , but ra...",19,LongPhrase
5,8550,156250,"All ends well , sort of , but the frenzied com...",15,LongPhrase
6,8551,156272,"It 's a hoot and a half , and a great way for ...",34,Sentences
7,8552,156324,"The weight of the piece , the unerring profess...",26,LongPhrase
8,8553,156362,"The film contains no good jokes , no good scen...",28,LongPhrase
9,8554,156405,"An offbeat , sometimes gross and surprisingly ...",18,LongPhrase


In [62]:
#test_phrases.to_csv('test_phrases.csv', header=True, index=True)
#test_phrases = pd.read_csv('test_phrases.csv')
#test_phrases

Unnamed: 0.1,Unnamed: 0,level_0,index,PhraseId,SentenceId,Phrase,Length,categories,Sentiment
0,0,0,1,156062,8545,"['an', 'intermittently', 'pleasing', 'but', 'm...",7,Phrase,2
1,1,1,3,156064,8545,"['intermittently', 'pleasing', 'but', 'mostly'...",6,Phrase,2
2,2,2,4,156065,8545,"['intermittently', 'pleasing', 'but', 'mostly'...",5,Phrase,2
3,3,3,5,156066,8545,"['intermittently', 'pleasing', 'but']",3,SmallPhrase,2
4,4,4,6,156067,8545,"['intermittently', 'pleasing']",2,SmallPhrase,2
5,5,5,10,156071,8545,"['mostly', 'routine']",2,SmallPhrase,2
6,6,14,17,156078,8546,"['is', 'really', 'the', 'only', 'thing', 'that...",42,Sentences,2
7,7,15,18,156079,8546,"['is', 'really', 'the', 'only', 'thing', 'that...",41,Sentences,2
8,8,16,22,156083,8546,"['the', 'only', 'thing', 'that', 's', 'worth',...",39,Sentences,2
9,9,17,28,156089,8546,"['that', 's', 'worth', 'watching', 'in', 'birt...",36,Sentences,2


In [3]:
import csv
all_sentiments = []
phrases_size = test_phrases['Phrase'].size

negations = ['however','but','although','not','no','neither','never','noone','nobody','none','nor','nothing','nowhere','hardly','scarcely','isn','wasn','didn']

#test_phrases["Phrase"] = test_phrases['Phrase'].apply(lambda x: )

new_file = open('all_sentiments.txt', 'w')
test_phrases

for i in xrange(0,phrases_size):
    print 'index ',i,' of',phrases_size
    raw_phrase = test_phrases['Phrase'][i]
    phrase = basic_clean(raw_phrase)
    sentiments = []
    for j in phrase:
        word = singleword_test[singleword_test.Phrase == j]
        pos_str = str(word.POS).partition('[')[-1].rpartition(']')[0]
        try:
            sentiments.append([int(word.Sentiment),'NEGATE']) if j in negations else sentiments.append([int(word.Sentiment),pos_str])
        except:
            pass
    new_file.write("%s\n" % sentiments)
    all_sentiments.append(sentiments)
new_file.close()
all_sentiments

NameError: name 'test_phrases' is not defined

In [92]:
# The two commented-out lines below attached `all_sentiments` as the SentPOS
# column and wrote the frame to 'test_phrases.csv'; with them disabled, this
# cell only reads that file back. In the loaded frame the Phrase and SentPOS
# cells are displayed as stringified lists (see this cell's output).
#test_phrases['SentPOS'] = all_sentiments
#test_phrases.to_csv('test_phrases.csv', header=True, index=True)
test_phrases = pd.read_csv('test_phrases.csv')
test_phrases

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,level_0,index,PhraseId,SentenceId,Phrase,Length,categories,Sentiment,SentPOS
0,0,0,0,0,1,156062,8545,"['an', 'intermittently', 'pleasing', 'but', 'm...",7,Phrase,2,"[[3, ""'DT'""], [2, ""'RB'""], [2, ""'VBG'""], [2, '..."
1,1,1,1,1,3,156064,8545,"['intermittently', 'pleasing', 'but', 'mostly'...",6,Phrase,2,"[[2, ""'RB'""], [2, ""'VBG'""], [2, 'NEGATE'], [3,..."
2,2,2,2,2,4,156065,8545,"['intermittently', 'pleasing', 'but', 'mostly'...",5,Phrase,2,"[[2, ""'RB'""], [2, ""'VBG'""], [2, 'NEGATE'], [3,..."
3,3,3,3,3,5,156066,8545,"['intermittently', 'pleasing', 'but']",3,SmallPhrase,2,"[[2, ""'RB'""], [2, ""'VBG'""], [2, 'NEGATE']]"
4,4,4,4,4,6,156067,8545,"['intermittently', 'pleasing']",2,SmallPhrase,2,"[[2, ""'RB'""], [2, ""'VBG'""]]"
5,5,5,5,5,10,156071,8545,"['mostly', 'routine']",2,SmallPhrase,2,"[[3, ""'RB'""], [2, ""'NN'""]]"
6,6,6,6,14,17,156078,8546,"['is', 'really', 'the', 'only', 'thing', 'that...",42,Sentences,2,"[[2, ""'VBZ'""], [2, ""'RB'""], [2, ""'DT'""], [2, ""..."
7,7,7,7,15,18,156079,8546,"['is', 'really', 'the', 'only', 'thing', 'that...",41,Sentences,2,"[[2, ""'VBZ'""], [2, ""'RB'""], [2, ""'DT'""], [2, ""..."
8,8,8,8,16,22,156083,8546,"['the', 'only', 'thing', 'that', 's', 'worth',...",39,Sentences,2,"[[2, ""'DT'""], [2, ""'RB'""], [2, ""'NN'""], [2, ""'..."
9,9,9,9,17,28,156089,8546,"['that', 's', 'worth', 'watching', 'in', 'birt...",36,Sentences,2,"[[2, ""'IN'""], [2, ""'NN'""], [3, ""'NN'""], [2, ""'..."


In [19]:
import re, string

test_phrases['SentPOS'].tolist()
pattern = re.compile('[\W_]+')
length = test_phrases['SentPOS'].size

predicted_sent = []

for i in xrange(48000,length):
    print 'index: ',i
    raw_order = pattern.sub(' ', test_phrases['SentPOS'][i])
    order = raw_order.replace('\[\[\]\]', '').split()
    str_to_int = [int(i) if i.isdigit() else i for i in order]
    test = [tuple(str_to_int[i:i+2]) for i in range(0, len(str_to_int),2)]
    overall_sent = 0
    reverse = {0:4,1:3,2:2,3:1,4:0}
    rev = False
    count = 0
    pos_words = 0
    neg_words = 0
    affected = ['RB','RBR','RBS','JJS','JJR','JJ']
    for pair in test:
        try:
            init_val = pair[0]
            if init_val == 2:
                continue
            else:
                count += 1
                #print len(pair),init_val,'--------------'
                if pair[1] == 'NEGATE':
                    print 'reversing'
                    val = reverse[pair[0]]
                    rev = not rev
                else:
                    if rev:
                        if pair[1] in affected:
                            print 'affected'
                            val = reverse[init_val]
                    else:
                        val = init_val
                    if val == 3 or val == 4:
                        pos_words += 1
                    else: 
                        neg_words += 1
                overall_sent += val
        except: 
            continue
    print count, pos_words, neg_words
    if ((len(test) > 2 and count > 2) or (len(test) < 3 and count > 0)) and pos_words != neg_words:
        if neg_words == 0:
            sentiment = 4
        elif pos_words == 0:
            sentiment == 0
        else:
            sentiment = int(round(overall_sent/float(count)))
    else:
        sentiment = 2
    test_phrases['Sentiment'][i] = sentiment
    print 'sentiment: ',test_phrases['Sentiment'][i], ' id: ',test_phrases['PhraseId'][i]
    #print sentiment
    predicted_sent.append(sentiment)
test_phrases
                    
            

index:  48000
2 2 0
sentiment:  2  id:  156130
index:  48001
2 2 0
sentiment:  2  id:  156130
index:  48002
2 2 0
sentiment:  2  id:  156130
index:  48003
2 2 0
sentiment:  2  id:  156128
index:  48004
1 1 0
sentiment:  2  id:  156091
index:  48005
1 1 0
sentiment:  2  id:  156083
index:  48006
1 1 0
sentiment:  2  id:  156078
index:  48007
1 0 1
sentiment:  2  id:  156067
index:  48008
1 0 1
sentiment:  2  id:  156065
index:  48009
0 0 0
sentiment:  2  id:  156065
index:  48010
9 5 4
sentiment:  2  id:  156084
index:  48011
9 5 4
sentiment:  2  id:  156084
index:  48012
9 5 4
sentiment:  2  id:  156100
index:  48013
9 5 4
sentiment:  2  id:  156114
index:  48014
8 4 4
sentiment:  2  id:  156132
index:  48015
7 3 4
sentiment:  2  id:  156130
index:  48016
4 1 3
sentiment:  2  id:  156083
index:  48017
3 1 2
sentiment:  2  id:  156078
index:  48018
2 1 1
sentiment:  2  id:  156067
index:  48019
0 0 0
sentiment:  2  id:  156062
index:  48020
0 0 0
sentiment:  2  id:  156062
index:  48021

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,level_0,index,PhraseId,SentenceId,Phrase,Length,categories,Sentiment,SentPOS
0,0,0,0,1,156062,8545,"['an', 'intermittently', 'pleasing', 'but', 'm...",7,Phrase,2,"[[3, ""'DT'""], [2, ""'RB'""], [2, ""'VBG'""], [2, '..."
1,1,1,1,3,156064,8545,"['intermittently', 'pleasing', 'but', 'mostly'...",6,Phrase,2,"[[2, ""'RB'""], [2, ""'VBG'""], [2, 'NEGATE'], [3,..."
2,2,2,2,4,156065,8545,"['intermittently', 'pleasing', 'but', 'mostly'...",5,Phrase,2,"[[2, ""'RB'""], [2, ""'VBG'""], [2, 'NEGATE'], [3,..."
3,3,3,3,5,156066,8545,"['intermittently', 'pleasing', 'but']",3,SmallPhrase,2,"[[2, ""'RB'""], [2, ""'VBG'""], [2, 'NEGATE']]"
4,4,4,4,6,156067,8545,"['intermittently', 'pleasing']",2,SmallPhrase,2,"[[2, ""'RB'""], [2, ""'VBG'""]]"
5,5,5,5,10,156071,8545,"['mostly', 'routine']",2,SmallPhrase,2,"[[3, ""'RB'""], [2, ""'NN'""]]"
6,6,6,14,17,156078,8546,"['is', 'really', 'the', 'only', 'thing', 'that...",42,Sentences,2,"[[2, ""'VBZ'""], [2, ""'RB'""], [2, ""'DT'""], [2, ""..."
7,7,7,15,18,156079,8546,"['is', 'really', 'the', 'only', 'thing', 'that...",41,Sentences,2,"[[2, ""'VBZ'""], [2, ""'RB'""], [2, ""'DT'""], [2, ""..."
8,8,8,16,22,156083,8546,"['the', 'only', 'thing', 'that', 's', 'worth',...",39,Sentences,2,"[[2, ""'DT'""], [2, ""'RB'""], [2, ""'NN'""], [2, ""'..."
9,9,9,17,28,156089,8546,"['that', 's', 'worth', 'watching', 'in', 'birt...",36,Sentences,2,"[[2, ""'IN'""], [2, ""'NN'""], [3, ""'NN'""], [2, ""'..."


In [24]:
#test_phrases.to_csv('test_phrases.csv', header=True, index=True)

In [None]:
test_sentences['Sentiment'] = 2
sentences_amt = test_sentences['Sentiment'].size
negations = ['than','however','but','although','not','no','neither','never','noone','nobody','none','nor','nothing','nowhere','hardly','scarcely','isn','wasn','didn']
reverse = {0:4,1:3,2:2,3:1,4:0}
sentences_amt2 = test_phrases['Sentiment'].size

for i in xrange(0,sentences_amt2):
    sent_id = test_phrases['SentenceId'][i]
    sent = test_phrases['Sentiment'][i]
    sentiment = 0
    #phrase = basic_clean(test_sentences['Phrase'][i])
    phrase = test_phrases['Phrase'][i]
    rev = False
    neg_count = 0
    pos_count = 0
    count = 0
    for word in phrase:
        try:
            sw = singleword[singleword.Phrase == word]
            word_sent = int(sw.Sentiment)
            if word in negations:
                rev = not rev
                word_sent = reverse[word_sent]
            if word_sent == 0 or word_sent == 1:
                neg_count +=1
                count +=1
            elif word_sent == 3 or word_sent == 4:
                pos_count +=1
                count+=1
            else:
                continue
            sentiment += int(word_sent)
        except:
            continue
    if count > 2 and neg_count != pos_count:
        if pos_count > neg_count:
            if neg_count == 0:
                sent = 4
            else:
                sent = 3
        elif neg_count > pos_count:
            if pos_count == 0:
                sent = 0
            else:
                sent = 1
        else:
            sent = int(round(sentiment/len(count)))
    test_phrases['Sentiment'][i] = sent
    print 'index: ',test_phrases['PhraseId'][i],sent

index:  156062 2
index:  156064 2
index:  156065 2
index:  156066 2
index:  156067 2
index:  156071 2
index:  156078 2
index:  156079 2
index:  156083 2
index:  156089 2
index:  156091 2
index:  156092 2
index:  156093 2
index:  156097 2
index:  156128 2
index:  156111 2
index:  156130 2
index:  156113 2
index:  156132 2
index:  156098 2
index:  156114 2
index:  156136 2
index:  156100 2
index:  156138 2
index:  156102 2
index:  156147 2
index:  156115 2
index:  156148 2
index:  156084 2
index:  156103 2
index:  156116 2
index:  156121 2
index:  156139 3
index:  156143 2
index:  156149 2
index:  156080 2
index:  156086 2
index:  156094 2
index:  156104 1
index:  156108 2
index:  156118 2
index:  156123 2
index:  156126 3
index:  156133 2
index:  156140 2
index:  156144 2
index:  156151 3
index:  156166 2
index:  156168 3
index:  156155 2
index:  156157 2
index:  156171 2
index:  156159 3
index:  156172 2
index:  156161 3
index:  156174 2
index:  156163 3
index:  156169 2
index:  156175

In [97]:
# NOTE(review): the value of the next expression is discarded, because a
# notebook cell only displays its last expression.
len(test_sentences[test_sentences.Sentiment != 2])
# Row counts of the per-sentence, single-word, phrase and full test frames.
print len(test_sentences),len(singleword_test),len(test_phrases),len(test)

3310 10014 52968 66292


In [94]:
#phrase = 
# Combine the per-sentence rows with the scored phrases. With `on` omitted,
# pandas joins on the columns the two frames have in common; how='outer'
# keeps rows from both sides.
df = pd.merge(test_sentences, test_phrases, how='outer')
# Test rows whose PhraseId is in neither frame get the default label 2.
remaining = test[~test['PhraseId'].isin(df['PhraseId'])].reset_index()
remaining['Sentiment'] = 2
df1 = pd.merge(df, remaining, how='outer')
# Order by PhraseId and renumber the rows from 0.
df2 = df1.sort(['PhraseId']).reset_index(drop=True)
df2



Unnamed: 0.2,SentenceId,PhraseId,Phrase,Length,categories,Sentiment,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,level_0,index,SentPOS
0,8545,156061,An intermittently pleasing but mostly routine ...,8,Phrase,2,,,,,,
1,8545,156062,"['an', 'intermittently', 'pleasing', 'but', 'm...",7,Phrase,2,0,0,0,0,1,"[[3, ""'DT'""], [2, ""'RB'""], [2, ""'VBG'""], [2, '..."
2,8545,156063,An,1,,2,,,,,2,
3,8545,156064,"['intermittently', 'pleasing', 'but', 'mostly'...",6,Phrase,2,1,1,1,1,3,"[[2, ""'RB'""], [2, ""'VBG'""], [2, 'NEGATE'], [3,..."
4,8545,156065,"['intermittently', 'pleasing', 'but', 'mostly'...",5,Phrase,2,2,2,2,2,4,"[[2, ""'RB'""], [2, ""'VBG'""], [2, 'NEGATE'], [3,..."
5,8545,156066,"['intermittently', 'pleasing', 'but']",3,SmallPhrase,2,3,3,3,3,5,"[[2, ""'RB'""], [2, ""'VBG'""], [2, 'NEGATE']]"
6,8545,156067,"['intermittently', 'pleasing']",2,SmallPhrase,2,4,4,4,4,6,"[[2, ""'RB'""], [2, ""'VBG'""]]"
7,8545,156068,intermittently,1,,2,,,,,7,
8,8545,156069,pleasing,1,,2,,,,,8,
9,8545,156070,but,1,,2,,,,,9,


In [95]:
df2["PhraseId"] = df2["PhraseId"].apply(lambda x : int(x))
df2["Sentiment"] = df2["Sentiment"].apply(lambda x : int(x))
print len(df2)

66292


In [96]:
# Keep only the two columns that are written to the output file.
output = pd.DataFrame(data={"PhraseId":df2["PhraseId"], "Sentiment":df2['Sentiment']})

# Use pandas to write the comma-separated output file
# (index=False omits the row index; quoting=3 is csv.QUOTE_NONE, i.e. no
# field is ever quoted)
output.to_csv( "rotten_tomatoes_model.csv", index=False, quoting=3 )