In [3]:

# coding: utf-8

# # Predicting sentiment from product reviews
#
# # Fire up GraphLab Create

# In[1]:

import graphlab


# # Read some product review data
#
# Load reviews for a set of baby products.

# In[2]:

# 'amazon_baby.gl/' is a relative path, so it is resolved against the kernel's
# working directory. NOTE(review): the IOError recorded at the end of this
# notebook ("Cannot open .../amazon_baby.gl/objects.bin") appears to come from
# this load -- confirm the directory exists and is complete before running.
products = graphlab.SFrame('amazon_baby.gl/')


# # Let's explore this data together
#
# The data includes the product name, the review text and the rating of the review.

# In[3]:

products.head()


# # Build the word count vector for each review

# In[4]:

# Store the result of count_words on the 'review' column as a new 'word_count'
# column; later cells treat each element as a word -> count mapping
# (membership test plus .get).
products['word_count'] = graphlab.text_analytics.count_words(products['review'])


# In[5]:

# Displayed again to inspect the newly added 'word_count' column.
products.head()


# In[6]:

# NOTE(review): presumably routes subsequent .show() output into the notebook
# itself -- confirm against the GraphLab Canvas documentation.
graphlab.canvas.set_target('ipynb')


# In[7]:

products['name'].show()


# # Examining the reviews for the most-sold product: 'Vulli Sophie the Giraffe Teether'

# In[8]:

# Keep only the rows whose 'name' equals the giraffe teether. This subset is
# taken before the 3-star reviews are removed from `products` further down, so
# it is not restricted to the ratings the classifier is trained on.
giraffe_reviews = products[products['name'] == 'Vulli Sophie the Giraffe Teether']


# In[9]:

# Number of reviews for this product.
len(giraffe_reviews)


# In[10]:

giraffe_reviews['rating'].show(view='Categorical')


# # Build a sentiment classifier

# In[11]:

# Rating distribution over all products, viewed before any reviews are dropped.
products['rating'].show(view='Categorical')


# ## Define what's a positive and a negative sentiment
#
# We will ignore all reviews with rating = 3, since they tend to have a neutral sentiment. Reviews with a rating of 4 or higher will be considered positive, while the ones with a rating of 2 or lower will be considered negative.

# In[12]:

# Ignore all 3* reviews. This rebinds `products` to the filtered result, so
# every later cell works on the data without the 3-star reviews.
products = products[products['rating'] != 3]


# In[13]:

# Positive sentiment = 4* or 5* reviews; every other review left after the
# filter above gets a false 'sentiment' value.
products['sentiment'] = products['rating'] >=4


# In[14]:

products.head()


# ## Let's train the sentiment classifier

# In[15]:

# A fixed seed is passed so the split is repeatable; .8 is the fraction given
# to random_split. The first returned frame is used for training below, the
# second for validation and evaluation.
train_data,test_data = products.random_split(.8, seed=0)


# In[16]:

# Logistic classifier predicting 'sentiment' from the 'word_count' column only.
# NOTE(review): test_data is passed as the validation set and is also used
# for the evaluation below -- confirm this reuse is intended.
sentiment_model = graphlab.logistic_classifier.create(train_data,
                                                     target='sentiment',
                                                     features=['word_count'],
                                                     validation_set=test_data)


# # Evaluate the sentiment model

# In[17]:

sentiment_model.evaluate(test_data, metric='roc_curve')


# In[18]:

sentiment_model.show(view='Evaluation')


# # Applying the learned model to understand sentiment for Giraffe

# In[19]:

# Store the model's probability output for every giraffe review in a new
# 'predicted_sentiment' column.
giraffe_reviews['predicted_sentiment'] = sentiment_model.predict(giraffe_reviews, output_type='probability')


# In[20]:

giraffe_reviews.head()


# ## Sort the reviews based on the predicted sentiment and explore

# In[21]:

# Descending order: the highest predicted_sentiment values come first. The
# sorted result is assigned back to giraffe_reviews.
giraffe_reviews = giraffe_reviews.sort('predicted_sentiment', ascending=False)


# In[22]:

giraffe_reviews.head()


# ## Most positive reviews for the giraffe

# In[23]:

# First row after the descending sort, i.e. the highest predicted_sentiment.
giraffe_reviews[0]['review']


# In[24]:

giraffe_reviews[1]['review']


# ## Show most negative reviews for giraffe

# In[25]:

# Last row after the descending sort, i.e. the lowest predicted_sentiment.
giraffe_reviews[-1]['review']


# In[26]:

giraffe_reviews[-2]['review']


# ## Use .apply() to build a new feature with the counts for each of the selected_words
# 
# selected_words = ['awesome', 'great', 'fantastic', 'amazing', 'love', 'horrible', 'bad', 'terrible', 'awful', 'wow', 'hate']

# In[27]:

def awesome_count(x):
    """Return the value stored under 'awesome' in the word-count mapping ``x``.

    ``x`` is one element of the 'word_count' column (word -> count).
    Returns 0 when 'awesome' is not a key of ``x``.
    """
    word = 'awesome'
    return x.get(word) if word in x else 0


# In[28]:

# New 'awesome' column: awesome_count applied to each element of 'word_count'.
products['awesome'] = products['word_count'].apply(awesome_count)


# In[29]:

def great_count(x):
    """Return the value stored under 'great' in the word-count mapping ``x``.

    ``x`` is one element of the 'word_count' column (word -> count).
    Returns 0 when 'great' is not a key of ``x``.
    """
    word = 'great'
    return x.get(word) if word in x else 0


# In[30]:

# New 'great' column: great_count applied to each element of 'word_count'.
products['great'] = products['word_count'].apply(great_count)


# In[31]:

def fan_count(x):
    """Return the value stored under 'fantastic' in the word-count mapping ``x``.

    ``x`` is one element of the 'word_count' column (word -> count).
    Returns 0 when 'fantastic' is not a key of ``x``.
    """
    word = 'fantastic'
    return x.get(word) if word in x else 0


# In[32]:

# New 'fantastic' column: fan_count applied to each element of 'word_count'.
products['fantastic'] = products['word_count'].apply(fan_count)


# In[33]:

def ama_count(x):
    """Return the value stored under 'amazing' in the word-count mapping ``x``.

    ``x`` is one element of the 'word_count' column (word -> count).
    Returns 0 when 'amazing' is not a key of ``x``.
    """
    word = 'amazing'
    return x.get(word) if word in x else 0


# In[34]:

# New 'amazing' column: ama_count applied to each element of 'word_count'.
products['amazing'] = products['word_count'].apply(ama_count)


# In[35]:

def love_count(x):
    """Return the value stored under 'love' in the word-count mapping ``x``.

    ``x`` is one element of the 'word_count' column (word -> count).
    Returns 0 when 'love' is not a key of ``x``.
    """
    word = 'love'
    return x.get(word) if word in x else 0


# In[36]:

# New 'love' column: love_count applied to each element of 'word_count'.
products['love'] = products['word_count'].apply(love_count)


# In[37]:

def horr_count(x):
    """Return the value stored under 'horrible' in the word-count mapping ``x``.

    ``x`` is one element of the 'word_count' column (word -> count).
    Returns 0 when 'horrible' is not a key of ``x``.
    """
    word = 'horrible'
    return x.get(word) if word in x else 0


# In[38]:

# New 'horrible' column: horr_count applied to each element of 'word_count'.
products['horrible'] = products['word_count'].apply(horr_count)


# In[39]:

def bad_count(x):
    """Return the value stored under 'bad' in the word-count mapping ``x``.

    ``x`` is one element of the 'word_count' column (word -> count).
    Returns 0 when 'bad' is not a key of ``x``.
    """
    word = 'bad'
    return x.get(word) if word in x else 0


# In[40]:

# New 'bad' column: bad_count applied to each element of 'word_count'.
products['bad'] = products['word_count'].apply(bad_count)


# In[41]:

def terr_count(x):
    """Return the value stored under 'terrible' in the word-count mapping ``x``.

    ``x`` is one element of the 'word_count' column (word -> count).
    Returns 0 when 'terrible' is not a key of ``x``.
    """
    word = 'terrible'
    return x.get(word) if word in x else 0


# In[42]:

# New 'terrible' column: terr_count applied to each element of 'word_count'.
products['terrible'] = products['word_count'].apply(terr_count)


# In[43]:

def awful_count(x):
    """Return the value stored under 'awful' in the word-count mapping ``x``.

    ``x`` is one element of the 'word_count' column (word -> count).
    Returns 0 when 'awful' is not a key of ``x``.
    """
    word = 'awful'
    return x.get(word) if word in x else 0


# In[44]:

# New 'awful' column: awful_count applied to each element of 'word_count'.
products['awful'] = products['word_count'].apply(awful_count)


# In[45]:

def wow_count(x):
    """Return the value stored under 'wow' in the word-count mapping ``x``.

    ``x`` is one element of the 'word_count' column (word -> count).
    Returns 0 when 'wow' is not a key of ``x``.
    """
    word = 'wow'
    return x.get(word) if word in x else 0


# In[46]:

# New 'wow' column: wow_count applied to each element of 'word_count'.
products['wow'] = products['word_count'].apply(wow_count)


# In[47]:

def hate_count(x):
    """Return the value stored under 'hate' in the word-count mapping ``x``.

    ``x`` is one element of the 'word_count' column (word -> count).
    Returns 0 when 'hate' is not a key of ``x``.
    """
    word = 'hate'
    return x.get(word) if word in x else 0


# In[48]:

# New 'hate' column: hate_count applied to each element of 'word_count'.
products['hate'] = products['word_count'].apply(hate_count)


# In[49]:

# Print the total of each selected-word column, one per line and in the order
# listed, to see which of the selected words is used most and which least.
for column_name in ['awesome', 'great', 'fantastic', 'amazing', 'love',
                    'horrible', 'bad', 'terrible', 'awful', 'wow', 'hate']:
    print (sum(products[column_name]))


# ## Create a new sentiment analysis model using only the selected_words as features

# In[50]:

# These names match the per-word count columns added to `products` by the
# .apply() cells earlier in the notebook.
selected_words = ['awesome', 'great', 'fantastic', 'amazing', 'love', 'horrible', 'bad', 'terrible', 'awful', 'wow', 'hate']


# In[51]:

# Split again (same fraction and seed as the first split) so that train_data
# and test_data include the per-word count columns, which were added to
# `products` after the first split was made.
train_data,test_data = products.random_split(.8, seed=0)


# In[52]:

# Same classifier set-up as sentiment_model, but restricted to the
# selected_words columns as features.
selected_words_model = graphlab.logistic_classifier.create(train_data,
                                                     target='sentiment',
                                                     features=selected_words,
                                                     validation_set=test_data)


# In[53]:

selected_words_model.evaluate(test_data, metric='roc_curve')


# In[54]:

# Show both evaluations so the two models can be compared.
selected_words_model.show(view='Evaluation')
sentiment_model.show(view='Evaluation')


# In[55]:

# SFrame.sort returns a new, sorted SFrame instead of sorting in place, so the
# result has to be kept; printing selected_words_model['coefficients'] directly
# would show the rows in their original, unsorted order.
coefficients_by_value = selected_words_model['coefficients'].sort('value', ascending=True)
# Ascending by 'value': the most negative coefficient is printed first.
coefficients_by_value.print_rows(num_rows=12, num_columns=5)


# In[56]:

# Reviews of the 'Baby Trend Diaper Champ'. Taken from `products` after the
# 3-star filter and after the per-word count columns were added, so both
# models can score these rows.
diaper_champ_reviews = products[products['name'] == 'Baby Trend Diaper Champ']


# In[57]:

len(diaper_champ_reviews)


# In[67]:

# NOTE(review): the execution counts from here on (67, 68, 69, 76, 70, 73, 74,
# 75) are not sequential -- re-run the notebook top to bottom to confirm the
# results do not depend on out-of-order execution.
# Scored with sentiment_model (the model trained on the full word_count column).
diaper_champ_reviews['predicted_sentiment'] = sentiment_model.predict(diaper_champ_reviews, output_type='probability')


# In[68]:

diaper_champ_reviews['rating'].show(view='Categorical')


# In[69]:

# Descending order, so row 0 has the highest predicted_sentiment.
diaper_champ_reviews = diaper_champ_reviews.sort('predicted_sentiment', ascending = False)


# In[76]:

diaper_champ_reviews.head()


# In[70]:

diaper_champ_reviews[0]['review']


# In[73]:

# The same top-ranked review, split on whitespace into a list of tokens.
print (diaper_champ_reviews[0]['review'].split())


# In[74]:

# Score only the top-ranked row (slice 0:1) with selected_words_model, for
# comparison with the sentiment_model probability stored in
# 'predicted_sentiment'.
selected_words_model.predict(diaper_champ_reviews[0:1], output_type='probability')


# In[75]:

diaper_champ_reviews['predicted_sentiment'].show()


# In[ ]:





IOError: Cannot open C:/Users/user/Documents/Python Scripts/amazon_baby.gl/objects.bin for read. Cannot open C:/Users/user/Documents/Python Scripts/amazon_baby.gl/objects.bin for reading