In [29]:
# Getting Started

# import what we need
import pandas as pd
from pandas import DataFrame as DF, Series

import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline

from textblob import TextBlob

In [30]:
# read data
cols = ['airline_sentiment','airline_sentiment_confidence',
        'airline','name','text']
data = pd.read_csv('tweets.csv', usecols=cols)

# Below is the first 5 rows of our data. We will only be using the first two features, and the last feature.

In [3]:
data.head()

Unnamed: 0,airline_sentiment,airline_sentiment_confidence,airline,name,text
0,neutral,1.0,Virgin America,cairdin,@VirginAmerica What @dhepburn said.
1,positive,0.3486,Virgin America,jnardino,@VirginAmerica plus you've added commercials t...
2,neutral,0.6837,Virgin America,yvonnalynn,@VirginAmerica I didn't today... Must mean I n...
3,negative,1.0,Virgin America,jnardino,@VirginAmerica it's really aggressive to blast...
4,negative,1.0,Virgin America,jnardino,@VirginAmerica and it's a really big bad thing...


In [31]:
# Polarity & Subjectivity Using TextBlob sentiment
# Basic Sentiment Analysis
# Using the TextBlob sentiment method
# TextBlob has a sentiment method that can be used on any TextBlob object. It returns two values:

# polarity: value in range [-1, 1], indicating how negative or positive the text is (close to 0.0 is neutral).
# subjectivity: value in range [0, 1], indicating how subjective the text is (1 is very subjective)
# This method is very basic, and there is a lot to be desired, but it can still be helpful if you don't have opportunity to train a classifier, and just need some rough results.

lines = ["The food is on the table", "The food is green", "I don't like the food",
         "I do not like the food", "I like the food", "I don't love the food", "I do not love the food",
         "I hate the food", "I love the food", "The food is delicious"]

# analyze the sentences
sentiments = [b.sentiment for b in [TextBlob(l) for l in lines]]
for l,s in zip(lines, sentiments):
    print('{} \n(p={}, s={})'.format(l, s[0], s[1]), '\n')
    
# As seen above, this method doesn't recognize negative contractions (e.g. don't), and it has trouble with ambiguous works that can take on multiple meanings (e.g. like, which is also used for comparision)

# Let's see how it does with a couple book reviews.

The food is on the table 
(p=0.0, s=0.0) 

The food is green 
(p=-0.2, s=0.3) 

I don't like the food 
(p=0.0, s=0.0) 

I do not like the food 
(p=0.0, s=0.0) 

I like the food 
(p=0.0, s=0.0) 

I don't love the food 
(p=0.5, s=0.6) 

I do not love the food 
(p=-0.25, s=0.6) 

I hate the food 
(p=-0.8, s=0.9) 

I love the food 
(p=0.5, s=0.6) 

The food is delicious 
(p=1.0, s=1.0) 



In [32]:
# Using The sentiment Method on Tweets
# We will get a subset of our data that contains only the first 10 rows that have a confidence level greater that 0.6. This is because we are uninterested in entries with a high level of uncertainty, because keeping low-confidence observations would reduce the certainty of evaluations that we will make later.

subset = data[data.airline_sentiment_confidence > 0.6]\
    .head(10).copy().reset_index(drop=True)
tweets = subset.text

In [6]:
subset

Unnamed: 0,airline_sentiment,airline_sentiment_confidence,airline,name,text
0,neutral,1.0,Virgin America,cairdin,@VirginAmerica What @dhepburn said.
1,neutral,0.6837,Virgin America,yvonnalynn,@VirginAmerica I didn't today... Must mean I n...
2,negative,1.0,Virgin America,jnardino,@VirginAmerica it's really aggressive to blast...
3,negative,1.0,Virgin America,jnardino,@VirginAmerica and it's a really big bad thing...
4,negative,1.0,Virgin America,jnardino,@VirginAmerica seriously would pay $30 a fligh...
5,positive,0.6745,Virgin America,cjmcginnis,"@VirginAmerica yes, nearly every time I fly VX..."
6,neutral,0.634,Virgin America,pilot,@VirginAmerica Really missed a prime opportuni...
7,positive,0.6559,Virgin America,dhepburn,"@virginamerica Well, I didn't…but NOW I DO! :-D"
8,positive,1.0,Virgin America,YupitsTate,"@VirginAmerica it was amazing, and arrived an ..."
9,neutral,0.6769,Virgin America,idk_but_youtube,@VirginAmerica did you know that suicide is th...


In [33]:
# Compare the sentiment predictions with each line in subset
# We want to get a sense of how each tweet is being classified

# print the tweets and predicted polarity line-by-line
for i,t in enumerate(tweets):
    s = TextBlob(t).sentiment
    target = subset.airline_sentiment[i]
    print(t, '\n', '{} (target: {}) \n'.format(s[0], target))
    
# This basic sentiment analyzer missed the mark on 3/10 tweets (2 neutral and 1 negative). That's not too bad, but these results are nothing to celebrate. The perfmance declines quite a bit with larger texts.

# Looking at the two tweets the sentiment method estimated incorrectly:

# @VirginAmerica I didn't today... Must mean I need to take another trip! This one is interpreted by the computer as negative, and perhaps it's correct. This one is full of ambiguity without any context, and that is probably why the target value in the set is neutral.

# @VirginAmerica it's really aggressive to blast obnoxious "entertainment" in your guests' faces & they have little recourse This one is

@VirginAmerica What @dhepburn said. 
 0.0 (target: neutral) 

@VirginAmerica I didn't today... Must mean I need to take another trip! 
 -0.390625 (target: neutral) 

@VirginAmerica it's really aggressive to blast obnoxious "entertainment" in your guests' faces &amp; they have little recourse 
 0.0062500000000000056 (target: negative) 

@VirginAmerica and it's a really big bad thing about it 
 -0.3499999999999999 (target: negative) 

@VirginAmerica seriously would pay $30 a flight for seats that didn't have this playing.
it's really the only bad thing about flying VA 
 -0.2083333333333333 (target: negative) 

@VirginAmerica yes, nearly every time I fly VX this “ear worm” won’t go away :) 
 0.4666666666666666 (target: positive) 

@VirginAmerica Really missed a prime opportunity for Men Without Hats parody, there. https://t.co/mWpG7grEZP 
 0.2 (target: neutral) 

@virginamerica Well, I didn't…but NOW I DO! :-D 
 1.0 (target: positive) 

@VirginAmerica it was amazing, and arrived an hour e

In [34]:
# Analyze polarity of each word in the last sentence above to see what's happening

words = TextBlob(tweets[2]).words
for w in words: print(w, TextBlob(w).sentiment[0], '\n')
    
# We can see that the sentiment method does not consider the words "obnoxious" or "aggressive" to be negative, which is a glaring problem for our analysis. This method is clearly limited and we need a better method.

VirginAmerica 0.0 

it 0.0 

's 0.0 

really 0.2 

aggressive 0.0 

to 0.0 

blast 0.0 

obnoxious 0.0 

entertainment 0.0 

in 0.0 

your 0.0 

guests 0.0 

faces 0.0 

amp 0.0 

they 0.0 

have 0.0 

little -0.1875 

recourse 0.0 



In [35]:
# Create train and test sets
# train the model on the first set
# test/evaluate it on the other
# The set below named reduced is reduced in dimensionality (keeping only the features/columns we care about).

# The train and test sets are created using something called a list comprehension. If you don't know what that is, it's okay, and you can look it up later. What is important is to know that the Naive Bayes classifier takes data in the form of a list of doubles, where each double is one observation (text, label), where label is the class label that belongs to the text.


# get reduced set
reduced = data.ix[:, ['airline_sentiment','text']].copy()
reduced.rename(columns={'airline_sentiment': 'target'}, inplace=1)

# now create train and test sets for first 500 tweets
# for the TextBlob classifier we need a list of doubles (string, target)
train = [(s, t) for s,t in zip(reduced.iloc[:350].text, reduced.iloc[:350].target)]
test = [(s, t) for s,t in zip(reduced.iloc[350:500].text, reduced.iloc[350:500].target)]

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
  # Remove the CWD from sys.path while we load stuff.


In [36]:
# import the classifier
from textblob.classifiers import NaiveBayesClassifier

# train
cl = NaiveBayesClassifier(train)
# evaluate
cl.accuracy(test)

0.6266666666666667

In [37]:
# a quick look at the distribution of class labels
reduced.target.value_counts()

negative    9178
neutral     3099
positive    2363
Name: target, dtype: int64

In [38]:
# create a score function that will give precision and recall values for each class
def score(true, predicted):
    eq = np.equal
    
    t = np.array(true)
    p = np.array(predicted)
    
    tp = np.array([eq((t == c)*(p == c), 1).sum() for c in np.unique(t)])
    fp = np.array([eq((t != c)*(p == c), 1).sum() for c in np.unique(t)])
    fn = np.array([eq((t == c)*(p != c), 1).sum() for c in np.unique(t)])

    precision = tp/(tp + fp)
    recall = tp/(tp + fn)
    
    return (np.unique(t), precision, recall)

In [39]:
# Evaluate classifier on larger set
# * skip this; takes too long *

# With train/test split

# create new train and test sets
# for the TextBlob classifier we need a list of doubles (string, target)

train = [(s, t) for s,t in zip(reduced.iloc[:1500].text, reduced.iloc[:1500].target)]
test = [(s, t) for s,t in zip(reduced.iloc[1500:2000].text, reduced.iloc[1500:2000].target)]

In [40]:
# train
cl = NaiveBayesClassifier(train)

# evaluate
cl.accuracy(test)
# 0.786

0.788

In [41]:
# Naive Bayes Classifier: Digging Deeper
# Making Predictions
# NaiveBayesClassifier has a classify method that takes text (a single string) as an argument. This means that we can either classify some string that we choose to type by hand, or classify tweets from our test set individually.

cl.classify('I love this airline')

'positive'

In [42]:
# Getting class probabilities

probs = cl.prob_classify('I love this airline')
probs.max()

'positive'

In [43]:
probs.prob('positive')

0.62832147146215

In [44]:
probs.prob('negative')

# The above can be useful if you want to make modifications to how something is classified by setting a threshold. For example, you may want to only classify something as positive if the probability exceeds 0.9, instead of it simply having the highest probability.

0.06092324050268474

In [45]:
# Informative Features
# The method below gives us some insight into how the classifier is making decisions. For example, we can see that if a string contains the word "great", the there is are 8.7:1 odds that the string is positive instead of negative. All of the features are taken into account for one string, so that doesn't mean just because "great" is in the string it will be classified as positive.

cl.show_informative_features(10)

# How to interpret this:

# We are given rows that have contains(feature) = True/False and a comparison of two class labels with a ratio that indicates how likely one is over the other
# The printed results are in descending order of importance
# Ex: contains(no) = True gives the ratio of 9.7 : 1.0, showing that it is extremely likely to be negative rather than neutral
# The default features for the Naive Bayes classifier are individual words found in the data

Most Informative Features
         contains(thank) = True           positi : negati =     25.3 : 1.0
            contains(no) = True           negati : positi =     21.0 : 1.0
       contains(amazing) = True           positi : negati =     18.4 : 1.0
          contains(love) = True           positi : negati =     16.2 : 1.0
          contains(been) = True           negati : neutra =     14.0 : 1.0
       contains(delayed) = True           negati : neutra =     13.3 : 1.0
           contains(She) = True           positi : negati =     11.9 : 1.0
         contains(Thank) = True           positi : negati =     10.3 : 1.0
        contains(please) = True           neutra : positi =      9.8 : 1.0
          contains(best) = True           positi : negati =      9.7 : 1.0


In [46]:
# Extracting Features
# We are provided a method that serves one purpose: take a string and return a dictionary of all features in our classifier (individual words by default), and whether or not that word is in the string. It is essentially a binary feature vector.

cl.extract_features('I have no idea where this flight is taking me')

{'contains(reinstated)': False,
 'contains(face)': False,
 'contains(mobile)': False,
 'contains(illegal)': False,
 'contains(TRUU_Tall)': False,
 'contains(understaffed)': False,
 'contains(nite)': False,
 'contains(Just)': False,
 'contains(copy/paste)': False,
 'contains(impressed)': False,
 'contains(airport_fra)': False,
 'contains(afford)': False,
 'contains(avoided)': False,
 'contains(disability)': False,
 'contains(Boston)': False,
 'contains(Thankyou)': False,
 'contains(Hey)': False,
 'contains(saves)': False,
 'contains(location)': False,
 'contains(Really)': False,
 'contains(Can)': False,
 'contains(hard-earned)': False,
 'contains(representatives)': False,
 'contains(so)': False,
 'contains(lostluggage)': False,
 'contains(any)': False,
 'contains(inept)': False,
 'contains(LAS-DEN)': False,
 'contains(mech)': False,
 'contains(inadequate)': False,
 'contains(POS)': False,
 'contains(why)': False,
 'contains(Rozana)': False,
 'contains(Requested)': False,
 'contains(refe

In [47]:
# Classifying From Within a TextBlob
# We can perform classification on the contents of a TextBlob object using an existing classifier (like the one we created earlier (named cl). The usefulness of this might seem questionable, since you can just pass a normal string to the classifier. However, something you will be doing other work with some text in the form of a blob, and then when you need to perform classification, you don't have to go back and get the raw string.

# Using a clssifier in a TextBlob is as easy as passing the classifier as an argument when you create the blob.

# Note: The classifier must be one that you have already trained.

# Let's look at a couple examples:

b = TextBlob('I loved the flight', classifier=cl)
b.classify()

'negative'

In [48]:
b = TextBlob('I hated the flight', classifier=cl)
b.classify()

# Our classifier probably didn't encounter the word "hate" or "hated". We can update our model to improve classification.

'neutral'

In [49]:
# Update Existing Classifiers With New Data
# Our classifier obviously failed us when we tried to classify the string "I hate this flight." We have the option of easily updating our classifier with new data, so let's do that now.

# new data is also a list of tuples
# be sure the class labels are correct
updates = [('I hated flying', 'negative'), ('I hate flying', 'negative'),
           ('I hate this airline', 'negative'), ('I hated the seats', 'negative')]
cl.update(updates)  # this is unfortunately slow

# You can ignore the output True
# Note: If you get the error too many values to unpack (expected 2), try re-running the cell where we created the train/test sets and create/train the classifier from scratch.

True

In [50]:
# Now that we have updated our classifier with new data, let's see how our original sentence is classified.
# let's see how it does now using 'I hated the flight'
b = TextBlob('I hated the flight', classifier=cl) # update
b.classify()

# An now we have the correct classification of 'negative'
# If you do not get the correct class, try running the update cell once more.

# Other Classifiers
# TextBlob has a number of built in classifiers, all of which can be found in the documentation at the link below.

# http://textblob.readthedocs.io/en/dev/api_reference.html#api-classifiers

'negative'