# Sentiment Analysis of Tweets

** Thank you to Lukas Biewald for the Machine Learning class. **

The original tutorial is here:
    - https://s3.amazonaws.com/ai-learn-l2k/ML_Course.pdf
    - https://github.com/lukas/ml-class

## Feature Extraction

In [1]:
import pandas as pd
import numpy as np

df = pd.read_csv('tweets.csv')
target = df['is_there_an_emotion_directed_at_a_brand_or_product']
text = df['tweet_text']

fixed_text = text[pd.notnull(text)]
fixed_target = target[pd.notnull(text)]

from sklearn.feature_extraction.text import CountVectorizer

count_vect=CountVectorizer()
count_vect.fit(fixed_text)

print(count_vect.vocabulary_.get(u'iphone'))

4573


In [2]:
# turns the text into a sparse matrix
counts = count_vect.transform(fixed_text)
print(fixed_text[0:2])
print(counts[0:2])

0    .@wesley83 I have a 3G iPhone. After 3 hrs twe...
1    @jessedee Know about @fludapp ? Awesome iPad/i...
Name: tweet_text, dtype: object
  (0, 168)	1
  (0, 430)	1
  (0, 774)	2
  (0, 2291)	1
  (0, 3981)	1
  (0, 4210)	1
  (0, 4573)	1
  (0, 4610)	1
  (0, 5766)	1
  (0, 6478)	1
  (0, 7232)	1
  (0, 8076)	1
  (0, 8323)	1
  (0, 8702)	1
  (0, 8920)	1
  (0, 9062)	1
  (0, 9303)	1
  (0, 9373)	1
  (1, 313)	1
  (1, 527)	1
  (1, 644)	1
  (1, 677)	1
  (1, 774)	1
  (1, 876)	1
  (1, 2386)	1
  (1, 3356)	1
  (1, 3401)	1
  (1, 3454)	1
  (1, 3685)	1
  (1, 4560)	1
  (1, 4573)	1
  (1, 4619)	1
  (1, 4678)	1
  (1, 4847)	1
  (1, 5042)	1
  (1, 5094)	1
  (1, 6913)	1
  (1, 8323)	1
  (1, 8560)	1
  (1, 8602)	1
  (1, 8870)	1
  (1, 9625)	1


In [3]:
# some other fun things to try
print(fixed_text[0])
print(count_vect.transform(["cerulean"]))
print(fixed_text[0:2])

.@wesley83 I have a 3G iPhone. After 3 hrs tweeting at #RISE_Austin, it was dead!  I need to upgrade. Plugin stations at #SXSW.

0    .@wesley83 I have a 3G iPhone. After 3 hrs twe...
1    @jessedee Know about @fludapp ? Awesome iPad/i...
Name: tweet_text, dtype: object


## Feature Selection

In [4]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

p = Pipeline(steps=[('counts', CountVectorizer(ngram_range=(1, 2))),
                ('feature_selection', SelectKBest(chi2, k=10000)),
                ('multinomialnb', MultinomialNB())])

p.fit(fixed_text, fixed_target)

from sklearn.model_selection import cross_val_score

scores = cross_val_score(p, fixed_text, fixed_target, cv=10)
print(scores)
print(scores.mean())

[ 0.67032967  0.66813187  0.62087912  0.64285714  0.64945055  0.67912088
  0.67876788  0.6809681   0.66041896  0.63947078]
0.659039495078


## Classification

In [5]:
# Train with this data with a Naive Bayes classifier:
from sklearn.naive_bayes import MultinomialNB

nb = MultinomialNB()
nb.fit(counts, fixed_target)

#Try the classifier
print(nb.predict(count_vect.transform(['i love my iphone'])))

['Positive emotion']


In [6]:
# See what the classifier predicts for some new tweets:
for tweet in ('I love my iphone!!!', 'iphone costs too much!!!', 'the iphone is not good', 'I like turtles', 'I have a love/hate relationship with my iphone'):
  print('Tweet: ' + tweet)
  print('Prediction: ' + str(nb.predict(count_vect.transform([tweet]))))
  print('\n')

Tweet: I love my iphone!!!
Prediction: ['Positive emotion']


Tweet: iphone costs too much!!!
Prediction: ['Negative emotion']


Tweet: the iphone is not good
Prediction: ['Positive emotion']


Tweet: I like turtles
Prediction: ['No emotion toward brand or product']


Tweet: I have a love/hate relationship with my iphone
Prediction: ['Negative emotion']




## Test Algorithm

In [7]:
# See what the classifier predicts for some new tweets:
predictions = nb.predict(counts)
correct_predictions = sum(predictions == fixed_target)
incorrect_predictions = 9092 - correct_predictions  # (there are 9,092 tweets in the csv)

print('# of correct predictions: ' + str(correct_predictions))
print('# of incorrect predictions: ' + str(incorrect_predictions))
print('Percent correct: ' + str(100.0 * correct_predictions / (correct_predictions + incorrect_predictions)))

# of correct predictions: 7229
# of incorrect predictions: 1863
Percent correct: 79.5094588649


In [8]:
nb.fit(counts[0:6000], target[0:6000])

# See what the classifier predicts for some new tweets:
# (Tweets 6000 to 9091 are used for testing)
predictions = nb.predict(counts[6000:9092])
correct_predictions = sum(predictions == fixed_target[6000:9092])
incorrect_predictions = (9092 - 6000) - correct_predictions
print('# of correct predictions: ' + str(correct_predictions))
print('# of incorrect predictions: ' + str(incorrect_predictions))
print('Percent correct: ' + str(100.0 * correct_predictions / (correct_predictions + incorrect_predictions)))

# of correct predictions: 1797
# of incorrect predictions: 1295
Percent correct: 58.1177231565


In [9]:
from sklearn.metrics import confusion_matrix
## We're ignoring "I can't tell" here for simplicity
label_list = ['Positive emotion', 'No emotion toward brand or product', 'Negative emotion'] 
cm = confusion_matrix(target[6000:9092], predictions, labels=label_list)
print("Labels in data:")
print(label_list)
print("Rows: actual labels, Columns: Predicted labels")
print(cm)

Labels in data:
['Positive emotion', 'No emotion toward brand or product', 'Negative emotion']
Rows: actual labels, Columns: Predicted labels
[[ 229  743   15]
 [ 407 1474    9]
 [  43  119    0]]


#### Use DummyClassifer as a simple baseline to compare with Naive Bayes

In [10]:
from sklearn.dummy import DummyClassifier
nb = DummyClassifier(strategy='most_frequent')
nb.fit(counts, fixed_target)
print(nb.predict(count_vect.transform(['i love my iphone'])))

['No emotion toward brand or product']


In [11]:
predictions = nb.predict(counts)

correct_predictions = sum(predictions == fixed_target)
incorrect_predictions = 9092 - correct_predictions  # (there are 9,092 tweets in the csv)
print('# of correct predictions: ' + str(correct_predictions))
print('# of incorrect predictions: ' + str(incorrect_predictions))
print('Percent correct: ' + str(100.0 * correct_predictions / (correct_predictions + incorrect_predictions)))

# of correct predictions: 5388
# of incorrect predictions: 3704
Percent correct: 59.2608886934


In [12]:
nb.fit(counts[0:6000], fixed_target[0:6000])

# See what the classifier predicts for some new tweets:
# (Tweets 6000 to 9091 are used for testing)
predictions = nb.predict(counts[6000:9092])
correct_predictions = sum(predictions == fixed_target[6000:9092])
incorrect_predictions = (9092 - 6000) - correct_predictions
print('# of correct predictions: ' + str(correct_predictions))
print('# of incorrect predictions: ' + str(incorrect_predictions))
print('Percent correct: ' + str(100.0 * correct_predictions / (correct_predictions + incorrect_predictions)))

# of correct predictions: 1890
# of incorrect predictions: 1202
Percent correct: 61.1254851229


In [13]:
from sklearn.metrics import confusion_matrix
## We're ignoring "I can't tell" here for simplicity
label_list = ['Positive emotion', 'No emotion toward brand or product', 'Negative emotion'] 
cm = confusion_matrix(fixed_target[6000:9092], predictions, labels=label_list)
print("Labels in data:")
print(label_list)
print("Rows: actual labels, Columns: Predicted labels")
print(cm)

Labels in data:
['Positive emotion', 'No emotion toward brand or product', 'Negative emotion']
Rows: actual labels, Columns: Predicted labels
[[   0  988    0]
 [   0 1890    0]
 [   0  162    0]]


#### Use Perceptron linear mode as comparison

In [17]:
from sklearn.linear_model import Perceptron

perceptron = Perceptron()

from sklearn.model_selection import cross_val_score

scores = cross_val_score(perceptron, counts, fixed_target, cv=10)
print(scores)
print(scores.mean())

[ 0.63516484  0.65274725  0.64175824  0.63626374  0.62087912  0.61428571
  0.62706271  0.58305831  0.53031974  0.45644983]
0.599798948321


In [20]:
from sklearn import datasets

digits = datasets.load_digits()
scores = cross_val_score(perceptron, digits.data, digits.target, cv=10)
print(scores)
print(scores.mean())

[ 0.8972973   0.90163934  0.83977901  0.92222222  0.91620112  0.92178771
  0.93296089  0.92696629  0.85310734  0.88636364]
0.899832486311
