In [None]:
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
from textblob import TextBlob
import sklearn
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix

# Seed NumPy's global RNG so the row shuffle later in the notebook is repeatable.
np.random.seed(47)

# Load only the tweet text and its label, under the short column names that
# every later cell uses.
reviews = pd.read_csv('socialmedia-disaster-tweets-DFE.csv')[['text', 'choose_one']]
reviews.columns = ['tweet', 'sentiment']

# Keep the rows labelled 'Relevant' or 'Not Relevant' and renumber them 0..n-1.
reviews = reviews[reviews['sentiment'].isin(['Relevant', 'Not Relevant'])].reset_index(drop=True)
reviews.tail()

In [None]:
# Text of the fourth row (first column, 'tweet') before any preprocessing.
# A single positional lookup avoids chaining [0] onto a row Series whose
# index holds column names, where an integer key is not a label.
reviews.iloc[3, 0]

In [None]:

import re
def preprocessor(text):
    """Normalise a raw tweet into lowercase words followed by its emoticons.

    Steps, in order:
      1. remove anything that looks like a markup tag (``<...>``);
      2. collect emoticons such as ``:)``, ``;-(``, ``=D`` or ``:P`` from the
         tag-free text (done before lowercasing, so only upper-case D/P match);
      3. lowercase the text and collapse every run of non-word characters
         into a single space;
      4. append the emoticons, space separated and with any ``-`` removed.

    Parameters
    ----------
    text : str
        The raw tweet.

    Returns
    -------
    str
        The cleaned text. Leading/trailing spaces produced by step 3 are kept.
    """
    # Raw string literals keep the regex escapes (\W, \), \() out of Python's
    # own string-escape processing.
    text = re.sub(r'<[^>]*>', '', text)
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    cleaned = re.sub(r'[\W]+', ' ', text.lower())
    suffix = ' '.join(emoticons).replace('-', '')
    # When the tweet ends in a word character there is no trailing space to
    # separate the last word from the first emoticon, so add one explicitly.
    if suffix and not cleaned.endswith(' '):
        cleaned += ' '
    return cleaned + suffix

In [None]:

# Try the cleaner on a string containing a tag, punctuation and three emoticons.
preprocessor("</a>This :) is :( a test :-)!")

In [None]:

# Overwrite each raw tweet with its cleaned form.
reviews['tweet'] = reviews['tweet'].map(preprocessor)

In [None]:
# Text of the fourth row (first column, 'tweet'), read with one positional
# lookup rather than by chaining [0] onto a row Series whose index holds
# column names.
reviews.iloc[3, 0]

In [None]:
import numpy as np

# Shuffle the rows with the seeded NumPy RNG. Each row keeps its original
# index label; only the order changes.
reviews = reviews.reindex(np.random.permutation(reviews.index))

# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(reviews.head())
print(reviews.tail())

In [None]:

# Summary statistics of the frame, computed separately for each label value.
reviews.groupby('sentiment').describe()

In [None]:

# Character count of each tweet, stored as a new 'length' column.
reviews['length'] = reviews['tweet'].map(len)
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(reviews.head())

In [None]:
# Histogram of tweet lengths over 20 bins.
reviews['length'].plot(kind='hist', bins=20)

In [None]:

# Count, mean, spread and quantiles of the tweet lengths.
reviews['length'].describe()

In [None]:

# Row labels, then texts, of every tweet shorter than 60 characters.
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(list(reviews.tweet[reviews.length < 60].index))
print(list(reviews.tweet[reviews.length < 60]))

In [None]:
%%time
# One histogram of tweet length per label value, 50 bins each.
reviews.hist(column='length', by='sentiment', bins=50)

In [None]:
def split_into_tokens(tweet):
    """Split a tweet into its words using TextBlob.

    Parameters
    ----------
    tweet : bytes or text
        UTF-8 encoded bytes are decoded first; text that is already decoded
        is used as-is.

    Returns
    -------
    The ``words`` attribute of ``TextBlob(tweet)``.
    """
    # Decode only when handed raw bytes. The Python 2-only call
    # unicode(tweet, 'utf8') raises TypeError on already-decoded text and the
    # name does not exist on Python 3; this check works on both.
    if isinstance(tweet, bytes):
        tweet = tweet.decode('utf8')
    return TextBlob(tweet).words

In [None]:

# First five tweets in the current row order.
reviews['tweet'].head()

In [None]:

# Tokenise the first five tweets to eyeball the word splitting.
reviews['tweet'].head().apply(split_into_tokens)

In [None]:

# Part-of-speech tagging demo on a fixed sentence.
TextBlob("hello world, how is it going?").tags  # list of (word, POS) pairs

In [None]:

import nltk
# Fetch NLTK's 'stopwords' resource.
nltk.download('stopwords')

In [None]:

from nltk.corpus import stopwords

# NLTK's English stop words plus every single lowercase ASCII letter, so that
# stray one-character tokens are filtered out too. The letters are generated
# from the full alphabet rather than typed one by one so none of them
# (for example 'u') can be left out by accident.
stop = stopwords.words('english')
stop = stop + list(u'abcdefghijklmnopqrstuvwxyz')

In [None]:
def split_into_lemmas(tweet):
    """Return the lemmas of a tweet's words, skipping stop words.

    Parameters
    ----------
    tweet : bytes or text
        UTF-8 encoded bytes are decoded first; text that is already decoded
        is used as-is. The text is lowercased before tokenising.

    Returns
    -------
    list
        ``word.lemma`` for every TextBlob word that is not in the
        notebook-level ``stop`` list.
    """
    # Decode only when handed raw bytes. The Python 2-only call
    # unicode(tweet, 'utf8') raises TypeError on already-decoded text and the
    # name does not exist on Python 3; this check works on both.
    if isinstance(tweet, bytes):
        tweet = tweet.decode('utf8')
    words = TextBlob(tweet.lower()).words
    # For each remaining word, take its "base form" = lemma.
    return [word.lemma for word in words if word not in stop]

# Lemmatise the first five tweets to eyeball the result.
reviews['tweet'].head().apply(split_into_lemmas)

In [None]:

%%time
bow_transformer = CountVectorizer(analyzer=split_into_lemmas).fit(reviews['tweet'])
print len(bow_transformer.vocabulary_)

In [None]:

# Worked example: the tweet whose row LABEL is 7000. The rows were shuffled
# earlier, so this is a label lookup, not the 7001st row.
review4 = reviews['tweet'].loc[7000]
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(review4)

In [None]:

# Bag-of-words vector of the example tweet (a one-row sparse matrix).
bow4 = bow_transformer.transform([review4])
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(bow4)

In [None]:

%%time
reviews_bow = bow_transformer.transform(reviews['tweet'])
print 'sparse matrix shape:', reviews_bow.shape
print 'number of non-zeros:', reviews_bow.nnz
print 'sparsity: %.2f%%' % (100.0 * reviews_bow.nnz / (reviews_bow.shape[0] * reviews_bow.shape[1]))

In [None]:

# Positional split: the first n_train rows train the model, the rest test it.
# reviews_bow was built from reviews['tweet'] in its current order, so the
# matrix and the label Series are both cut by position (.iloc), never by
# row label.
n_train = 8000
reviews_bow_train = reviews_bow[:n_train]
reviews_bow_test = reviews_bow[n_train:]
reviews_sentiment_train = reviews['sentiment'].iloc[:n_train]
reviews_sentiment_test = reviews['sentiment'].iloc[n_train:]

# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(reviews_bow_train.shape)
print(reviews_bow_test.shape)

In [None]:

# Fit a multinomial Naive Bayes classifier on the training rows and time the fit.
%time review_sentiment = MultinomialNB().fit(reviews_bow_train, reviews_sentiment_train)

In [None]:

# Compare the model's label for the example tweet with its recorded label.
# One pre-formatted string per line keeps the output identical on Python 2
# and Python 3.
print('predicted: %s' % review_sentiment.predict(bow4)[0])
print('expected: %s' % reviews.sentiment[7000])

In [None]:

# Predicted label for every row of the test split.
predictions = review_sentiment.predict(reviews_bow_test)
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(predictions)

In [None]:

# Accuracy and confusion matrix on the test split. Each line is one
# pre-formatted string so the output is identical on Python 2 and Python 3.
print('accuracy %s' % accuracy_score(reviews_sentiment_test, predictions))
# The matrix starts right after the newline: Python 2's print statement
# inserts no separating space after an item that ends in '\n'.
print('confusion matrix\n%s' % confusion_matrix(reviews_sentiment_test, predictions))
print('(row=expected, col=predicted)')

In [None]:
# Per-class precision / recall / F1 on the test split.
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(classification_report(reviews_sentiment_test, predictions))

In [None]:
def predict_review(new_review):
    """Print a text together with the classifier's class probabilities for it.

    The text is vectorised with the notebook-level ``bow_transformer`` and
    scored with the notebook-level ``review_sentiment`` model; probabilities
    are rounded to five decimals. Nothing is returned.

    Parameters
    ----------
    new_review : str
        The text to score.
    """
    new_sample = bow_transformer.transform([new_review])
    probabilities = np.around(review_sentiment.predict_proba(new_sample), decimals=5)
    # One pre-formatted argument: text, probabilities, then a blank line.
    # The output is identical on Python 2 and Python 3.
    print('%s %s \n' % (new_review, probabilities))

# Score a few hand-written, period-separated word lists.
predict_review('Car. disaster. Major. damage. to. property.')
predict_review('storm. ugly. . bad. Best! horrible. terrible. loss. cannot. Powerful. swift. vacate. Incredible. isolated.')
predict_review('Okay. Great.')

In [None]:
# Score a few full sentences.
predict_review(' Cat stuck in a tree.')
predict_review('Car accident. Major damage to property.')
predict_review('I ate a sandwich last night.')
# Double-quoted so the apostrophe in "Oz's" survives: inside single quotes,
# 'Oz''s' is two adjacent literals that Python concatenates into "Ozs".
predict_review("Somehow, Mr. Dreyfuss maintains his sound comic timing even when Frank Oz's antic direction calls for hand-waving hysteria.")