In [None]:
# Standard library
import heapq
import json
import re
import string
from collections import Counter, OrderedDict

# Numerics and data handling
import numpy as np
import pandas as pd

# Plotting
import matplotlib
# `plt` has to be pyplot: the plotting cells call plt.subplots / plt.imshow / plt.axis.
import matplotlib.pyplot as plt
import plotly.graph_objs as go
import plotly.offline as py
from wordcloud import WordCloud

# scikit-learn
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.manifold import TSNE
from sklearn.naive_bayes import MultinomialNB
try:
    from sklearn.model_selection import train_test_split
except ImportError:  # older scikit-learn releases only ship the legacy module path
    from sklearn.cross_validation import train_test_split

# NLP
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.stem.porter import PorterStemmer
from textblob import TextBlob, Word

# Keras
from keras.layers import Dense, Flatten, LSTM, Conv1D, MaxPooling1D, Dropout, Activation
from keras.layers.embeddings import Embedding
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer

py.init_notebook_mode(connected=True)

In [None]:
# Load the pipe-separated review file; it has no header row, so column names are supplied here.
# Malformed rows are skipped with a warning. `on_bad_lines='warn'` is the current spelling of
# the removed `error_bad_lines=False` option.
df = pd.read_csv('train.csv', sep='|', names=['stars', 'text'], on_bad_lines='warn')

In [None]:
# Keep only the columns of interest. The selector must be a list: df['a', 'b'] looks up ONE
# column keyed by the tuple ('a', 'b') and raises KeyError.
wanted_columns = ['stars', 'text', 'cool', 'useful', 'funny']
# The loading cell names only 'stars' and 'text', so columns that are absent are skipped
# instead of aborting the notebook.
df = df[[column for column in wanted_columns if column in df.columns]]

In [None]:
# Restrict the data to the two extreme ratings (5 stars and 1 star).
is_five_star = df['stars'].eq(5)
is_one_star = df['stars'].eq(1)
df = df.loc[is_five_star | is_one_star]
df

### Sentiment Analysis + TextBlob

In [None]:
# Score one sample review with TextBlob. No earlier cell creates `yelp_review_df`; the loaded
# reviews live in `df`. `.iloc[0]` takes the first remaining row by position, because the row
# labelled 0 may have been removed by the star filter.
review = TextBlob(df['text'].iloc[0])

In [None]:
# Show the polarity component of the sample review's sentiment.
polarity = review.sentiment.polarity
print("Polarity:", polarity)

In [None]:
# Show the subjectivity component of the sample review's sentiment.
subjectivity = review.sentiment.subjectivity
print("Subjectivity:", subjectivity)

##### Calculating TextBlob Score

In [None]:
def _as_text(value):
    """Return `value` as text, decoding UTF-8 bytes and passing str through unchanged."""
    return value.decode('utf-8') if isinstance(value, bytes) else value


# No earlier cell creates df_textblob, so start from a copy of the review frame; the copy keeps
# df's index, which lets the new score column line up row by row.
df_textblob = df.copy()
df_textblob['texblob_sentiment_score'] = df['text'].apply(
    lambda review_text: TextBlob(_as_text(review_text)).sentiment.polarity
)

In [None]:
# Rows rated 1 star, together with their TextBlob score.
df_textblob.loc[df_textblob['stars'].eq(1)]

In [None]:
# check extremely negative review
mins = df_textblob['texblob_sentiment_score'].min()
# `==` compares; the single `=` that was here is an assignment and does not parse inside [...].
df_textblob.loc[df_textblob['texblob_sentiment_score'] == mins, 'text']

In [None]:
# Rows rated 5 stars, together with their TextBlob score.
df_textblob.loc[df_textblob['stars'].eq(5)]

In [None]:
# check extremely positive review
maxs = df_textblob['texblob_sentiment_score'].max()
# `==` compares; the single `=` that was here is an assignment and does not parse inside [...].
df_textblob.loc[df_textblob['texblob_sentiment_score'] == maxs, 'text']

In [None]:
# Display the lowest and highest TextBlob score found above, side by side.
[mins, maxs]

In [None]:
# No earlier cell creates yelp_review_df, while the score column it reads was written to
# df_textblob; bind the name used by the remaining cells to that same frame.
yelp_review_df = df_textblob

# Scores above the threshold are predicted as 5 stars, everything else as 1 star.
POLARITY_THRESHOLD = 0.1
yelp_review_df['textblob_predicted_stars'] = yelp_review_df['texblob_sentiment_score'].apply(
    lambda score: 5 if score > POLARITY_THRESHOLD else 1
)

##### Performance

### Star Prediction by TextBlob Score

In [None]:

def _stars_from_polarity(score):
    """Map a polarity score to a star rating: 5 when it is above 0.1, otherwise 1."""
    if score > 0.1:
        return 5
    return 1


polarity_scores = yelp_review_df['texblob_sentiment_score']
yelp_review_df['textblob_predicted_stars'] = polarity_scores.apply(_stars_from_polarity)

In [None]:

# Share of reviews whose thresholded TextBlob prediction equals the true star rating.
# print() is called as a function, as in the other cells; the statement form does not parse
# on Python 3.
print('Textblob Accuracy', metrics.accuracy_score(yelp_review_df['stars'], yelp_review_df['textblob_predicted_stars']))

In [None]:
##### Example reviews for each star rating

In [None]:
# First three 5-star reviews.
yelp_review_df.loc[yelp_review_df['stars'].eq(5)].head(3)

In [None]:
# First three 1-star reviews.
yelp_review_df.loc[yelp_review_df['stars'].eq(1)].head(3)

# Predicting - Logistic Regression

### Binary Classification

- stars 1 : negative - 0
- stars 5: positive - 1
- stars 2 & 3 & 4: neutral (dropped — only 1-star and 5-star reviews are kept)

### Data Preparing: tokenization

In [None]:
# Stop-word set and stemmer, built on first use and then shared by every call.
_TEXT_PREP_CACHE = {}

# Ordered (pattern, replacement) clean-up rules; they are applied top to bottom.
_CLEANUP_RULES = (
    # Blank out everything outside the allowed set. Inside the class, `+-=` is a character
    # RANGE from '+' to '=', so ':' and ';' (among others) also survive this step.
    (r"[^A-Za-z0-9^,!.\/'+-=]", " "),
    # Expand common contractions.
    (r"what's", "what is "),
    (r"\'s", " "),
    (r"\'ve", " have "),
    (r"n't", " not "),
    (r"i'm", "i am "),
    (r"\'re", " are "),
    (r"\'d", " would "),
    (r"\'ll", " will "),
    # Drop or isolate the remaining punctuation.
    (r",", " "),
    (r"\.", " "),
    (r"!", " ! "),
    (r"\/", " "),
    (r"\^", " ^ "),
    (r"\+", " + "),
    (r"\-", " - "),
    (r"\=", " = "),
    (r"'", " "),
    # "5k" -> "5000"
    (r"(\d+)(k)", r"\g<1>000"),
    (r":", " : "),
    # Re-join abbreviations that the steps above split apart.
    (r" e g ", " eg "),
    (r" b g ", " bg "),
    (r" u s ", " american "),
    # NOTE(review): `\0` is the regex escape for NUL, and the first rule has already replaced
    # NUL with a space, so this rule looks unreachable; "0s" -> "0" was presumably intended.
    (r"\0s", "0"),
    (r" 9 11 ", "911"),
    (r"e - mail", "email"),
    (r"j k", "jk"),
    # Collapse runs of whitespace created by the substitutions.
    (r"\s{2,}", " "),
)


def _text_prep_resources():
    """Return the (stop_words, stemmer) pair, creating both on the first call only."""
    if not _TEXT_PREP_CACHE:
        _TEXT_PREP_CACHE['stops'] = frozenset(stopwords.words("english"))
        _TEXT_PREP_CACHE['stemmer'] = SnowballStemmer('english')
    return _TEXT_PREP_CACHE['stops'], _TEXT_PREP_CACHE['stemmer']


def text_preparation(text):
    """Normalise one review for vectorisation.

    Steps: lower-case and split on whitespace, drop English stop words and tokens shorter
    than three characters, apply the regex clean-up rules in order, then stem every
    remaining token with the English Snowball stemmer.

    Args:
        text: the raw review as a string.

    Returns:
        The cleaned, stemmed tokens joined by single spaces.
    """
    stops, stemmer = _text_prep_resources()

    # Punctuation is handled by the regex rules below. The former
    # `text.translate(string.punctuation)` call did not delete anything: a plain string is
    # used as an index table, so it rewrote control characters into punctuation
    # (a newline became '+').

    ## Convert words to lower case and split them
    tokens = text.lower().split()

    ## Remove stop words and very short tokens
    text = " ".join(w for w in tokens if w not in stops and len(w) >= 3)

    ## Clean the text
    for pattern, replacement in _CLEANUP_RULES:
        text = re.sub(pattern, replacement, text)

    ## Stem what is left
    return " ".join(stemmer.stem(word) for word in text.split())

In [None]:
# Clean and stem every review; the function is passed straight to map().
df['text'] = df['text'].map(text_preparation)

In [None]:
# Keras tokenisation: index the most frequent words, then turn each review into a
# fixed-length integer sequence.
vocabulary_size = 20000
max_sequence_length = 50
tokenizer = Tokenizer(num_words=vocabulary_size)
tokenizer.fit_on_texts(df['text'])

sequences = tokenizer.texts_to_sequences(df['text'])
# Stored under its own name: rebinding `df` to this array would discard the review DataFrame
# and make this cell (and the cleaning cell before it) fail when re-run.
padded_sequences = pad_sequences(sequences, maxlen=max_sequence_length)

In [None]:
# Bag-of-words vectoriser over word tokens.
# NOTE(review): `simple_tokenizer` is not defined in any visible cell — confirm where it comes from.
count_vect = CountVectorizer(
    analyzer='word',
    tokenizer=simple_tokenizer,
    lowercase=True,
)

In [None]:
# Learn the vocabulary from the reviews and build the document-term count matrix in one pass.
review_texts = yelp_review_df['text']
review_tf = count_vect.fit_transform(review_texts)

### BOW + Logistic Regression

In [None]:
# Numpy arrays are easy to work with, so convert the result to an array
review_tf_nd = review_tf.toarray()
# get_feature_names() was removed in scikit-learn 1.2; use its replacement when it exists and
# fall back to the old name on older releases.
if hasattr(count_vect, 'get_feature_names_out'):
    feature_names = count_vect.get_feature_names_out()
else:
    feature_names = count_vect.get_feature_names()
df = pd.DataFrame(review_tf_nd, columns=feature_names)
df

In [None]:
# Vocabulary learned by the bag-of-words vectoriser, in column order.
# get_feature_names() was removed in scikit-learn 1.2; list() keeps `vocab` a plain list
# whichever accessor is available.
if hasattr(count_vect, 'get_feature_names_out'):
    vocab = list(count_vect.get_feature_names_out())
else:
    vocab = list(count_vect.get_feature_names())
vocab[:100]

In [None]:
# Last (up to) 100 vocabulary entries.
vocab[-100:]

##### Positive reviews

In [None]:
# Frequency distribution of words in positive (5-star) reviews: sum the count matrix over
# the 5-star rows, then pair every vocabulary word with its column total.
five_star_rows = np.where(yelp_review_df['stars'] == 5)
dist = review_tf_nd[five_star_rows].sum(axis=0)
counter = Counter(dict(zip(vocab, dist)))
counter.most_common(20)

In [None]:
# Word cloud of the 20 most frequent words. fit_words() expects a word -> frequency mapping,
# so the (word, count) pairs from most_common() are turned into a dict first.
top_words = dict(counter.most_common(20))
wordcloud = WordCloud().fit_words(top_words)
plt.imshow(wordcloud)
plt.axis("off");

##### Negative Reviews

In [None]:
# Same frequency distribution, this time over the negative (1-star) reviews.
one_star_rows = np.where(yelp_review_df['stars'] == 1)
dist = review_tf_nd[one_star_rows].sum(axis=0)
counter = Counter(dict(zip(vocab, dist)))

In [None]:
# Word cloud of the 20 most frequent words. fit_words() expects a word -> frequency mapping,
# so the (word, count) pairs from most_common() are turned into a dict first.
top_words = dict(counter.most_common(20))
wordcloud = WordCloud().fit_words(top_words)
plt.imshow(wordcloud)
plt.axis("off");

##### Star Prediction - Logistic Regression

In [None]:
# Hold out 25% of the bag-of-words rows; the fixed seed makes the split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    review_tf_nd, yelp_review_df['stars'], train_size=0.75, random_state=42
)
clf = LogisticRegression()
clf.fit(X_train, y_train)
# The evaluation cells read `y_pred`; `predictions` is kept as an alias for the old name.
y_pred = clf.predict(X_test)
predictions = y_pred

In [None]:
from sklearn import metrics

# Per-class precision / recall / F1 on the held-out reviews.
report = metrics.classification_report(y_test, y_pred)
print(report)


In [None]:
# Overall accuracy on the held-out reviews.
accuracy = metrics.accuracy_score(y_test, y_pred)
print("accuracy: %0.6f" % accuracy)


In [None]:

# Confusion matrix for the held-out reviews.
# Rows and columns follow the sorted class values, so the tick labels are derived from that
# same order; a hand-written ['5', '1'] would label the axes backwards.
class_values = np.unique(np.concatenate([np.asarray(y_test), np.asarray(y_pred)]))
labels = [str(value) for value in class_values]
confmat = metrics.confusion_matrix(y_test, y_pred, labels=class_values)

fig, ax = plt.subplots(figsize=(3, 3))
ax.matshow(confmat, cmap=plt.cm.Blues, alpha=0.7)
# Write the raw count into every cell.
for i in range(confmat.shape[0]):
    for j in range(confmat.shape[1]):
        ax.text(x=j, y=i, s=confmat[i, j], va='center', ha='center')
tick_marks = np.arange(len(labels))
ax.set_xticks(tick_marks)
ax.set_xticklabels(labels, rotation=45)
ax.set_yticks(tick_marks)
ax.set_yticklabels(labels)
ax.set_xlabel('predicted label')
ax.set_ylabel('true label')
fig.tight_layout()

### TF-IDF + Logistic Regression

In [None]:
# TF-IDF vectoriser over word tokens, fitted on the reviews in the same step.
# NOTE(review): `lemma_tokenizer` is not defined in any visible cell — confirm where it comes from.
tfidf_vect = TfidfVectorizer(
    analyzer='word',
    tokenizer=lemma_tokenizer,
    lowercase=True,
)
review_tf = tfidf_vect.fit_transform(yelp_review_df['text'])

In [None]:
# Dense copy of the TF-IDF matrix, followed by its (rows, columns) shape.
review_tf_nd = review_tf.toarray()
np.shape(review_tf_nd)

##### Word Clouds

In [None]:
# Summed TF-IDF weight of every term over the 5-star reviews.
dist = np.sum(review_tf_nd[np.where(yelp_review_df['stars'] == 5)], axis=0)

# The columns of the TF-IDF matrix belong to tfidf_vect, so the terms are read from it rather
# than from the bag-of-words `vocab`, whose vocabulary may differ.
if hasattr(tfidf_vect, 'get_feature_names_out'):
    tfidf_vocab = tfidf_vect.get_feature_names_out()
else:
    tfidf_vocab = tfidf_vect.get_feature_names()

counter = Counter()
for tag, count in zip(tfidf_vocab, dist):
    counter[tag] = count

# fit_words() expects a word -> weight mapping, not a list of pairs.
wordcloud = WordCloud().fit_words(dict(counter.most_common(20)))
plt.imshow(wordcloud)
plt.axis("off");

In [None]:
# Summed TF-IDF weight of every term over the 1-star reviews.
dist = np.sum(review_tf_nd[np.where(yelp_review_df['stars'] == 1)], axis=0)

# The columns of the TF-IDF matrix belong to tfidf_vect, so the terms are read from it rather
# than from the bag-of-words `vocab`, whose vocabulary may differ.
if hasattr(tfidf_vect, 'get_feature_names_out'):
    tfidf_vocab = tfidf_vect.get_feature_names_out()
else:
    tfidf_vocab = tfidf_vect.get_feature_names()

counter = Counter()
for tag, count in zip(tfidf_vocab, dist):
    counter[tag] = count

# fit_words() expects a word -> weight mapping, not a list of pairs.
wordcloud = WordCloud().fit_words(dict(counter.most_common(20)))
plt.imshow(wordcloud)
plt.axis("off");

### Star Prediction - Logistic Regression

In [None]:
# Hold out 25% of the TF-IDF rows; the fixed seed makes the split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    review_tf_nd, yelp_review_df['stars'], train_size=0.75, random_state=42
)

In [None]:
# Hold out 25% of the TF-IDF rows; the fixed seed makes the split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    review_tf_nd, yelp_review_df['stars'], train_size=0.75, random_state=42
)
clf = LogisticRegression()
clf.fit(X_train, y_train)
# The evaluation cells read `y_pred`; `predictions` is kept as an alias for the old name.
y_pred = clf.predict(X_test)
predictions = y_pred

In [None]:
from sklearn import metrics

# Per-class precision / recall / F1 on the held-out reviews.
report = metrics.classification_report(y_test, y_pred)
print(report)


In [None]:
# Overall accuracy on the held-out reviews.
accuracy = metrics.accuracy_score(y_test, y_pred)
print("accuracy: %0.6f" % accuracy)


In [None]:
# Confusion matrix for the held-out reviews.
# Rows and columns follow the sorted class values, so the tick labels are derived from that
# same order; a hand-written ['5', '1'] would label the axes backwards.
class_values = np.unique(np.concatenate([np.asarray(y_test), np.asarray(y_pred)]))
labels = [str(value) for value in class_values]
confmat = metrics.confusion_matrix(y_test, y_pred, labels=class_values)

fig, ax = plt.subplots(figsize=(3, 3))
ax.matshow(confmat, cmap=plt.cm.Blues, alpha=0.7)
# Write the raw count into every cell.
for i in range(confmat.shape[0]):
    for j in range(confmat.shape[1]):
        ax.text(x=j, y=i, s=confmat[i, j], va='center', ha='center')
tick_marks = np.arange(len(labels))
ax.set_xticks(tick_marks)
ax.set_xticklabels(labels, rotation=45)
ax.set_yticks(tick_marks)
ax.set_yticklabels(labels)
ax.set_xlabel('predicted label')
ax.set_ylabel('true label')
fig.tight_layout()

# Prediction Model Comparison
##### ROC

In [None]:
# Overlay the ROC curves of the TF-IDF and bag-of-words logistic-regression models.
# NOTE(review): `sns` is not imported in the visible import cell, and fpr_tfidf / tpr_tfidf /
# roc_auc_tfidf / fpr_tf / tpr_tf / roc_auc_tf are not assigned in any visible cell — confirm
# where they are computed before relying on this cell.
sns.set(style='whitegrid', context='notebook')
# Pad both axes slightly beyond [0, 1].
plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
# Dashed grey diagonal from (0, 0) to (1, 1), drawn as the reference line.
plt.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6), label='NULL Accuracy')
# One curve per model; each legend entry carries that model's AUC value.
plt.plot(fpr_tfidf, tpr_tfidf, lw=1, label='Logistic Regression TF-IDF (AUC = %0.6f)' % roc_auc_tfidf)
plt.plot(fpr_tf, tpr_tf, lw=1, label='Logistic Regression Bag-Of-Words (AUC = %0.6f)' % roc_auc_tf)
plt.title('ROC Sentiment Classifier')
plt.xlabel('False Positive Rate (1 - Specificity)')
plt.ylabel('True Positive Rate (Sensitivity)')
plt.legend(loc="lower right")
plt.grid(True)