# Product Sentiment Classifier

## Data Cleaning and EDA
Import necessary libraries

In [1]:
import pandas as pd
import re
import numpy as np
import nltk
from nltk import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import FreqDist
from nltk import FreqDist
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (classification_report, 
                             plot_confusion_matrix)
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
import matplotlib.pyplot as plt
%matplotlib inline

import tensorflow as tf
import keras
from keras import models
from keras import layers
from keras import optimizers

from imblearn.over_sampling import SMOTE

%run -i "clean_lemmatize_token.py"
%run -i "report.py"

Using TensorFlow backend.


Import data from data.world.

In [2]:
# Location of the raw CSV on data.world; it is read with the ISO-8859-1 encoding.
DATA_URL = 'https://query.data.world/s/zbehvjkmiewbkln44rae6iphum4v3g'
df = pd.read_csv(DATA_URL, encoding="ISO-8859-1")

# Preview the first rows.
df.head()

URLError: <urlopen error [Errno 11001] getaddrinfo failed>

Rename columns

In [None]:
# Map the two long original column names to short ones used in the rest of
# the notebook.
column_renames = {
    'emotion_in_tweet_is_directed_at': 'brand_product',
    'is_there_an_emotion_directed_at_a_brand_or_product': 'sentiment',
}
df = df.rename(columns=column_renames)
df.head()

Explored the data set with the `info()` method. There is 1 NaN value in the `tweet_text` column and ~6,000 in the `brand_product` column; these will need to be addressed prior to modeling.

In [None]:
# Print the frame summary (column dtypes and non-null counts) to spot missing values.
df.info()

Remove NaN tweet_text from DataFrame

In [None]:
# Display the row(s) whose tweet_text is missing.
missing_text = df['tweet_text'].isna()
df.loc[missing_text]

In [None]:
# Remove every row with a missing tweet_text by condition instead of by a
# hard-coded index label: the cell can then be re-run without raising a
# KeyError and no longer depends on the row order of the downloaded file.
df = df.dropna(subset=['tweet_text'])

### Exploration
Explored the sentiment category. Most tweets are marked as having no emotion, which will not help the initial binary classification model. Most of the data in the data set will only be usable once a model is built that takes neutral sentiment into account.

In [None]:
# Non-null count of every column, broken down by sentiment label.
df.groupby('sentiment').count()

In [None]:
# Inspect the tweets whose sentiment label is "I can't tell".
cant_tell = df['sentiment'].eq("I can't tell")
df[cant_tell]

In [None]:
# Number of tweets per sentiment label.
df['sentiment'].value_counts()

Dummy-encoded the sentiment column to help with visualizations that compare sentiment across brands.

In [None]:
# Build one indicator column per sentiment label.
sentiment_labels = df['sentiment']
df1 = pd.get_dummies(sentiment_labels)
df1.head()

In [None]:
# Swap the text sentiment column for its indicator columns; the join aligns
# the two frames on their shared index.
df_dummied = df.drop(columns='sentiment').join(df1)
df_dummied.head()

Made visualization to explore distribution of sentiment across brand/product. Will combine Apple and Google products to further explore distribution. Sentiment is overwhelmingly positive across all products and most sentiment data is logged for Apple products

In [None]:
# Sum the columns per brand/product and draw the totals as horizontal bars.
brand_totals = df_dummied.groupby('brand_product').sum()
ax = brand_totals.plot(kind='barh', figsize=(10, 7))
ax.set_title('Sentiment Analysis by Brand/Product', size=20)
ax.set_ylabel('Brand/Product', size=15)
ax.set_xlabel('# of Instances', size=15)

# Write the figure to disk before displaying it.
ax.figure.savefig('figures/SentimentbyProduct.png')
plt.show()

Made below visualization to explore the missing brand_product classifications for each sentiment

In [None]:
# Non-null counts of every column per sentiment label; a shorter bar for a
# column within one sentiment group means that column has missing values there.
sentiment_counts = df.groupby('sentiment').count()
ax = sentiment_counts.plot(kind='barh', figsize=(10, 5))
ax.set_title('Sentiment Distribution', size=20)
ax.set_ylabel('Sentiment', size=15)
ax.set_xlabel('# of Instances', size=15)
plt.show()

Usable data (Positive or Negative sentiment) for baseline first model is 39.02%. Will be necessary to eventually build a multiclass classifier with No Emotion classification

In [None]:
# Share of rows labelled Positive or Negative emotion, as a percentage
# rounded to two decimals.
polar_labels = ['Positive emotion', 'Negative emotion']
polar_rows = df.loc[df['sentiment'].isin(polar_labels)]
usable_data = round(len(polar_rows) / len(df) * 100, 2)
print('Percentage of Data with either Positive or Negative Sentiment: {}%'.format(usable_data))

Combined Apple and Google product names together to explore further visualizations

In [None]:
# Labels that are collapsed into a single value per company.
apple_labels = ['iPad', 'Apple', 'iPad or iPhone App', 'iPhone',
                'Other Apple product or service']
google_labels = ['Google', 'Other Google product or service',
                 'Android App', 'Android']

# Apple labels are replaced first, then the Google/Android ones.
df_dummied['brand_product'] = (
    df_dummied['brand_product']
    .replace(to_replace=apple_labels, value='apple_product')
    .replace(google_labels, 'android_product')
)
df_dummied['brand_product'].value_counts()

Combined Brand_product columns to show difference in amount of information for each product

In [None]:
# Totals per combined brand label, drawn as horizontal bars.
combined_totals = df_dummied.groupby('brand_product').sum()
ax = combined_totals.plot(kind='barh', figsize=(10, 4))
ax.set_title('Sentiment Analysis by Brand/Product Combined', size=20)
ax.set_ylabel('Brand/Product', size=15)
ax.set_xlabel('# of Instances', size=15)

# The tick labels are assigned by position: groupby sorts the group keys, so
# 'android_product' (the Google group) is expected at tick 0 and
# 'apple_product' at tick 1.
ax.set_yticks(np.arange(2))
ax.set_yticklabels(['Google Product', 'Apple Product'])
plt.show()

### Preprocessing Tweets
Clean, lemmatize, and format data for vectorization and modeling

In [None]:
# Replace each tweet with the output of clean_lemmatize_token (presumably
# defined by the earlier %run of clean_lemmatize_token.py — its behaviour is
# not visible in this notebook).
# NOTE(review): the column is overwritten with the mapped result, so re-running
# this cell feeds already-processed values through the helper again — confirm
# that is harmless.
df['tweet_text'] = df['tweet_text'].map(clean_lemmatize_token)

### Binary Classification Model
Split the DataFrame to keep only the tweets labelled with Positive or Negative emotion.

In [None]:
# Keep only the rows carrying one of the two polar sentiment labels.
binary_labels = ['Negative emotion', 'Positive emotion']
binary = df[df['sentiment'].isin(binary_labels)]

Vectorize the tweets and split them into training and test sets

In [None]:
# Fixed seed so the train/test split, and every score computed from it, is
# reproducible across kernel restarts.
RANDOM_STATE = 42

# Bag-of-words counts for the preprocessed tweets, and the matching labels.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(binary['tweet_text'])
Y = binary['sentiment']

# NOTE(review): the vocabulary is fitted on all rows before the split, so the
# test tweets contribute to the feature space — consider fitting on the
# training rows only.
# Hold out 20% for testing; stratify on the label so both partitions keep the
# same Positive/Negative ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=RANDOM_STATE, stratify=Y)

### Logistic Regression

In [None]:
# Hyper-parameter grid searched for the logistic regression model.
params = dict(
    solver=['lbfgs', 'liblinear', 'sag', 'saga'],
    max_iter=[2000, 2200],
    C=[0.2, 0.4, 0.6, 0.8, 1],
    tol=[0.0001, 0.001, 0.01],
    class_weight=['balanced', None],
)
log_grid = GridSearchCV(estimator=LogisticRegression(), param_grid=params)

# `report` is presumably defined by the earlier %run of report.py; what it
# fits and prints is not visible in this notebook.
report(log_grid)

### Decision Tree Classifier

In [None]:
# Hyper-parameter grid searched for the decision tree model.
params = dict(
    criterion=['gini', 'entropy'],
    splitter=['best', 'random'],
    max_depth=[50, 80, 100, None],
    class_weight=['balanced', None],
)
grid_tree = GridSearchCV(estimator=DecisionTreeClassifier(), param_grid=params)

# `report` is presumably defined by the earlier %run of report.py; what it
# fits and prints is not visible in this notebook.
report(grid_tree)

### Recurrent Neural Networks

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Limit the tokenizer to the most frequent words. `num_words` is the current
# name of this argument; `nb_words` is the legacy Keras 1 spelling.
tokenizer = Tokenizer(num_words=200)
tweet_texts = binary['tweet_text'].values
tokenizer.fit_on_texts(tweet_texts)

# Convert every tweet to a list of word indices, then pad to a common length.
# NOTE(review): this re-binds X, but X_train / X_test are not re-split here,
# so they still hold whatever the earlier train_test_split cell produced.
X = tokenizer.texts_to_sequences(tweet_texts)
X = pad_sequences(X)

In [None]:
# Build an LSTM layer with 4 units and call it directly on X_train.
# NOTE(review): X_train is not re-assigned after the tokenizer cell, so at this
# point it is still the split of the CountVectorizer output, not the padded
# sequences stored in X — confirm which input was intended.
# NOTE(review): LSTM layers normally expect a 3-D (batch, timesteps, features)
# input — TODO confirm this call runs on the data as it stands.
lstm = tf.keras.layers.LSTM(4)
output = lstm(X_train)

In [None]:
# Dense layers need a dense numeric array; the CountVectorizer split is a
# scipy sparse matrix, so densify it when necessary.
X_train_nn = X_train.toarray() if hasattr(X_train, 'toarray') else np.asarray(X_train)

# Encode the two string labels as 1.0 (Positive emotion) / 0.0 (Negative
# emotion) so they can be used with a binary cross-entropy loss.
y_train_nn = (np.asarray(y_train) == 'Positive emotion').astype('float32')

# Two hidden ReLU layers followed by a single sigmoid unit that outputs the
# probability of the positive class. The original network ended in a 25-unit
# ReLU layer trained with categorical cross-entropy, which does not match a
# two-class target.
model = models.Sequential()
model.add(layers.Dense(50, activation='relu',
                       input_shape=(X_train_nn.shape[1],)))
model.add(layers.Dense(25, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))
model.compile(optimizer='SGD',
              loss='binary_crossentropy',
              metrics=['accuracy'])
history = model.fit(X_train_nn,
                    y_train_nn,
                    epochs=120,
                    batch_size=256)
# Per-epoch metrics recorded by model.fit (the duplicated assignment was removed).
history_dict = history.history
loss_values = history_dict['loss']

# Plot training loss against the 1-based epoch number.
epochs = range(1, len(loss_values) + 1)
plt.plot(epochs, loss_values, 'g', label='Training loss')

plt.title('Training loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()