# NB_SentimentClassifier_AmazonReviews

Author: Frank Fichtenmueller <br>
Date: 06/09/2017<br>

This notebook is an example implementation of a Naive Bayes sentiment classifier trained on Amazon review data.

Programmatic Structure:
- Generate a CSV readout of the customer reviews for the product
- Load each review into a dict for easy access


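Given a list of review texts, we create one dictionary per review of the form `{label: review_text}`, where the label is the name of the CSV file (without its extension) that the review was read from.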

In [None]:
import csv
import os

import numpy as np

def load_reviews(data_dir):
    """Read every CSV file in *data_dir* into labelled review records.

    The label of a file is its name up to the first dot. Each row's
    ``review_text`` column becomes one ``{label: review_text}`` dict.

    Args:
        data_dir: Directory that holds the review CSV files.

    Returns:
        A ``(reviews, labels)`` tuple: the list of single-entry dicts and the
        list of labels, both in sorted file-name order.

    Raises:
        KeyError: If a CSV file has no ``review_text`` column.
    """
    reviews = []
    labels = []
    # os.listdir() returns names in arbitrary order; sort for reproducible runs.
    for name in sorted(os.listdir(data_dir)):
        if not name.endswith('.csv'):
            continue
        label = name.split('.')[0]
        labels.append(label)

        # The csv module expects a text-mode file opened with newline='' so
        # that line breaks inside quoted fields are parsed correctly.
        # NOTE(review): this uses the platform default encoding -- pass
        # encoding=... here if the exports are known to be e.g. UTF-8.
        with open(os.path.join(data_dir, name), newline='') as csvfile:
            amazon_reader = csv.DictReader(csvfile, delimiter=',')
            reviews.extend({label: row['review_text']} for row in amazon_reader)
    return reviews, labels


# Directory holding the review CSV files; defaults to the current working
# directory. Point this at the data folder when it lives somewhere else.
DATA_DIR = os.getcwd()

amazon_reviews, target_labels = load_reviews(DATA_DIR)

print('There are ', str(len(amazon_reviews)), ' total reviews')
print('The labels are ', ', '.join(target_labels), '.')

### Build the classifier

In [None]:
# Copy the reviews and shuffle the copy in place; amazon_reviews itself
# keeps the order in which the files were read.
from random import shuffle

x = list(amazon_reviews)
shuffle(x)

In [None]:
from sklearn.naive_bayes import MultinomialNB
try:
    from sklearn.model_selection import train_test_split
except ImportError:  # scikit-learn < 0.18 only ships the old module
    from sklearn.cross_validation import train_test_split
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from operator import itemgetter

# Generate a train test split
# Every element of x is a single-entry dict {label: review_text}; unpack the
# entries into two parallel arrays of texts and labels.
data = np.array([''.join(el.values()) for el in x])
targets = np.array([''.join(el.keys()) for el in x])

# Hold out part of the reviews for evaluation (scikit-learn's default ratio).
X_train, X_test, y_train, y_test = train_test_split(data, targets)

# Vectorize the training data to numerical representation
vectorizer = TfidfVectorizer(min_df=1,
                             ngram_range=(1, 1),
                             stop_words='english',
                             strip_accents='unicode',
                             norm='l2')
# Learn the vocabulary and IDF weights from the training texts only, then
# project the test texts onto that same feature space. Re-fitting on the test
# set would produce a different vocabulary and mismatched feature columns.
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

In [None]:
# Train the classifier
# Fit a multinomial Naive Bayes model on the training features and predict
# labels for the test features with that same fitted model.
clf_nb = MultinomialNB().fit(X_train, y_train)
y_predicted = clf_nb.predict(X_test)

### Evaluate Classifier Performance

In [None]:
# Evaluate Performance
# The targets are string class labels, so scikit-learn's default
# average='binary' (which looks for pos_label=1) cannot be applied; report
# support-weighted averages over all classes instead.
print('Precision:: ', str(metrics.precision_score(y_test, y_predicted, average='weighted')))
print('Recal:: ', str(metrics.recall_score(y_test, y_predicted, average='weighted')))
print('F1 Score:: ', str(metrics.f1_score(y_test, y_predicted, average='weighted')))

# Rows are true labels and columns are predicted labels, in sorted label order.
print('\n\nConfusion Matrix:::: ')
print(metrics.confusion_matrix(y_test, y_predicted))

In [None]:
# Evaluate the most important features per class
# NOTE(review): N is presumably the number of top features to report per
# class; it is not used in the lines visible here -- confirm against the rest
# of the cell.
N = 10
# vocabulary_ maps each term to its feature column index; sorting the pairs by
# index yields the terms in column order. dict.items() is used because
# dict.iteritems() does not exist on Python 3.
vocabulary = np.array([t for t, i in sorted(vectorizer.vocabulary_.items(), key=itemgetter(1))])