<h3>Make Training and Dev Data</h3>

In [1]:
import random

def csv_to_array(filename):
    file = open(filename, 'r')
    file_text = file.read()
    # Get rid of the first three, which are examples
    split_text = file_text.split('|')[3:]
    i = 0
    tuple_text = []
    while i < len(split_text) - 1:
        tuple_text.append((split_text[i+1], split_text[i], int(i/2)))
        i += 2
    
    return tuple_text
    
def make_training_and_dev_data(all_data):
    random.shuffle(all_data)
    # Make this number ridiculously high for out test
    training_ratio = .9
    return all_data[:int(len(all_data) * training_ratio)], all_data[int(len(all_data) * training_ratio):]


official_data = csv_to_array('yelp_data_official_training.csv')

for garbage in official_data:
    if garbage[1].isdigit() is False:
        print(garbage[1])
        break

training_data, dev_data = make_training_and_dev_data(official_data)

In [2]:
def reviews(data):
    reviews = []
    for i in range(len(data)):
        reviews.append(data[i][0])
    return reviews

def labels(data):
    reviews = []
    for i in range(len(data)):
        reviews.append(int(data[i][1]))
    return reviews

reviews_training = reviews(training_data)
label_training = labels(training_data)
reviews_dev = reviews(dev_data)
label_dev = labels(dev_data)

In [3]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(reviews_training)
X_train_counts.shape

from sklearn.feature_extraction.text import TfidfTransformer
tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
X_train_tf = tf_transformer.transform(X_train_counts)
X_train_tf.shape

(43200, 88900)

<h3>Naive Bayes Pipeline</h3>

In [4]:
import numpy as np
from sklearn.pipeline import Pipeline

from sklearn.naive_bayes import MultinomialNB

text_clf_mnb = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', MultinomialNB()),])

text_clf_mnb = text_clf_mnb.fit(reviews_training, label_training)
predicted_mnb = text_clf_mnb.predict(reviews_dev)
np.mean(predicted_mnb == label_dev)

0.70714434492813993

<h3>SVM Pipeline</h3>

In [5]:
# Thanks to the Sklearn documentation for helping us with a lot of this code
# http://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html

from sklearn.linear_model import SGDClassifier
svm_classifier = SGDClassifier(loss='hinge',
                               penalty='l2',
                               alpha=1e-3,
                               n_iter=10,
                               random_state=42)
text_clf_svm = Pipeline([('vect', CountVectorizer(ngram_range=(1, 3), max_df=0.9, binary=True, min_df=2, stop_words='english')),
#                      ('tfidf', TfidfTransformer()), 
                     ('clf', svm_classifier),])
text_clf_svm = text_clf_svm.fit(reviews_training, label_training)
predicted_svm = text_clf_svm.predict(reviews_dev)
np.mean(predicted_svm == label_dev)

0.89710476983961673

<h3>Load Test Data</h3>

In [6]:
import re

def test_file_to_array(filename):
    file = open(filename, 'r')
    file_text = file.read()
    # Skip the first one because it's the header starter garbage
    split_text = re.split('\d+\|', file_text)[1:]
    tupled_text = []
    for i in range(len(split_text)):
        tupled_text.append((split_text[i], i))        
    return tupled_text

test_array = test_file_to_array('yelp_data_official_test_nocategories.csv')

<h3>Make Test Predictions For All Models</h3>

In [7]:
def test_prediction(model):
    output = []
    test_reviews = []
    test_id = []
    
    for i in test_array:
        test_reviews.append(i[0])
        test_id.append(i[1])
        
    if str(model) == 'svm':
        predicted = text_clf_svm.predict(test_reviews)
        
    elif model == 'mnb':
        predicted = text_clf_mnb.predict(test_reviews)
        
    for i in range(0,len(predicted)):
        output.append((int(i),predicted[i]))
    
    return output

output_mnb = test_prediction('mnb')
output_svm = test_prediction('svm')

<h3>Write Output to CSV: Specify Model</h3>

In [8]:
def write_output_to_csv(output, filename):
    file = open(filename, 'w')
    file.write('Id' + ',' + 'Category' + '\n' )
    if str(output) == str(output_mnb):
        for item in output_mnb:
            file.write(str(item[0]) + ',' + str(item[1]) + '\n')
    elif str(output) == str(output_svm):
        for item in output_svm:
            file.write(str(item[0]) + ',' + str(item[1]) + '\n')

write_output_to_csv(output_svm, 'kegel_boys_7.csv')

<h3>Classification Report and Confustion Matrix</h3>

In [9]:
from sklearn import metrics
print('--------SVM-------' + '\n')
print(metrics.classification_report(label_dev, predicted_svm)+ '\n')
print('--------MNB-------' + '\n')
print(metrics.classification_report(label_dev, predicted_mnb)+ '\n')

--------SVM-------

             precision    recall  f1-score   support

          1       0.93      0.94      0.94      1527
          2       0.86      0.87      0.86       806
          3       0.92      0.93      0.93      1135
          4       0.82      0.83      0.83       751
          5       0.94      0.88      0.91       381
          6       0.90      0.72      0.80       201

avg / total       0.90      0.90      0.90      4801


--------MNB-------

             precision    recall  f1-score   support

          1       0.61      0.98      0.75      1527
          2       0.89      0.67      0.76       806
          3       0.73      0.91      0.81      1135
          4       0.96      0.39      0.55       751
          5       0.90      0.07      0.14       381
          6       0.00      0.00      0.00       201

avg / total       0.74      0.71      0.66      4801




  'precision', 'predicted', average, warn_for)


In [10]:
print('--------SVM-------' + '\n')
print(metrics.confusion_matrix(label_dev, predicted_svm),'\n')
# print('--------MNB-------' + '\n')
# print(metrics.confusion_matrix(label_dev, predicted_mnb),'\n')

--------SVM-------

[[1443   14   39   30    0    1]
 [  29  702    7   63    1    4]
 [  37   14 1055   28    0    1]
 [  24   81   17  627    1    1]
 [   1    3   28    4  336    9]
 [  24    4    0    8   21  144]] 



<h3>Dot Product of Result Sets (Not sure if useful)</h3>

In [11]:
mnb = [int(i[1]) for i in output_mnb]
svm = [int(i[1]) for i in output_svm]
print(mnb[190:210])
print(svm[190:210])
np.inner(svm,mnb)

[1, 1, 1, 1, 1, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 5, 3, 5, 3, 3]
[4, 4, 3, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 5, 6, 5, 5, 5]


66951

<h3>Let's look at our mistakes</h3>

In [12]:
for i in range(0,len(predicted_svm)):
    if predicted_svm[i] != label_dev[i]:
        print(reviews_training[i],predicted_svm[i],label_dev[i])

"Leidan Mitchell is a great salon for anyone looking to get their hair cut or colored. Katie is honestly (and I mean SERIOUSLY) the ONLY person I really trust with cutting my hair. I have a TON of long, thick hair, and she knows exactly what to do with it. Katie listens, gives her feedback on what she thinks is best, and does a phenomenal job when cutting and coloring. I've had my hair cut by her a few times, a color once, and had it styled for my mothers wedding. Each and every time, I have been extremely pleased. 

The salon is very clean, which is a big deal in my opinion. Cut hair is always swept up promptly, salon assistants monitor the use/abandonment of used plates and glasses (they offer refreshments), and all of the other stylists and front desk girls are always really nice. If anyone is looking for a new salon or cut/color stylist, I definitely recommend Leidan Mitchell and Katie!"
7928 3 4
"Making an appointment was a breeze, too bad that I cannot say that the rest of my exp