<h3>Make Training and Dev Data</h3>

In [55]:
import random

def csv_to_array(filename):
    file = open(filename, 'r')
    file_text = file.read()
    # Get rid of the first three, which are examples
    split_text = file_text.split('|')[3:]
    i = 0
    tuple_text = []
    while i < len(split_text) - 1:
        tuple_text.append((split_text[i+1], split_text[i], int(i/2)))
        i += 2
    
    return tuple_text
    
def make_training_and_dev_data(all_data):
    random.shuffle(all_data)
    # Make this number ridiculously high for out test
    training_ratio = .999
    return all_data[:int(len(all_data) * training_ratio)], all_data[int(len(all_data) * training_ratio):]


official_data = csv_to_array('yelp_data_official_training.csv')

for garbage in official_data:
    if garbage[1].isdigit() is False:
        print(garbage[1])
        break

training_data, dev_data = make_training_and_dev_data(official_data)

In [56]:
def reviews(data):
    reviews = []
    for i in range(len(data)):
        reviews.append(data[i][0])
    return reviews

def labels(data):
    reviews = []
    for i in range(len(data)):
        reviews.append(int(data[i][1]))
    return reviews

reviews_training = reviews(training_data)
label_training = labels(training_data)
reviews_dev = reviews(dev_data)
label_dev = labels(dev_data)

In [57]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(reviews_training)
X_train_counts.shape

from sklearn.feature_extraction.text import TfidfTransformer
tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
X_train_tf = tf_transformer.transform(X_train_counts)
X_train_tf.shape

(47952, 95578)

<h3>Naive Bayes Pipeline</h3>

In [58]:
import numpy as np
from sklearn.pipeline import Pipeline

from sklearn.naive_bayes import MultinomialNB

text_clf_mnb = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', MultinomialNB()),])

text_clf_mnb = text_clf_mnb.fit(reviews_training, label_training)
predicted_mnb = text_clf_mnb.predict(reviews_dev)
np.mean(predicted_mnb == label_dev)

0.7142857142857143

<h3>SVM Pipeline</h3>

In [67]:
# Thanks to the Sklearn documentation for helping us with a lot of this code
# http://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html

from sklearn.linear_model import SGDClassifier
svm_classifier = SGDClassifier(loss='hinge',
                               penalty='l2',
                               alpha=1e-3,
                               n_iter=10,
                               random_state=42)
text_clf_svm = Pipeline([('vect', CountVectorizer(ngram_range=(1, 3), max_df=0.9, binary=True, min_df=2, stop_words='english')),
#                      ('tfidf', TfidfTransformer()), 
                     ('clf', svm_classifier),])
text_clf_svm = text_clf_svm.fit(reviews_training, label_training)
predicted_svm = text_clf_svm.predict(reviews_dev)
np.mean(predicted_svm == label_dev)

0.81632653061224492

<h3>Load Test Data</h3>

In [60]:
import re

def test_file_to_array(filename):
    file = open(filename, 'r')
    file_text = file.read()
    # Skip the first one because it's the header starter garbage
    split_text = re.split('\d+\|', file_text)[1:]
    tupled_text = []
    for i in range(len(split_text)):
        tupled_text.append((split_text[i], i))        
    return tupled_text

test_array = test_file_to_array('yelp_data_official_test_nocategories.csv')

<h3>Make Test Predictions For All Models</h3>

In [61]:
def test_prediction(model):
    output = []
    test_reviews = []
    test_id = []
    
    for i in test_array:
        test_reviews.append(i[0])
        test_id.append(i[1])
        
    if str(model) == 'svm':
        predicted = text_clf_svm.predict(test_reviews)
        
    elif model == 'mnb':
        predicted = text_clf_mnb.predict(test_reviews)
        
    for i in range(0,len(predicted)):
        output.append((int(i),predicted[i]))
    
    return output

output_mnb = test_prediction('mnb')
output_svm = test_prediction('svm')

<h3>Write Output to CSV: Specify Model</h3>

In [62]:
def write_output_to_csv(output, filename):
    file = open(filename, 'w')
    file.write('Id' + ',' + 'Category' + '\n' )
    if str(output) == str(output_mnb):
        for item in output_mnb:
            file.write(str(item[0]) + ',' + str(item[1]) + '\n')
    elif str(output) == str(output_svm):
        for item in output_svm:
            file.write(str(item[0]) + ',' + str(item[1]) + '\n')

write_output_to_csv(output_svm, 'kegel_boys_7.csv')

<h3>Classification Report and Confustion Matrix</h3>

In [63]:
from sklearn import metrics
print('--------SVM-------' + '\n')
print(metrics.classification_report(label_dev, predicted_svm)+ '\n')
print('--------MNB-------' + '\n')
print(metrics.classification_report(label_dev, predicted_mnb)+ '\n')

--------SVM-------

             precision    recall  f1-score   support

          1       1.00      0.88      0.94        17
          2       0.89      0.89      0.89         9
          3       0.85      1.00      0.92        11
          4       0.80      1.00      0.89         4
          5       0.83      0.83      0.83         6
          6       1.00      0.50      0.67         2

avg / total       0.91      0.90      0.90        49


--------MNB-------

             precision    recall  f1-score   support

          1       0.67      0.94      0.78        17
          2       1.00      0.67      0.80         9
          3       0.62      0.91      0.74        11
          4       1.00      0.75      0.86         4
          5       0.00      0.00      0.00         6
          6       0.00      0.00      0.00         2

avg / total       0.64      0.71      0.65        49




  'precision', 'predicted', average, warn_for)


In [64]:
print('--------SVM-------' + '\n')
print(metrics.confusion_matrix(label_dev, predicted_svm),'\n')
# print('--------MNB-------' + '\n')
# print(metrics.confusion_matrix(label_dev, predicted_mnb),'\n')

--------SVM-------

[[15  1  1  0  0  0]
 [ 0  8  0  0  1  0]
 [ 0  0 11  0  0  0]
 [ 0  0  0  4  0  0]
 [ 0  0  1  0  5  0]
 [ 0  0  0  1  0  1]] 



<h3>Dot Product of Result Sets (Not sure if useful)</h3>

In [65]:
mnb = [int(i[1]) for i in output_mnb]
svm = [int(i[1]) for i in output_svm]
print(mnb[190:210])
print(svm[190:210])
np.inner(svm,mnb)

[1, 1, 1, 1, 1, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 5, 3, 5, 3, 3]
[4, 4, 3, 1, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 5, 6, 5, 5, 5]


67216

<h3>Let's look at our mistakes</h3>

In [66]:
for i in range(0,len(predicted_svm)):
    if predicted_svm[i] != label_dev[i]:
        print(reviews_training[i],predicted_svm[i],label_dev[i])

Regis Salon will give you a very personable and exceptional experience. Everyone will remember you and will always serve to give you the best hair cut, style, and color. They also wax eyebrows, and actually listen to you. You say not too thin, they won't do it thin. You say only cut a trim, they only cut a trim. I'm picky with my hair, and only trust a few people, everyone here I can trust although I do personally love Giti here.
47464 4 6
Mike did a fantastic job of helping me find everything I needed to care for my newly remodeled pebble tech pool. He was courteous and I enjoyed working with him. He guided me and best buys in the store, while maintaining supplies with great quality. I have always and will always continue to patronize their store.
7258 5 2
When you need the best gates, security doors, artistic iron decor or cool ideas, stop here!!! These guys are AWESOME!!! Good family business!!! They welcomed and helped us when we moved in to the neighborhood!
34226 3 5
"I've read s