<h3>Make Training and Dev Data</h3>

In [43]:
import random

def csv_to_array(filename):
    file = open(filename, 'r')
    file_text = file.read()
    # Get rid of the first three, which are examples
    split_text = file_text.split('|')[3:]
    i = 0
    tuple_text = []
    while i < len(split_text) - 1:
        tuple_text.append((split_text[i+1], split_text[i], int(i/2)))
        i += 2
    
    return tuple_text
    
def make_training_and_dev_data(all_data):
    random.shuffle(all_data)
    training_ratio = .999
    return all_data[:int(len(all_data) * training_ratio)], all_data[int(len(all_data) * training_ratio):]
#     return all_data[:(len(all_data) * training_ratio)], all_data[:(len(all_data) * training_ratio)] 


official_data = csv_to_array('yelp_data_official_training.csv')

for garbage in official_data:
    if garbage[1].isdigit() is False:
        print(garbage[1])
        break

training_data, dev_data = make_training_and_dev_data(official_data)

In [44]:
def reviews(data):
    reviews = []
    for i in range(len(data)):
        reviews.append(data[i][0])
    return reviews

def labels(data):
    reviews = []
    for i in range(len(data)):
        reviews.append(int(data[i][1]))
    return reviews

reviews_training = reviews(training_data)
label_training = labels(training_data)
reviews_dev = reviews(dev_data)
label_dev = labels(dev_data)

In [45]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(reviews_training)
X_train_counts.shape

from sklearn.feature_extraction.text import TfidfTransformer
tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
X_train_tf = tf_transformer.transform(X_train_counts)
X_train_tf.shape

(47952, 95608)

<h3>Naive Bayes Pipeline</h3>

In [46]:
import numpy as np
from sklearn.pipeline import Pipeline

from sklearn.naive_bayes import MultinomialNB

text_clf_mnb = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', MultinomialNB()),])

text_clf_mnb = text_clf_mnb.fit(reviews_training, label_training)
predicted_mnb = text_clf_mnb.predict(reviews_dev)
np.mean(predicted_mnb == label_dev)

0.77551020408163263

<h3>SVM Pipeline</h3>

In [None]:
from sklearn.linear_model import SGDClassifier
svm_classifier = SGDClassifier(loss='hinge',
                               penalty='l2',
                               alpha=1e-3,
                               n_iter=10,
                               random_state=42)
text_clf_svm = Pipeline([('vect', CountVectorizer(ngram_range=(1, 3), max_df=0.95, min_df=2,stop_words='english')),
                     #('tfidf', TfidfTransformer()), 
                     ('clf', svm_classifier),])
text_clf_svm = text_clf_svm.fit(reviews_training, label_training)
predicted_svm = text_clf_svm.predict(reviews_dev)
np.mean(predicted_svm == label_dev)

<h3>Load Test Data</h3>

In [48]:
import re

def test_file_to_array(filename):
    file = open(filename, 'r')
    file_text = file.read()
    # Skip the first one because it's the header starter shit
    split_text = re.split('\d+\|', file_text)[1:]
    tupled_text = []
    for i in range(len(split_text)):
        tupled_text.append((split_text[i], i))        
    return tupled_text

test_array = test_file_to_array('yelp_data_official_test_nocategories.csv')

<h3>Make Test Predictions For All Models</h3>

In [49]:
# predicted_svm = text_clf_svm.predict(test_reviews)
# predicted_mnb = text_clf_mnb.predict(test_reviews)

def test_prediction(model):
    output = []
    test_reviews = []
    test_id = []
    
    for i in test_array:
        test_reviews.append(i[0])
        test_id.append(i[1])
        
    if str(model) == 'svm':
        predicted = text_clf_svm.predict(test_reviews)
        
    elif model == 'mnb':
        predicted = text_clf_mnb.predict(test_reviews)
        
    for i in range(0,len(predicted)):
        output.append((int(i),predicted[i]))
    
    return output

output_mnb = test_prediction('mnb')
output_svm = test_prediction('svm')

<h3>Write Output to CSV: Specify Model</h3>

In [50]:
def write_output_to_csv(output, filename):
    file = open(filename, 'w')
    file.write('Id' + ',' + 'Category' + '\n' )
    if str(output) == str(output_mnb):
        for item in output_mnb:
            file.write(str(item[0]) + ',' + str(item[1]) + '\n')
    elif str(output) == str(output_svm):
        for item in output_svm:
            file.write(str(item[0]) + ',' + str(item[1]) + '\n')

write_output_to_csv(output_svm, 'kegel_boys_5.csv')

<h3>Classification Report and Confustion Matrix</h3>

In [51]:
from sklearn import metrics
print('--------SVM-------' + '\n')
print(metrics.classification_report(label_dev, predicted_svm)+ '\n')
print('--------MNB-------' + '\n')
print(metrics.classification_report(label_dev, predicted_mnb)+ '\n')

--------SVM-------

             precision    recall  f1-score   support

          1       1.00      1.00      1.00        16
          2       0.80      0.80      0.80         5
          3       1.00      0.94      0.97        16
          4       0.80      0.89      0.84         9
          6       1.00      1.00      1.00         3

avg / total       0.94      0.94      0.94        49


--------MNB-------

             precision    recall  f1-score   support

          1       0.62      1.00      0.76        16
          2       0.75      0.60      0.67         5
          3       1.00      0.88      0.93        16
          4       1.00      0.56      0.71         9
          6       0.00      0.00      0.00         3

avg / total       0.79      0.78      0.75        49




  'precision', 'predicted', average, warn_for)


In [52]:
print('--------SVM-------' + '\n')
print(metrics.confusion_matrix(label_dev, predicted_svm),'\n')
# print('--------MNB-------' + '\n')
# print(metrics.confusion_matrix(label_dev, predicted_mnb),'\n')

--------SVM-------

[[16  0  0  0  0]
 [ 0  4  0  1  0]
 [ 0  0 15  1  0]
 [ 0  1  0  8  0]
 [ 0  0  0  0  3]] 



<h3>Dot Product of Result Sets (Not sure if useful)</h3>

In [53]:
mnb = [int(i[1]) for i in output_mnb]
svm = [int(i[1]) for i in output_svm]
print(mnb[190:210])
print(svm[190:210])
np.inner(svm,mnb)

[2, 1, 1, 1, 1, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 5, 3, 5, 3, 3]
[4, 4, 3, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 5, 6, 5, 5, 5]


67057

<h3>Let's look at our mistakes</h3>

In [54]:
for i in range(0,len(predicted_svm)):
    if predicted_svm[i] != label_dev[i]:
        print(reviews_training[i],predicted_svm[i],label_dev[i])

"This dry cleaner may be the least professional, and poorly managed of any that I have ever been to. I have had atleast 3 bad experiences including poor/no pressing on dry cleaned items, items being returned without being cleaned, however, this last experience put a nail in the coffin. I took one of my favorite work dresses here to be routinely dry cleaned (they had cleaned it 3 previous times without a major issue), and it was returned to me COMPLETELY ruined. It had black ink ran all over the dress, even so much that it was on the tag they attached to it. I returned it to them, told that ""it happens sometimes"", and then told I would be compensated. It has been over a week, and still no call from the manager with a resolution, or time frame for reimbursement. Its clear that in this circumstance you get what you pay for!"
17726 4 2
"Everyone in my family has been going here for years and years, squeezing the last miles out of beloved shoes and boots. Every neighborhood ought to have 