<h3>Make Training and Dev Data</h3>

In [1]:
import random

def csv_to_array(filename):
    file = open(filename, 'r')
    file_text = file.read()
    # Get rid of the first three, which are examples
    split_text = file_text.split('|')[3:]
    i = 0
    tuple_text = []
    while i < len(split_text) - 1:
        tuple_text.append((split_text[i+1], split_text[i], int(i/2)))
        i += 2
    
    return tuple_text
    
def make_training_and_dev_data(all_data):
    random.shuffle(all_data)
    training_ratio = .9
    return all_data[:int(len(all_data) * training_ratio)], all_data[int(len(all_data) * training_ratio):]
#     return all_data[:(len(all_data) * training_ratio)], all_data[:(len(all_data) * training_ratio)] 


official_data = csv_to_array('yelp_data_official_training.csv')

for garbage in official_data:
    if garbage[1].isdigit() is False:
        print(garbage[1])
        break

training_data, dev_data = make_training_and_dev_data(official_data)

In [2]:
def reviews(data):
    reviews = []
    for i in range(len(training_data)):
        reviews.append(training_data[i][0])
    return reviews

def labels(data):
    reviews = []
    for i in range(len(training_data)):
        reviews.append(int(training_data[i][1]))
    return reviews

reviews_training = reviews(training_data)
label_training = labels(training_data)
reviews_dev = reviews(dev_data)
label_dev = labels(dev_data)

In [3]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(reviews_training)
X_train_counts.shape

from sklearn.feature_extraction.text import TfidfTransformer
tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
X_train_tf = tf_transformer.transform(X_train_counts)
X_train_tf.shape

  if 'order' in inspect.getargspec(np.copy)[0]:


(43200, 88888)

<h3>Naive Bayes Pipeline</h3>

In [4]:
import numpy as np
from sklearn.pipeline import Pipeline

from sklearn.naive_bayes import MultinomialNB

text_clf_mnb = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', MultinomialNB()),])

text_clf_mnb = text_clf_mnb.fit(reviews_training, label_training)
predicted_mnb = text_clf_mnb.predict(reviews_dev)
np.mean(predicted_mnb == label_dev)

0.74337962962962967

<h3>SVM Pipeline</h3>

In [5]:
from sklearn.linear_model import SGDClassifier

text_clf_svm = Pipeline([('vect', CountVectorizer(max_df=0.95, min_df=2,stop_words='english')),
                     #('tfidf', TfidfTransformer()), 
                     ('clf', SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, n_iter=5, random_state=42)),])
text_clf_svm = text_clf_svm.fit(reviews_training, label_training)
predicted_svm = text_clf_svm.predict(reviews_dev)
np.mean(predicted_svm == label_dev)

0.93895833333333334

<h3>Load Test Data</h3>

In [6]:
import re

def test_file_to_array(filename):
    file = open(filename, 'r')
    file_text = file.read()
    # Skip the first one because it's the header starter shit
    split_text = re.split('\d+\|', file_text)[1:]
    tupled_text = []
    for i in range(len(split_text)):
        tupled_text.append((split_text[i], i))        
    return tupled_text

test_array = test_file_to_array('yelp_data_official_test_nocategories.csv')

<h3>Make Test Predictions For All Models</h3>

In [7]:
# predicted_svm = text_clf_svm.predict(test_reviews)
# predicted_mnb = text_clf_mnb.predict(test_reviews)

def test_prediction(model):
    output = []
    test_reviews = []
    test_id = []
    
    for i in test_array:
        test_reviews.append(i[0])
        test_id.append(i[1])
        
    if str(model) == 'svm':
        predicted = text_clf_svm.predict(test_reviews)
        
    elif model == 'mnb':
        predicted = text_clf_mnb.predict(test_reviews)
        
    for i in range(0,len(predicted)):
        output.append((int(i),predicted[i]))
    
    return output

output_mnb = test_prediction('mnb')
output_svm = test_prediction('svm')

<h3>Write Output to CSV: Specify Model</h3>

In [8]:
def write_output_to_csv(output, filename):
    file = open(filename, 'w')
    file.write('Id' + ',' + 'Category' + '\n' )
    if str(output) == str(output_mnb):
        for item in output_mnb:
            file.write(str(item[0]) + ',' + str(item[1]) + '\n')
    elif str(output) == str(output_svm):
        for item in output_svm:
            file.write(str(item[0]) + ',' + str(item[1]) + '\n')

write_output_to_csv(output_svm, 'kuvinka_test.csv')

<h3>Classification Report and Confustion Matrix</h3>

In [9]:
from sklearn import metrics
print('--------SVM-------' + '\n')
print(metrics.classification_report(label_training, predicted_svm)+ '\n')
print('--------MNB-------' + '\n')
print(metrics.classification_report(label_training, predicted_mnb)+ '\n')

--------SVM-------

             precision    recall  f1-score   support

          1       0.96      0.96      0.96     13558
          2       0.92      0.93      0.92      7188
          3       0.94      0.97      0.95     10218
          4       0.92      0.90      0.91      7043
          5       0.96      0.92      0.94      3340
          6       0.94      0.86      0.90      1853

avg / total       0.94      0.94      0.94     43200


--------MNB-------

             precision    recall  f1-score   support

          1       0.65      0.98      0.78     13558
          2       0.91      0.74      0.82      7188
          3       0.75      0.94      0.84     10218
          4       0.98      0.49      0.65      7043
          5       0.93      0.11      0.20      3340
          6       1.00      0.00      0.01      1853

avg / total       0.81      0.74      0.70     43200




In [10]:
print('--------SVM-------' + '\n')
print(metrics.confusion_matrix(label_training, predicted_svm),'\n')
print('--------MNB-------' + '\n')
print(metrics.confusion_matrix(label_training, predicted_mnb),'\n')

--------SVM-------

[[13004   100   309   132     3    10]
 [  117  6687    64   309     2     9]
 [  162    53  9896    92     8     7]
 [  159   415   127  6327     2    13]
 [   32    16   159    11  3057    65]
 [   80    25    17    39   100  1592]] 

--------MNB-------

[[13300     5   253     0     0     0]
 [ 1634  5330   166    58     0     0]
 [  550    10  9651     7     0     0]
 [ 2759   458   369  3457     0     0]
 [  664     4  2304     1   367     0]
 [ 1668    24   116    10    26     9]] 



<h3>Dot Product of Result Sets (Not sure if useful)</h3>

In [11]:
mnb = [int(i[1]) for i in output_mnb]
svm = [int(i[1]) for i in output_svm]
print(mnb[190:210])
print(svm[190:210])
np.inner(svm,mnb)

[2, 1, 1, 1, 1, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 5, 3, 5, 3, 3]
[4, 4, 3, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 5, 5, 5, 5, 5]


67159