In [32]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split

In [2]:
# Read the three per-sentiment review files and the combined tagged file,
# in the same order as before (pos, neg, avg, tags).
dataPos, dataNeg, dataAvg, dataTag = (
    pd.read_csv(csv_path)
    for csv_path in (
        'dataset/pos_cleaned_25k_yelp.csv',
        'dataset/neg_cleaned_25k_yelp.csv',
        'dataset/avg_cleaned_25k_yelp.csv',
        'dataset/data_with_tags.csv',
    )
)

In [3]:
# Binary target: 1 where the usefulness column is > 0, otherwise 0.
# The tagged file spells its column 'Useful' (capitalised); the others use 'useful'.
for reviews, votes_col in (
    (dataPos, 'useful'),
    (dataNeg, 'useful'),
    (dataAvg, 'useful'),
    (dataTag, 'Useful'),
):
    reviews['label'] = (reviews[votes_col] > 0).astype(np.int64, copy = False)

In [None]:
# Preview the first five rows of the combined tagged data.
dataTag.head(n=5)

In [None]:
# Preview the first five rows of the positive-review data.
dataPos.head(n=5)

In [4]:
# Count label == 1 rows per file, then compare their total with the master file.
usefulPos = dataPos['label'].sum()
usefulNeg = dataNeg['label'].sum()
usefulAvg = dataAvg['label'].sum()

print("useful positive reviews :", usefulPos)
print("useful negative reviews :", usefulNeg)
print("useful average revires :", usefulAvg)
print("sum of individual useful :", usefulPos + usefulNeg + usefulAvg)
print("sum of master: ", dataTag['label'].sum())

useful positive reviews : 11161
useful negative reviews : 14297
useful average revires : 12313
sum of individual useful : 37771
sum of master:  37769


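Need to investigate the above discrepancy: the three individual files sum to 37771 useful reviews, but the master file has only 37769 (two fewer). It appears as though three reviews have been lost from the master file, which has 74997 rows instead of 3 × 25000 = 75000.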

In [None]:
# Pick three sample positive reviews by index label (1, 2, 3); label 0 is not included.
review1, review2, review3 = (dataPos['text'][row_label] for row_label in (1, 2, 3))

In [None]:
# Show each sample review followed by an extra blank line.
for sample_review in (review1, review2, review3):
    print(sample_review + '\n')

## Bag of words

In [5]:
# Bag-of-words counts for the positive reviews; the learned vocabulary stays in count_vectPos.
count_vectPos = CountVectorizer()
X_train_countsPos = count_vectPos.fit_transform(raw_documents=dataPos['text'])
print(X_train_countsPos.shape)  # (documents, vocabulary terms)

(25000, 44369)


In [6]:
# Bag-of-words counts for the negative reviews; the learned vocabulary stays in count_vectNeg.
count_vectNeg = CountVectorizer()
X_train_countsNeg = count_vectNeg.fit_transform(raw_documents=dataNeg['text'])
print(X_train_countsNeg.shape)  # (documents, vocabulary terms)

(25000, 45940)


In [7]:
# Bag-of-words counts for the average reviews; the learned vocabulary stays in count_vectAvg.
count_vectAvg = CountVectorizer()
X_train_countsAvg = count_vectAvg.fit_transform(raw_documents=dataAvg['text'])
print(X_train_countsAvg.shape)  # (documents, vocabulary terms)

(25000, 48904)


In [8]:
# Bag-of-words counts for the combined tagged data (text column is 'Review' in this file).
count_vectTag = CountVectorizer()
X_train_countsTag = count_vectTag.fit_transform(raw_documents=dataTag['Review'])
print(X_train_countsTag.shape)  # (documents, vocabulary terms)

(74997, 76137)


## tf

In [9]:
# Term-frequency features (idf weighting switched off) for the positive reviews.
tf_transformerPos = TfidfTransformer(use_idf=False)
tf_transformerPos.fit(X_train_countsPos)
X_train_tfPos = tf_transformerPos.transform(X_train_countsPos)
print(X_train_tfPos.shape)

(25000, 44369)


In [10]:
# Term-frequency features (idf weighting switched off) for the negative reviews.
tf_transformerNeg = TfidfTransformer(use_idf=False)
tf_transformerNeg.fit(X_train_countsNeg)
X_train_tfNeg = tf_transformerNeg.transform(X_train_countsNeg)
print(X_train_tfNeg.shape)

(25000, 45940)


In [11]:
# Term-frequency features (idf weighting switched off) for the average reviews.
tf_transformerAvg = TfidfTransformer(use_idf=False)
tf_transformerAvg.fit(X_train_countsAvg)
X_train_tfAvg = tf_transformerAvg.transform(X_train_countsAvg)
print(X_train_tfAvg.shape)

(25000, 48904)


In [12]:
# Term-frequency features (idf weighting switched off) for the combined tagged data.
# Transform with the transformer fitted on the Tag counts, not the one fitted on the
# positive reviews, so this cell depends only on its own data set.
tf_transformerTag = TfidfTransformer(use_idf=False).fit(X_train_countsTag)
X_train_tfTag = tf_transformerTag.transform(X_train_countsTag)
print(X_train_tfTag.shape)

(74997, 76137)


## tf-idf

In [13]:
# tf-idf features for the positive reviews (idf weighting on, which is the default).
tfidf_transformerPos = TfidfTransformer(use_idf=True)
X_train_tfidfPos = tfidf_transformerPos.fit_transform(X_train_countsPos)
print(X_train_tfidfPos.shape)

(25000, 44369)


In [14]:
# tf-idf features for the negative reviews (idf weighting on, which is the default).
tfidf_transformerNeg = TfidfTransformer(use_idf=True)
X_train_tfidfNeg = tfidf_transformerNeg.fit_transform(X_train_countsNeg)
print(X_train_tfidfNeg.shape)

(25000, 45940)


In [15]:
# tf-idf features for the average reviews (idf weighting on, which is the default).
tfidf_transformerAvg = TfidfTransformer(use_idf=True)
X_train_tfidfAvg = tfidf_transformerAvg.fit_transform(X_train_countsAvg)
print(X_train_tfidfAvg.shape)

(25000, 48904)


In [16]:
# tf-idf features for the combined tagged data (idf weighting on, which is the default).
tfidf_transformerTag = TfidfTransformer(use_idf=True)
X_train_tfidfTag = tfidf_transformerTag.fit_transform(X_train_countsTag)
print(X_train_tfidfTag.shape)

(74997, 76137)


## models

In [None]:
# Naive Bayes baseline on the combined tagged data. The features must be the Tag
# tf-idf matrix so that its rows line up with dataTag['label']; no variable named
# X_train_tfidf is defined anywhere in this notebook.
clf = MultinomialNB().fit(X_train_tfidfTag, dataTag['label'])

In [17]:
# Hinge-loss, L2-regularised linear classifier trained by SGD on the positive-review
# tf-idf features; fixed seed, exactly 5 passes (tol=None disables early stopping).
clfPos = SGDClassifier(
    loss='hinge',
    penalty='l2',
    alpha=1e-3,
    random_state=42,
    max_iter=5,
    tol=None,
).fit(X_train_tfidfPos, dataPos['label'])

In [18]:
# Hinge-loss, L2-regularised linear classifier trained by SGD on the negative-review
# tf-idf features; fixed seed, exactly 5 passes (tol=None disables early stopping).
clfNeg = SGDClassifier(
    loss='hinge',
    penalty='l2',
    alpha=1e-3,
    random_state=42,
    max_iter=5,
    tol=None,
).fit(X_train_tfidfNeg, dataNeg['label'])

In [19]:
# Hinge-loss, L2-regularised linear classifier trained by SGD on the average-review
# tf-idf features; fixed seed, exactly 5 passes (tol=None disables early stopping).
clfAvg = SGDClassifier(
    loss='hinge',
    penalty='l2',
    alpha=1e-3,
    random_state=42,
    max_iter=5,
    tol=None,
).fit(X_train_tfidfAvg, dataAvg['label'])

In [20]:
# Hinge-loss, L2-regularised linear classifier trained by SGD on the combined tagged
# tf-idf features; fixed seed, exactly 5 passes (tol=None disables early stopping).
clfTag = SGDClassifier(
    loss='hinge',
    penalty='l2',
    alpha=1e-3,
    random_state=42,
    max_iter=5,
    tol=None,
).fit(X_train_tfidfTag, dataTag['label'])

## predictions


In [21]:
# In-sample predictions: each classifier scores the same matrix it was trained on.
train_predictionsPos, train_predictionsNeg, train_predictionsAvg, train_predictionsTag = (
    fitted_clf.predict(train_features)
    for fitted_clf, train_features in (
        (clfPos, X_train_tfidfPos),
        (clfNeg, X_train_tfidfNeg),
        (clfAvg, X_train_tfidfAvg),
        (clfTag, X_train_tfidfTag),
    )
)

## accuracy

In [22]:
# Training-set accuracy per data set (labels vs. the in-sample predictions).
accuracyPos = accuracy_score(y_true=dataPos['label'], y_pred=train_predictionsPos)
accuracyNeg = accuracy_score(y_true=dataNeg['label'], y_pred=train_predictionsNeg)
accuracyAvg = accuracy_score(y_true=dataAvg['label'], y_pred=train_predictionsAvg)
accuracyTag = accuracy_score(y_true=dataTag['label'], y_pred=train_predictionsTag)

In [23]:
# Report the four training-set accuracies, one per line.
for caption, accuracy_value in (
    ("positive accuracy is :", accuracyPos),
    ("nagative accuracy is :", accuracyNeg),
    ("average accuracy is :", accuracyAvg),
    ("tag accuracy is :", accuracyTag),
):
    print(caption, accuracy_value)

positive accuracy is : 0.6454
nagative accuracy is : 0.57744
average accuracy is : 0.63692
tag accuracy is : 0.6182113951224716


## confusion matrix

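#top row is the reviews whose true label is 0 (not useful)
#bottom row is the reviews whose true label is 1 (useful); columns are the predicted label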
<br>   0 1
<br>0 TN FP
<br>1 FN TP

In [24]:
# Training-set confusion matrix for the positive reviews: raw counts, then each
# row normalised over the true label. Other arguments are left at their defaults.
confusionMatrixPos = confusion_matrix(dataPos['label'], train_predictionsPos)
confusionMatrixNormPos = confusion_matrix(
    dataPos['label'], train_predictionsPos, normalize='true'
)
print(confusionMatrixPos)
print(confusionMatrixNormPos)

[[12024  1815]
 [ 7050  4111]]
[[0.86884891 0.13115109]
 [0.63166383 0.36833617]]


In [25]:
# Training-set confusion matrix for the negative reviews: raw counts, then each
# row normalised over the true label. Other arguments are left at their defaults.
confusionMatrixNeg = confusion_matrix(dataNeg['label'], train_predictionsNeg)
confusionMatrixNormNeg = confusion_matrix(
    dataNeg['label'], train_predictionsNeg, normalize='true'
)
print(confusionMatrixNeg)
print(confusionMatrixNormNeg)

[[  187 10516]
 [   48 14249]]
[[0.01747174 0.98252826]
 [0.00335735 0.99664265]]


In [26]:
# Training-set confusion matrix for the average reviews: raw counts, then each
# row normalised over the true label. Other arguments are left at their defaults.
confusionMatrixAvg = confusion_matrix(dataAvg['label'], train_predictionsAvg)
confusionMatrixNormAvg = confusion_matrix(
    dataAvg['label'], train_predictionsAvg, normalize='true'
)
print(confusionMatrixAvg)
print(confusionMatrixNormAvg)

[[8164 4523]
 [4554 7759]]
[[0.64349334 0.35650666]
 [0.369853   0.630147  ]]


In [27]:
# Training-set confusion matrix for the combined tagged data: raw counts, then each
# row normalised over the true label. Other arguments are left at their defaults.
confusionMatrixTag = confusion_matrix(dataTag['label'], train_predictionsTag)
confusionMatrixNormTag = confusion_matrix(
    dataTag['label'], train_predictionsTag, normalize='true'
)
print(confusionMatrixTag)
print(confusionMatrixNormTag)

[[22174 15054]
 [13579 24190]]
[[0.59562695 0.40437305]
 [0.35952765 0.64047235]]


## split into test and training

In [35]:
def split_data(dataFrame, test_size = 0.1, random_state = 42):
    """Split ``dataFrame`` into a training and a test subset and print their sizes.

    Thin wrapper around ``sklearn.model_selection.train_test_split``.

    Parameters
    ----------
    dataFrame : object with a ``.shape`` attribute (e.g. a pandas DataFrame)
        The data to split; passed unchanged to ``train_test_split``.
    test_size : float or int, default 0.1
        Forwarded to ``train_test_split`` as ``test_size``.
    random_state : int, default 42
        Forwarded to ``train_test_split`` as ``random_state`` so the split is repeatable.

    Returns
    -------
    tuple
        ``(train_set, test_set)`` exactly as returned by ``train_test_split``.
    """
    train_set, test_set = train_test_split(dataFrame, test_size = test_size, random_state = random_state)

    # Only the row counts are reported; reading shape[0] (instead of unpacking a
    # 2-tuple) also works for 1-D inputs such as a Series.
    mtrain = train_set.shape[0]
    mtest = test_set.shape[0]

    print('Number of instances in training set: %d\nNumber of instances in test set: %d\n' %(mtrain, mtest))

    return train_set, test_set

In [47]:
# 90/10 train/test split of the average reviews, spelling out split_data's defaults.
trainAVG, testAVG = split_data(dataAvg, test_size = 0.1, random_state = 42)

Number of instances in training set: 22500
Number of instances in test set: 2500



In [49]:
# Bag-of-words counts fitted on the training split only, so the vocabulary
# is learned without looking at the held-out test rows.
count_vectAvg2 = CountVectorizer()
X_train_countsAvg2 = count_vectAvg2.fit_transform(raw_documents=trainAVG['text'])
print(X_train_countsAvg2.shape)  # (documents, vocabulary terms)

(22500, 46478)


In [51]:
# Term-frequency features (idf weighting switched off) for the training split.
tf_transformerAvg2 = TfidfTransformer(use_idf=False)
tf_transformerAvg2.fit(X_train_countsAvg2)
X_train_tfAvg2 = tf_transformerAvg2.transform(X_train_countsAvg2)
print(X_train_tfAvg2.shape)

(22500, 46478)


In [54]:
# tf-idf weights learned from the training split; the fitted transformer is
# reused later to transform the test split.
tfidf_transformerAvg2 = TfidfTransformer()
tfidf_transformerAvg2.fit(X_train_countsAvg2)
X_train_tfidfAvg2 = tfidf_transformerAvg2.transform(X_train_countsAvg2)
print(X_train_tfidfAvg2.shape)

(22500, 46478)


In [56]:
# Same SGD configuration as the full-data models, trained on the training split only.
clfAvg2 = SGDClassifier(
    loss='hinge',
    penalty='l2',
    alpha=1e-3,
    random_state=42,
    max_iter=5,
    tol=None,
).fit(X_train_tfidfAvg2, trainAVG['label'])

In [57]:
# In-sample predictions on the training split.
train_predictionsAvg2 = clfAvg2.predict(X=X_train_tfidfAvg2)

In [59]:
# Accuracy on the training split (labels vs. in-sample predictions).
accuracyAvg2 = accuracy_score(y_true=trainAVG['label'], y_pred=train_predictionsAvg2)
print("average accuracy is :", accuracyAvg2)

average accuracy is : 0.6368444444444444


In [60]:
# Confusion matrix on the training split: raw counts, then each row normalised
# over the true label. Other arguments are left at their defaults.
confusionMatrixAvg2 = confusion_matrix(trainAVG['label'], train_predictionsAvg2)
confusionMatrixNormAvg2 = confusion_matrix(
    trainAVG['label'], train_predictionsAvg2, normalize='true'
)
print(confusionMatrixAvg2)
print(confusionMatrixNormAvg2)

[[7174 4247]
 [3924 7155]]
[[0.62814114 0.37185886]
 [0.35418359 0.64581641]]


## test

In [62]:
# Vectorise the test split with the vocabulary fitted on the training split
# (transform only, no refit), so both matrices share the same columns.
X_test_countsAvg2 = count_vectAvg2.transform(raw_documents=testAVG['text'])
print(X_test_countsAvg2.shape)

(2500, 46478)


In [63]:
# Apply the tf-idf weights learned on the training split to the test counts.
X_test_tfidfAvg2 = tfidf_transformerAvg2.transform(X=X_test_countsAvg2)
print(X_test_tfidfAvg2.shape)

(2500, 46478)


In [64]:
# Predictions on the held-out test split.
test_predictionsAvg = clfAvg2.predict(X=X_test_tfidfAvg2)

In [66]:
# Accuracy on the held-out test split of the average reviews.
testaccuracyAvg2 = accuracy_score(y_true=testAVG['label'], y_pred=test_predictionsAvg)
print("average accuracy is :", testaccuracyAvg2)

average accuracy is : 0.6264


In [67]:
# Confusion matrix on the held-out test split: raw counts, then each row
# normalised over the true label. Other arguments are left at their defaults.
testconfusionMatrixAvg2 = confusion_matrix(testAVG['label'], test_predictionsAvg)
testconfusionMatrixNormAvg2 = confusion_matrix(
    testAVG['label'], test_predictionsAvg, normalize='true'
)
print(testconfusionMatrixAvg2)
print(testconfusionMatrixNormAvg2)

[[766 500]
 [434 800]]
[[0.60505529 0.39494471]
 [0.35170178 0.64829822]]
