# Bag of Words using TF-IDF

## Utilizou-se o sframe para ler os dados

In [1]:
import graphlab

## Carregamento dos dados

In [2]:
review = graphlab.SFrame.read_json('yelp_academic_dataset_review.json',orient='lines')

This non-commercial license of GraphLab Create for academic use is assigned to rodrigorcamo87@gmail.com and will expire on July 31, 2017.


[INFO] graphlab.cython.cy_server: GraphLab Create v2.1 started. Logging: C:\Users\rodri\AppData\Local\Temp\graphlab_server_1475020466.log.0


------------------------------------------------------
Inferred types from first 100 line(s) of file as 
column_type_hints=[dict]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------


# Classificando o review como positivo ou negativo a partir da média de estrelas dadas por cada usuário

## Agrupando os reviews por usuário e calculando a média de estrelas

In [3]:
users = review.groupby(key_columns='user_id',operations={'avg_stars':graphlab.aggregate.AVG('stars')})

In [4]:
users = users.to_dataframe()

In [5]:
u = users.set_index('user_id').to_dict()

## Definindo o sentimento, se for maior ou igual a média do usuário, retorna 1. Do contrário, 0.

In [6]:
def define_sentiment(user_id,stars):
    if stars >= u['avg_stars'].get(user_id):
        return 1
    else:
        return 0

In [7]:
review['sentiment'] = review.apply(lambda x: define_sentiment(x['user_id'],x['stars']))

## Escolhendo apenas os reviews de Phoenix

In [8]:
business = graphlab.SFrame.read_json('yelp_academic_dataset_business.json',orient='lines')

------------------------------------------------------
Inferred types from first 100 line(s) of file as 
column_type_hints=[dict]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------


In [20]:
data = review.join(business,on='business_id')

In [21]:
data = data[data['city']=='Phoenix']

In [22]:
positive = data[data['sentiment']==1].sample(.1,seed=10)
negative = data[data['sentiment']==0].sample(.1,seed=10)
data = negative.append(positive)

In [12]:
print 'Quantidade de reviews positivos: '+ str(len(data[data['sentiment']==1]))
print 'Quantidade de reviews negativos: '+ str(len(data[data['sentiment']==0]))
print 'Tamanho do dataset             : '+ str(len(data))

Quantidade de reviews positivos: 20404
Quantidade de reviews negativos: 9926
Tamanho do dataset             : 30330


In [13]:
from __future__ import division
print len(positive)/len(data)

0.672733267392


In [14]:
data = graphlab.cross_validation.shuffle(data,random_seed=10)

In [15]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
X = count_vect.fit_transform(data['text'])

In [16]:
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X)
X_train_tfidf.shape

(30330, 43376)

In [17]:
import numpy as np
target = np.asarray(data['sentiment'])

In [18]:
from sklearn.cross_validation import KFold
kf = KFold(len(data),10,shuffle=True)

In [19]:
from sklearn.svm import SVC
clf = SVC(kernel='linear')

In [21]:
k = 0
mean_accuracy = []
mean_f1_score = []
for train, val in kf:
    X_train = X_train_tfidf[train]
    y_train = target[train]
    X_val = X_train_tfidf[val]
    y_val =  target[val]
    clf.fit(X_train,y_train)
    predictions = clf.predict(X_val)
    y_val = graphlab.SArray(y_val)
    predictions = graphlab.SArray(predictions)
    accuracy = graphlab.evaluation.accuracy(y_val,predictions)
    f1_score = graphlab.evaluation.f1_score(y_val,predictions)
    mean_accuracy += [accuracy]
    mean_f1_score += [f1_score]
    print 'Fold {0}: Accuracy: {1} F-Score: {2}'.format(k,accuracy,f_score)
    k += 1
mean_accuracy = np.asarray(mean_accuracy)
mean_f1_score = np.asarray(mean_f1_score)

Fold 0: Accuracy: 0.791625453347 F-Score: 0.851083883129
Fold 1: Accuracy: 0.788658094296 F-Score: 0.851083883129
Fold 2: Accuracy: 0.773161885922 F-Score: 0.851083883129
Fold 3: Accuracy: 0.795911638642 F-Score: 0.851083883129
Fold 4: Accuracy: 0.787339268051 F-Score: 0.851083883129
Fold 5: Accuracy: 0.784701615562 F-Score: 0.851083883129
Fold 6: Accuracy: 0.77546983185 F-Score: 0.851083883129
Fold 7: Accuracy: 0.785031322123 F-Score: 0.851083883129
Fold 8: Accuracy: 0.773161885922 F-Score: 0.851083883129
Fold 9: Accuracy: 0.780415430267 F-Score: 0.851083883129


In [22]:
print 'Mean accuracy: {0}'.format(mean_accuracy.mean())
print 'Mean F-Score: {0}'.format(mean_f1_score.mean())

Mean accuracy: 0.783547642598
Mean F-Score: 0.845341606284


## Undersampling

In [26]:
perc = len(negative)/len(positive)
pos = positive.sample(perc,seed=10)
data_und = pos.append(negative)
print 'Quantidade de reviews positivos: '+ str(len(pos))
print 'Quantidade de reviews negativos: '+ str(len(negative))
print 'Tamanho do dataset             : '+ str(len(data_und))

Quantidade de reviews positivos: 10049
Quantidade de reviews negativos: 9926
Tamanho do dataset             : 19975


In [28]:
from __future__ import division
print len(pos)/len(data_und)

0.503078848561


In [29]:
data_und = graphlab.cross_validation.shuffle(data_und,random_seed=10)

In [30]:
count_vect = CountVectorizer()
X = count_vect.fit_transform(data_und['text'])

In [31]:
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X)
X_train_tfidf.shape

(19975, 36520)

In [32]:
target = np.asarray(data_und['sentiment'])

In [33]:
kf = KFold(len(data_und),10,shuffle=True)

In [35]:
k = 0
mean_accuracy = []
mean_f1_score = []
for train, val in kf:
    X_train = X_train_tfidf[train]
    y_train = target[train]
    X_val = X_train_tfidf[val]
    y_val =  target[val]
    clf.fit(X_train,y_train)
    predictions = clf.predict(X_val)
    y_val = graphlab.SArray(y_val)
    predictions = graphlab.SArray(predictions)
    accuracy = graphlab.evaluation.accuracy(y_val,predictions)
    f1_score = graphlab.evaluation.f1_score(y_val,predictions)
    mean_accuracy += [accuracy]
    mean_f1_score += [f1_score]
    print 'Fold {0}: Accuracy: {1} F-Score: {2}'.format(k,accuracy,f1_score)
    k += 1
mean_accuracy = np.asarray(mean_accuracy)
mean_f1_score = np.asarray(mean_f1_score)

Fold 0: Accuracy: 0.767767767768 F-Score: 0.770069375619
Fold 1: Accuracy: 0.766266266266 F-Score: 0.770515970516
Fold 2: Accuracy: 0.756256256256 F-Score: 0.757349277529
Fold 3: Accuracy: 0.754254254254 F-Score: 0.761997091614
Fold 4: Accuracy: 0.761261261261 F-Score: 0.761380690345
Fold 5: Accuracy: 0.765648472709 F-Score: 0.760245901639
Fold 6: Accuracy: 0.743114672008 F-Score: 0.741821841973
Fold 7: Accuracy: 0.762143214822 F-Score: 0.764735017335
Fold 8: Accuracy: 0.746119178768 F-Score: 0.745098039216
Fold 9: Accuracy: 0.755633450175 F-Score: 0.749486652977


In [36]:
print 'Mean accuracy: {0}'.format(mean_accuracy.mean())
print 'Mean F-Score: {0}'.format(mean_f1_score.mean())

Mean accuracy: 0.757846479429
Mean F-Score: 0.758269985876
