# Bag of Words using TF-IDF

## Utilizou-se o sframe para ler os dados

In [1]:
import graphlab

## Carregamento dos dados

In [2]:
review = graphlab.SFrame.read_json('yelp_academic_dataset_review.json',orient='lines')

This non-commercial license of GraphLab Create for academic use is assigned to rodrigorcamo87@gmail.com and will expire on July 31, 2017.


[INFO] graphlab.cython.cy_server: GraphLab Create v2.1 started. Logging: C:\Users\rodri\AppData\Local\Temp\graphlab_server_1474420839.log.0


------------------------------------------------------
Inferred types from first 100 line(s) of file as 
column_type_hints=[dict]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------


# Classificando o review como positivo ou negativo a partir da média de estrelas dadas por cada usuário

## Agrupando os reviews por usuário e calculando a média de estrelas

In [3]:
users = review.groupby(key_columns='user_id',operations={'avg_stars':graphlab.aggregate.AVG('stars')})

In [4]:
users = users.to_dataframe()

In [5]:
u = users.set_index('user_id').to_dict()

## Definindo o sentimento, se for maior ou igual a média do usuário, retorna 1. Do contrário, 0.

In [6]:
def define_sentiment(user_id,stars):
    if stars >= u['avg_stars'].get(user_id):
        return 1
    else:
        return 0

In [7]:
review['sentiment'] = review.apply(lambda x: define_sentiment(x['user_id'],x['stars']))

## Escolhendo apenas os reviews de Phoenix

In [8]:
business = graphlab.SFrame.read_json('yelp_academic_dataset_business.json',orient='lines')

------------------------------------------------------
Inferred types from first 100 line(s) of file as 
column_type_hints=[dict]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------


In [25]:
data = review.join(business,on='business_id')

In [26]:
data = data[data['city']=='Phoenix']

In [27]:
positive = data[data['sentiment']==1].sample(.1,seed=10)
negative = data[data['sentiment']==0].sample(.1,seed=10)
data = negative.append(positive)

In [28]:
print 'Quantidade de reviews positivos: '+ str(len(data[data['sentiment']==1]))
print 'Quantidade de reviews negativos: '+ str(len(data[data['sentiment']==0]))
print 'Tamanho do dataset             : '+ str(len(data))

Quantidade de reviews positivos: 20414
Quantidade de reviews negativos: 9915
Tamanho do dataset             : 30329


In [29]:
from __future__ import division
print len(positive)/len(data)

0.673085166013


In [30]:
data = graphlab.cross_validation.shuffle(data,random_seed=10)

In [31]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
X = count_vect.fit_transform(data['text'])

In [32]:
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X)
X_train_tfidf.shape

(30329, 43218)

In [33]:
import numpy as np
target = np.asarray(data['sentiment'])

In [34]:
from sklearn.cross_validation import KFold
kf = KFold(len(data),10,shuffle=True)

In [35]:
from sklearn.svm import SVC
clf = SVC(kernel='linear')

In [36]:
k = 0
results = []
for train, val in kf:
    X_train = X_train_tfidf[train]
    y_train = target[train]
    X_val = X_train_tfidf[val]
    y_val =  target[val]
    clf.fit(X_train,y_train)
    predictions = clf.predict(X_val)
    y_val = graphlab.SArray(y_val)
    predictions = graphlab.SArray(predictions)
    accuracy = graphlab.evaluation.accuracy(y_val,predictions)
    results += [accuracy]
    print 'Fold {0}: Accuracy {1}'.format(k,accuracy)
    k += 1
results = np.asarray(results)

Fold 0: Accuracy 0.78404220244
Fold 1: Accuracy 0.793933399275
Fold 2: Accuracy 0.769205407188
Fold 3: Accuracy 0.785361028684
Fold 4: Accuracy 0.780415430267
Fold 5: Accuracy 0.785690735246
Fold 6: Accuracy 0.780085723706
Fold 7: Accuracy 0.783712495879
Fold 8: Accuracy 0.782723376195
Fold 9: Accuracy 0.78364116095


In [37]:
print 'Mean accuracy: {0}'.format(results.mean())

Mean accuracy: 0.782881095983
