In [230]:
import json
import sklearn.datasets
import sklearn.metrics
import sklearn.cross_validation
import sklearn.svm
import sklearn.naive_bayes
import sklearn.neighbors
import numpy as np
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk

In [231]:
tokenizer = RegexpTokenizer(r'\w+')
en_stop = set(stopwords.words('english'))
p_stemmer = PorterStemmer()
wordnet_lemmatizer = WordNetLemmatizer()
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package wordnet to /home/saket/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/saket/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [232]:
def measures(class_labels,predicted_class):
    print "accuracy",sklearn.metrics.accuracy_score(class_labels,predicted_class)
    print "confusion"
    confusion = sklearn.metrics.confusion_matrix(class_labels,predicted_class)
    print confusion
    print "macro f",f1_score(class_labels,predicted_class,average='macro')
    print "micro f",f1_score(class_labels,predicted_class,average='micro')
    print "weighted f",f1_score(class_labels,predicted_class,average='weighted')

In [233]:
def cleaning(docs,adjective):
    new_docs = []
    for document in docs:
        raw = document.lower()
        raw = raw.replace("<br /><br />", " ")
        tokens = tokenizer.tokenize(raw)
        if adjective:
            pos = nltk.pos_tag(tokens)
            adj_list = [tag[0] for tag in pos if tag[1] == 'JJ']
        stopped_tokens = [token for token in tokens if token not in en_stop]
        lemmed_tokens = [wordnet_lemmatizer.lemmatize(token) for token in stopped_tokens]
        stemmed_tokens = [p_stemmer.stem(token) for token in lemmed_tokens]
        if adjective:
            lemmed_adj_tokens = [wordnet_lemmatizer.lemmatize(token) for token in adj_list]
            stemmed_adj_tokens = [p_stemmer.stem(token) for token in lemmed_adj_tokens]
            stemmed_tokens = stemmed_tokens + stemmed_adj_tokens
        documentWords = ' '.join(stemmed_tokens)
        new_docs.append(documentWords)
    return new_docs

In [234]:
filepath = '../dataset/audio_train.json' 
documents = []
class_labels = []

documents_1 = []
class_labels_1 = []
documents_2 = []
class_labels_2 = []
documents_3 = []
class_labels_3 = []
summary_1 =[]
summary_2 =[]
summary_3 =[]

with open(filepath,'r') as fp:  
    line = fp.readline()
    while line:
        input_data = (json.loads(line))
        documents.append(input_data["reviewText"])
        class_label = float(input_data["overall"])
        if class_label<=2:
            class_labels.append(-1)
            documents_1.append(input_data["reviewText"])
            summary_1.append(input_data["summary"])
            class_labels_1.append(-1)
        elif class_label==3:
            class_labels.append(0)
            documents_2.append(input_data["reviewText"])
            summary_2.append(input_data["summary"])
            class_labels_2.append(0)
        else:
            class_labels.append(1)
            documents_3.append(input_data["reviewText"])
            summary_3.append(input_data["summary"]) 
            class_labels_3.append(1)
        line = fp.readline()

In [248]:
count_thresh = 5000
count_1 = 0
count_2 = 0
count_3 = 0
dev_documents = []
dev_labels = []
filepath = '../dataset/audio_dev.json' 
with open(filepath,'r') as fp:  
    line = fp.readline()
    while line:
        input_data = (json.loads(line))
        class_label = float(input_data["overall"])
        if class_label<=2 and count_1<count_thresh:
#         if class_label<=2:
            dev_documents.append(input_data["reviewText"])
            count_1 += 1
            dev_labels.append(-1)
        elif class_label==3 and count_2<count_thresh:
#         elif class_label==3:
            dev_documents.append(input_data["reviewText"])
            count_2 += 1
            dev_labels.append(0)
        elif count_3<count_thresh:
#         else:
            dev_documents.append(input_data["reviewText"])
            count_3 += 1
            dev_labels.append(1)
        line = fp.readline()


In [261]:
samples = 10000
sum_doc1 = []
sum_doc2 = []
sum_doc3 = []

for i in range(samples):
    sum_doc1.append(documents_1[i] + summary_1[i] + summary_1[i])
for i in range(samples):
    sum_doc2.append(documents_2[i] + summary_2[i] + summary_2[i])
for i in range(samples):
    sum_doc3.append(documents_3[i] + summary_3[i] + summary_3[i])

undersampled_docs = sum_doc1[0:samples] + sum_doc2[0:samples] + sum_doc3[0:samples]
undersampled_labels = class_labels_1[0:samples] + class_labels_2[0:samples] + class_labels_3[0:samples]

# new_docs = cleaning(undersampled_docs,0)

In [237]:
# dev_documents = cleaning(dev_documents,0)

In [268]:
vect = sklearn.feature_extraction.text.CountVectorizer()
vect.fit(undersampled_docs)
X_train_dtm = vect.transform(undersampled_docs)

In [269]:
vect_dev = sklearn.feature_extraction.text.CountVectorizer()
vect_dev.fit(dev_documents)
X_dev_dtm = vect.transform(dev_documents)

In [270]:
from sklearn import svm
clf = svm.LinearSVC().fit(X_train_dtm, undersampled_labels)

In [271]:
predicted_class =clf.predict(X_train_dtm)

In [272]:
predicted_dev_class = clf.predict(X_dev_dtm)

In [273]:
print "without advectives"
print "svm"
print "train"
measures(undersampled_labels,predicted_class)
print "dev"
measures(dev_labels,predicted_dev_class)

without advectives
svm
train
accuracy 0.97430952381
confusion
[[68502  1213   285]
 [ 1771 66973  1256]
 [  203   667 69130]]
macro f 0.974268406703
micro f 0.97430952381
weighted f 0.974268406703
dev
accuracy 0.636666666667
confusion
[[3428 1036  536]
 [1383 2409 1208]
 [ 461  826 3713]]
macro f 0.632425611903
micro f 0.636666666667
weighted f 0.632425611903


In [274]:
clf_nb = sklearn.naive_bayes.MultinomialNB()
clf_nb.fit(X_train_dtm,undersampled_labels)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [275]:
predicted_class =clf_nb.predict(X_train_dtm)
predicted_dev_class = clf_nb.predict(X_dev_dtm)

In [276]:
print "nb"
print "train"
measures(undersampled_labels,predicted_class)
print "dev"
measures(dev_labels,predicted_dev_class)

nb
train
accuracy 0.755142857143
confusion
[[54954 12330  2716]
 [12843 49884  7273]
 [ 7090  9168 53742]]
macro f 0.755990971192
micro f 0.755142857143
weighted f 0.755990971192
dev
accuracy 0.646333333333
confusion
[[3439 1266  295]
 [1196 2927  877]
 [ 729  942 3329]]
macro f 0.647338029713
micro f 0.646333333333
weighted f 0.647338029713
