In [11]:
import json
import sklearn.datasets
import sklearn.metrics
import sklearn.cross_validation
import sklearn.svm
import sklearn.naive_bayes
import sklearn.neighbors
import numpy as np
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk.sentiment
import pickle



In [2]:
tokenizer = RegexpTokenizer(r'\w+')
en_stop = set(stopwords.words('english'))
p_stemmer = PorterStemmer()
wordnet_lemmatizer = WordNetLemmatizer()
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
train_pickle = "lemmed_doc.pkl"
test_pickle = "lemmed_dev.pkl"

[nltk_data] Downloading package wordnet to /home/saket/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/saket/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [3]:
def measures(class_labels,predicted_class):
    print "accuracy",sklearn.metrics.accuracy_score(class_labels,predicted_class)
    print "confusion"
    confusion = sklearn.metrics.confusion_matrix(class_labels,predicted_class)
    print confusion
    print "macro f",sklearn.metrics.f1_score(class_labels,predicted_class,average='macro')
    print "micro f",sklearn.metrics.f1_score(class_labels,predicted_class,average='micro')
    print "weighted f",sklearn.metrics.f1_score(class_labels,predicted_class,average='weighted')

In [4]:
def cleaning(docs,adjective):
    new_docs = []
    for document in docs:
        raw = document.lower()
        raw = raw.replace("<br /><br />", " ")
        tokens = tokenizer.tokenize(raw)
        if adjective:
            pos = nltk.pos_tag(tokens)
            adj_list = [tag[0] for tag in pos if tag[1] == 'JJ']
        stopped_tokens = [token for token in tokens if token not in en_stop]
        stemmed_tokens = [p_stemmer.stem(token) for token in stopped_tokens]
        if adjective:
            stemmed_adj_tokens = [p_stemmer.stem(token) for token in adj_list]
            stemmed_tokens = stemmed_tokens + stemmed_adj_tokens
        documentWords = ' '.join(stemmed_tokens)
        new_docs.append(documentWords)
    return new_docs

In [5]:
def cleaning2(sample,file_name,adjective):
    file_name = '../dataset/' + file_name
    with open(file_name, 'rb') as f:
        lemmed_tokens = pickle.load(f)
    new_docs = []
    for i in range(min(sample,len(lemmed_tokens))):
        token_list = lemmed_tokens[i]
        if adjective:
            adj_list = [tag[0] for tag in token_list if tag[1] == 'JJ']
        stopped_tokens = [token[0] for token in token_list if token[0] not in en_stop]
        if adjective:
            stopped_tokens = stopped_tokens + adj_list
        documentWords = ' '.join(stopped_tokens)
        new_docs.append(documentWords)
    return new_docs

In [38]:
filepath = '../dataset/audio_train.json' 
documents = []
class_labels = []
summary = []

with open(filepath,'r') as fp:  
    line = fp.readline()
    while line:
        input_data = (json.loads(line))
        documents.append(input_data["reviewText"])
        summary.append(input_data["summary"])
        class_label = float(input_data["overall"])
        if class_label<=2:
            class_labels.append(-1)
        elif class_label==3:
            class_labels.append(0)
        else:
            class_labels.append(1)
        line = fp.readline()

In [7]:
dev_documents = []
dev_labels = []
filepath = '../dataset/audio_dev.json' 
dev_summary = []
with open(filepath,'r') as fp:  
    line = fp.readline()
    while line:
        input_data = (json.loads(line))
        class_label = float(input_data["overall"])
        dev_summary.append(input_data["summary"])
        if class_label<=2:
            dev_documents.append(input_data["reviewText"])
            dev_labels.append(-1)
        elif class_label==3:
            dev_documents.append(input_data["reviewText"])
            dev_labels.append(0)
        else:
            dev_documents.append(input_data["reviewText"])
            dev_labels.append(1)
        line = fp.readline()


In [8]:
def under_sample(sample_size,documents,labels,summary):
    counter_1 = 0
    counter_2 = 0
    counter_3 = 0
    undersampled_docs = []
    undersampled_labels = []
    undersampled_summary = []
    i = 0
    for i in range(len(documents)):
        if labels[i]==-1 and counter_1 < sample_size:
            undersampled_docs.append(documents[i])
            undersampled_labels.append(labels[i])
            undersampled_summary.append(summary[i])
            counter_1 += 1
        elif labels[i]==0 and counter_2 < sample_size:
            undersampled_docs.append(documents[i])
            undersampled_labels.append(labels[i])
            undersampled_summary.append(summary[i])
            counter_2 += 1
        elif labels[i]==1 and counter_3 < sample_size:
            undersampled_docs.append(documents[i])
            undersampled_labels.append(labels[i])
            undersampled_summary.append(summary[i])
            counter_3 += 1
        if counter_1 == sample_size and counter_2 == sample_size and counter_3 == sample_size:
            break
        i+=1
    return undersampled_docs,undersampled_labels,undersampled_summary

In [12]:
def negate(documents):
    new_documents = []
    for doc in documents:
        words = doc.split()
        new_words = nltk.sentiment.util.mark_negation(words, double_neg_flip=False, shallow=False)
        newdocument = ' '.join(new_words)
        new_documents.append(newdocument)
    return new_documents

In [39]:
# with open('../pickles/complete_train_300000_clean.pkl', 'rb') as f:
#     undersampled_docs1 = pickle.load(f)
# with open('../pickles/complete_train_800000_clean.pkl', 'rb') as f:
#     undersampled_docs2 = pickle.load(f)



# undersampled_docs = undersampled_docs1 + undersampled_docs2
undersampled_docs = documents[0:100000]
for i in range(len(undersampled_docs)):
    undersampled_docs[i] = undersampled_docs[i] + summary[i] + summary[i] + summary[i]
undersampled_docs = negate(undersampled_docs)
undersampled_labels = class_labels[0:100000]


In [None]:
# udev_documents,udev_labels,udev_summary = under_sample(4000,dev_documents,dev_labels,dev_summary)
# udev_documents = dev_documents[0:5000]
# with open('../pickles/complete_dev_clean.pkl', 'rb') as f:
#     udev_documents = pickle.load(f)
udev_documents = dev_documents
udev_documents = negate(udev_documents)
udev_labels = dev_labels
# udev_labels = dev_labels[0:5000]
# print len(undersampled_docs)

In [None]:
vect = sklearn.feature_extraction.text.TfidfVectorizer(ngram_range=(1,2),min_df=2,max_df=0.8,stop_words='english')
# vect.fit(undersampled_docs)
X_train_dtm = vect.fit_transform(undersampled_docs)

In [None]:
X_dev_dtm = vect.transform(udev_documents)

In [None]:
from sklearn import svm
clf = svm.LinearSVC(C=1,class_weight='balanced').fit(X_train_dtm, undersampled_labels)
# with open('../pickles/svm_74000_undersampled_1.pkl', 'wb') as f:
#     pickle.dump(clf,f)

In [None]:
predicted_class =clf.predict(X_train_dtm)

In [None]:
predicted_dev_class = clf.predict(X_dev_dtm)

In [None]:
print "without advectives"
print "svm"
print "train"
measures(undersampled_labels,predicted_class)
print "dev"
measures(udev_labels,predicted_dev_class)

In [313]:
clf_nb = sklearn.naive_bayes.MultinomialNB()
clf_nb.fit(X_train_dtm,undersampled_labels)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [314]:
predicted_class =clf_nb.predict(X_train_dtm)
predicted_dev_class = clf_nb.predict(X_dev_dtm)

In [315]:
print "nb"
print "train"
measures(undersampled_labels,predicted_class)
print "dev"
measures(dev_labels,predicted_dev_class)

nb
train
accuracy 0.874405405405
confusion
[[57177 15038  1785]
 [ 1387 71482  1131]
 [ 1173  7368 65459]]
macro f 0.875379861661
micro f 0.874405405405
weighted f 0.875379861661
dev
accuracy 0.640011297479
confusion
[[ 4651  4087   430]
 [  807  8162  1099]
 [ 3459 29630 57434]]
macro f 0.532337352164
micro f 0.640011297479
weighted f 0.705536832335
