In [8]:
import numpy as np
import gensim
from gensim.models.word2vec import Word2Vec
import os
import subprocess
import urllib
import ast
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline

In [2]:
# Downloading datasets and pre-trained embeddings
dataset_name = 'qa_Appliances.json.gz'
url_dataset = 'http://jmcauley.ucsd.edu/data/amazon/qa'
if not os.path.exists(dataset_name):
    subprocess.run(['wget',  urllib.parse.urljoin(url_dataset, dataset_name)])
    subprocess.run(['gzip -d', dataset_name])
else:
    print('You already downloaded:', dataset_name)

embedding_name = 'GoogleNews-vectors-negative300.bin'
url_embed = 'https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit?usp=sharing'
if not os.path.exists(embedding_name):
    subprocess.run(['wget', url_embed])
else:
    print('You already downloaded:', embedding_name)

You already downloaded: qa_Appliances.json.gz
You already downloaded: GoogleNews-vectors-negative300.bin


In [3]:
# load pre-trained word2vec model
googlenews_w2v = gensim.models.KeyedVectors.load_word2vec_format(embedding_name, binary=True)

In [4]:
# load datasets
# you cannot use json.load because this json file is invalid format...(using single quote)

dataset = dataset_name[:-3]  # remove .gz

questions = []
answers = []
with open(dataset, 'r') as f:
    for line in f:
        js = ast.literal_eval(line)
        questions.append(js['question'])
        answers.append(js['answer'])
print(len(questions))
print(len(answers))

9011
9011


In [5]:
# Split dataset into train and test set
qa_texts = np.array(questions + answers)
qa_labels = np.zeros(len(qa_texts), dtype=np.int)
qa_labels[len(questions):] = 1  # question: 0, answer: 1

qa_idx = np.random.permutation(len(qa_texts))
qa_texts = qa_texts[qa_idx]
qa_labels = qa_labels[qa_idx]

X_train, X_test, y_train, y_test = train_test_split(qa_texts, qa_labels)
print('Train:', len(X_train), 'Test:', len(X_test))

Train: 13516 Test: 4506


In [6]:
# Simple word embedding feature by averaging word vectors for all words in a text
# ref: http://nadbordrozd.github.io/blog/2016/05/20/text-classification-with-word2vec/#comment-3233012354
class MeanEmbeddingVectorizer(object):
    def __init__(self, word2vec, dim):
        self.word2vec = word2vec
        self.dim = dim
        
    def fit(self, X, y):
        return self
    
    def transform(self, X):
        return np.array([
            np.mean([self.word2vec[w] for w in words if w in self.word2vec] or [np.zeros(self.dim)], axis=0)
            for words in X
        ])

In [9]:
# Train a model
rf_amazon_qa = Pipeline([
    ('word2vec', MeanEmbeddingVectorizer(googlenews_w2v, googlenews_w2v.vector_size)), 
    ('randomforest', RandomForestClassifier(n_estimators=200))])
rf_amazon_qa.fit(X_train, y_train)

Pipeline(steps=[('word2vec', <__main__.MeanEmbeddingVectorizer object at 0x24c115c18>), ('randomforest', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=200, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False))])

In [10]:
# Evaluation
y_pred = rf_amazon_qa.predict(X_test)
print(classification_report(y_test, y_pred))

             precision    recall  f1-score   support

          0       0.77      0.82      0.79      2280
          1       0.80      0.74      0.77      2226

avg / total       0.78      0.78      0.78      4506

