In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

In [2]:
data_path = '../datasets/ubuntu_1.0/'
train_df = pd.read_csv(os.path.join(data_path, 'trainset.csv'))
test_df = pd.read_csv(os.path.join(data_path, 'testset.csv'))

In [3]:
def evaluate_recall(y, y_test, k=1):
    """Implements the recall@k metric. This metric just checks whether 
       the label was one of the top_k predictions and counts it as correct.
    """
    num_correct = 0
    for predictions, label in zip(y, y_test):
        if label in predictions[:k]:
            num_correct += 1
    return num_correct / float(len(y_test))

In [4]:
# Sanity test of our evaluate_recall function.
# If it is correctly implemented then:
#    k = 1 : 10%
#    k = 2 : 20%
#    k = 3 : 30% 
#    ...........
y_random = [np.random.choice(10, 10, replace=False) for sample in range(len(test_df))]
y_test = np.zeros(len(test_df))
for k in [1, 2, 5, 10]:
    print('recall@{}: {:.4f}'.format(k, evaluate_recall(y_random, y_test, k=k)))

recall@1: 0.1003
recall@2: 0.2001
recall@5: 0.5010
recall@10: 1.0000


In [5]:
class TFIDFPredictor(object):
    """Define a Baseline TF-IDF Predictor using sklearn.
    """
    def __init__(self):
        self.vectorizer = TfidfVectorizer()
        
    def train(self, data):
        # Learn IDF features from data (X,Y)
        self.vectorizer.fit(data.iloc[:,0].values, data.iloc[:,1].values)
    
    def predict(self, context, utterances):
        # Now that we've learned TF-IDF from data, we can transform 
        # the documents to weights
        vector_context = self.vectorizer.transform([context])
        vector_doc = self.vectorizer.transform(utterances)
        
        # We take the dot-product between each utterance and our 
        # context, this yields our scores
        result = np.dot(vector_doc, vector_context.T).todense()
        result = np.asarray(result).flatten()
        
        return np.argsort(result, axis=0)[::-1]

In [6]:
# Train the TF-IDF Predictor
pred = TFIDFPredictor()
pred.train(train_df)

In [18]:
val_df = pd.read_csv(os.path.join(data_path, 'valset.csv'))

In [8]:
# Evaluate and print out results
y = [pred.predict(test_df.iloc[i,0].values, test_df.iloc[i,1:].values) for i in range(len(test_df))]

for k in [1,2,5,10]:
    print('recall@{}: {:.4f}'.format(k, evaluate_recall(y, y_test, k=k)))

AttributeError: 'str' object has no attribute 'values'