In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

In [2]:
# Location of the dataset CSV splits, relative to this notebook.
data_path = '../datasets/ubuntu_2.0/'

# Read the three splits in a fixed order (train, valid, test) and unpack
# them into their own DataFrames.
train_df, valid_df, test_df = (
    pd.read_csv(os.path.join(data_path, csv_name))
    for csv_name in ('train.csv', 'valid.csv', 'test.csv')
)

In [3]:
# Preview the first rows of the training split (columns: Context, Utterance, Label).
train_df.head()

Unnamed: 0,Context,Utterance,Label
0,i think we could import the old comment via rs...,basic each xfree86 upload will not forc user t...,1
1,i 'm not suggest all - onli the one you modifi...,sorri __eou__ i think it be ubuntu relat . __e...,0
2,afternoon all __eou__ not entir relat to warti...,"yep . __eou__ oh , okay . i wonder what happen...",0
3,interest __eou__ grub-instal work with / be ex...,that the one __eou__,1
4,and becaus python give mark a woodi __eou__ __...,( i think someon be go to make a joke about .a...,1


In [4]:
# Preview the first rows of the validation split: one context, its ground-truth
# utterance, and the distractor utterances.
valid_df.head()

Unnamed: 0,Context,Ground Truth Utterance,Distractor_0,Distractor_1,Distractor_2,Distractor_3,Distractor_4,Distractor_5,Distractor_6,Distractor_7,Distractor_8
0,ani idea on how lts will be releas ? __eou__ _...,we be talk 12.04 not 10.04 __eou__,you rememb my flash issu from yesterday or the...,"oh , no idea other be probabl ok __eou__ updat...","no , greenit be say his download speed be slow...",lsb_releas -sc __eou__ well ... regardless . i...,you can buy _anything_ in china __eou__,no __eou__,sudo restart lightdm __eou__,you be still ask for the uniti logout menu rig...,"so i be work as a linux admin intern , and my ..."
1,how much hdd use ubuntu default instal ? __eou...,that whi i ask how much be default instal ? : ...,all of this possibl in older version of ubuntu...,: be that a question ? __eou__,yes __eou__,"thank __eou__ i would imagin so , the site bon...",yes i ve investig that alreadi . it seem you c...,not realli . i use urxvt myself . __eou__,"thank a lot , realli ! __eou__","as someon els suggest , close update-manag , a...",you re welcom .. sinc 12.04 throw dnsmasq into...
2,in my countri it near the 27th __eou__ when wi...,thanx __eou__,"i have no .docx file , so do n't know , whi no...",i ve boot countless distro from usb on my aao ...,but i 'm sure i can work it out __eou__,"the way you put it , that sound like a sever c...",im not familiar with hotspot __eou__,it work fine without set up an ssh tunnel manu...,so it have two be a two-command process ? __eou__,"and becaus you onli have 3 gb of ram , be not ...",it ok but no error ? then how do you know it a...
3,it 's not out __eou__ __eot__ they probabali b...,wait for mani thing to be setup __eou__ final ...,"that 's right , while chat i regrett make a lo...",afaik it 's best to start at 2mb = 2048k __eou__,"for the most part , you should be instal pytho...",do you overwrit your win instal or can you bro...,for some reason the headphon option doe not ch...,well then i do n't know . can anyth boot on th...,well then i do n't know . can anyth boot on th...,"ya , but i guess you could do a git of your en...",noexec be a mount option . you would have to c...
4,be the ext4 driver stabl ? __eou__ __eot__ i b...,you sound like it 's updat to skynet . ; ) __e...,"ok i will tri that , brb __eou__ it complain a...",ouch __eou__,i do system annalysi and it say everyth pass 1...,not to mention way less complex ... you can ha...,"well , you can , accord to that articl , i als...","if not , i think you can pretti much grab ani ...","gpart ? i do n't want do edit partit , just mo...",i ve tri it . not a fan at all __eou__ i have ...,"ah , okay __eou__"


In [5]:
def evaluate_recall(y, y_test, k=1):
    """Compute the recall@k metric for ranked predictions.

    A sample counts as correct when its true label appears among the first
    ``k`` entries of its ranked prediction sequence.

    Args:
        y: Sequence of ranked predictions, one sequence per sample, with the
            best-scoring candidate first.
        y_test: Sequence of true labels, aligned with ``y``.
        k: Number of top-ranked predictions inspected per sample.

    Returns:
        The fraction of samples (a float) whose label is within the top ``k``
        predictions. Returns 0.0 when ``y_test`` is empty.
    """
    num_samples = len(y_test)
    if num_samples == 0:
        # Nothing to evaluate; avoid dividing by zero.
        return 0.0

    # zip() pairs predictions with labels and stops at the shorter input,
    # while the denominator is always the number of labels.
    num_correct = sum(
        1 for predictions, label in zip(y, y_test) if label in predictions[:k]
    )
    return num_correct / float(num_samples)

In [6]:
# Sanity test of our evaluate_recall function.
# Every random ranking is a permutation of the 10 candidate indices, so if
# the metric is correctly implemented the label 0 lands in the top k with
# probability k / 10:
#    k = 1  : ~10%
#    k = 2  : ~20%
#    k = 5  : ~50%
#    k = 10 : 100%
# A locally seeded generator makes the printed values reproducible across
# re-runs without touching NumPy's global random state (the exact figures
# depend on the seed).
rng = np.random.RandomState(42)
y_random = [rng.choice(10, 10, replace=False) for _ in range(len(test_df))]
# Labels are all 0; y_test is reused later when scoring the TF-IDF predictor.
y_test = np.zeros(len(test_df))
for k in [1, 2, 5, 10]:
    print('recall@{}: {:.4f}'.format(k, evaluate_recall(y_random, y_test, k=k)))

recall@1: 0.1019
recall@2: 0.2037
recall@5: 0.5012
recall@10: 1.0000


In [7]:
class TFIDFPredictor(object):
    """Baseline TF-IDF response ranker built on sklearn's TfidfVectorizer.

    Candidate utterances are ranked by the dot product between their TF-IDF
    vector and the TF-IDF vector of the context.
    """

    def __init__(self):
        self.vectorizer = TfidfVectorizer()

    def train(self, data):
        """Fit the vectorizer on all contexts and utterances in ``data``.

        Args:
            data: Object exposing ``Context`` and ``Utterance`` columns with a
                ``.values`` array each (e.g. the training DataFrame).
        """
        # Contexts and utterances are pooled into one corpus so both share
        # the same vocabulary and IDF weights.
        corpus = np.concatenate([data.Context.values, data.Utterance.values])
        self.vectorizer.fit(corpus)

    def predict(self, context, utterances):
        """Rank candidate utterances for a single context.

        Args:
            context: The context text (a single document).
            utterances: Sequence of candidate utterance texts.

        Returns:
            1-D numpy array of indices into ``utterances``, ordered from the
            highest-scoring candidate to the lowest.
        """
        # Project the context and the candidates into the fitted TF-IDF space.
        context_vec = self.vectorizer.transform([context])
        candidate_vecs = self.vectorizer.transform(utterances)

        # One score per candidate: dot product with the context vector.
        # Use the sparse matrix's own .dot() (np.dot is not sparse-aware) and
        # .toarray() to get a plain ndarray instead of np.matrix.
        scores = candidate_vecs.dot(context_vec.T).toarray().ravel()

        # argsort is ascending, so reverse it to put the best candidate first.
        return np.argsort(scores, axis=0)[::-1]

In [8]:
# Train the TF-IDF Predictor
pred = TFIDFPredictor()
pred.train(train_df)

In [9]:
# Rank the candidate responses of every test row with the trained predictor,
# then report recall@k against the labels in y_test.
y = []
for i in range(len(test_df)):
    context = test_df.Context[i]
    # Every column after the first one is passed as a candidate utterance.
    candidates = test_df.iloc[i, 1:].values
    y.append(pred.predict(context, candidates))

for k in (1, 2, 5, 10):
    recall_at_k = evaluate_recall(y, y_test, k=k)
    print('recall@{}: {:.4f}'.format(k, recall_at_k))

recall@1: 0.4950
recall@2: 0.5969
recall@5: 0.7661
recall@10: 1.0000
