In [92]:
import datetime
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer

In [28]:
# Load the zipped travel CSV, using its 'id' column as the DataFrame index.
travel = pd.read_csv('data/travel.csv.zip', index_col='id')
print(travel.shape)
# Preview the first rows (columns: title, content, tags).
travel.head()

(19279, 3)


Unnamed: 0_level_0,title,content,tags
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,What are some Caribbean cruises for October?,<p>My fiancée and I are looking for a good Car...,caribbean cruising vacations
2,How can I find a guide that will take me safel...,"<p>This was one of our definition questions, b...",guides extreme-tourism amazon-river amazon-jungle
4,Does Singapore Airlines offer any reward seats...,<p>Singapore Airlines has an all-business clas...,loyalty-programs routes ewr singapore-airlines...
5,What is the easiest transportation to use thro...,<p>Another definition question that interested...,romania transportation
6,How can I visit Antarctica?,"<p>A year ago I was reading some magazine, and...",extreme-tourism antarctica


In [3]:
# Cache for the stop-word set so the NLTK corpus is read once, not once per row.
_STOPWORDS_CACHE = {}


def _english_stopwords():
    """Return the NLTK English stop words as a frozenset, loading them on first use."""
    if "english" not in _STOPWORDS_CACHE:
        _STOPWORDS_CACHE["english"] = frozenset(stopwords.words("english"))
    return _STOPWORDS_CACHE["english"]


def prepare_text(text):
    """Strip markup from ``text`` and return its meaningful words, space-joined.

    Steps:
      1. Remove anything that looks like an ``<...>`` tag and lower-case the rest.
      2. Split on every character that is not an ASCII letter, digit, ``_``,
         ``+``, ``-`` or ``/``.
      3. Drop empty tokens and English stop words.

    Parameters
    ----------
    text : str
        Raw (possibly HTML) text.

    Returns
    -------
    str
        The surviving tokens joined by single spaces.
    """
    cleantext = re.sub("<.*?>", "", text).lower()
    # NOTE(review): non-ASCII letters act as separators here, so a word such as
    # "fiancée" is split into "fianc" and "e". Confirm this is acceptable.
    words = re.split("[^a-zA-Z0-9_\\+\\-/]", cleantext)
    stops = _english_stopwords()
    # Tokens contain no whitespace (it is a separator), so no strip() is needed.
    return " ".join(w for w in words if w and w not in stops)

In [4]:
def prepare_data(df):
    """Clean the 'content' and 'title' columns of ``df`` in place.

    Each value in both columns is replaced by ``prepare_text(value)``.
    The frame passed in is the one modified; nothing is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with 'content' and 'title' text columns.
    """
    # Operate on the argument, not on a notebook-level global, so the function
    # works for any frame it is given.
    for column in ('content', 'title'):
        df[column] = df[column].apply(prepare_text)

In [5]:
def prepare_tags(df):
    """Return the set of distinct tags found in ``df['tags']``.

    Each entry of the 'tags' column is a string of tags separated by single
    spaces; every piece is added to the resulting set.
    """
    unique_tags = set()
    for tag_list in df['tags'].str.split(' '):
        unique_tags.update(tag_list)
    return unique_tags

In [6]:
# Number of distinct tags across the whole dataset.
len(prepare_tags(travel))

1645

In [29]:
# Clean the text columns; this overwrites 'content' and 'title' in `travel`,
# so the raw text is only recoverable by re-reading the CSV.
prepare_data(travel)

In [30]:
# Preview the frame after cleaning.
travel.head()

Unnamed: 0_level_0,title,content,tags
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,caribbean cruises october,fianc e looking good caribbean cruise october ...,caribbean cruising vacations
2,find guide take safely amazon jungle,one definition questions also one interests pe...,guides extreme-tourism amazon-river amazon-jungle
4,singapore airlines offer reward seats ewr-sin ...,singapore airlines all-business class flight e...,loyalty-programs routes ewr singapore-airlines...
5,easiest transportation use throughout romania ...,another definition question interested easiest...,romania transportation
6,visit antarctica,year ago reading magazine found availability g...,extreme-tourism antarctica


In [24]:
# Bag-of-words counts over the 'content' column, keeping at most 5000 features.
vectorizer = CountVectorizer(
    analyzer="word",
    tokenizer=None,
    preprocessor=None,
    stop_words=None,
    max_features=5000,
)
train_data_features = vectorizer.fit_transform(travel['content'])

In [53]:
# Number of tags per question (tags are separated by single spaces).
# Iterating the column directly avoids building a Series per row via iterrows().
counts = [len(tag_string.split(' ')) for tag_string in travel['tags']]

counts[0:10]

[3, 4, 5, 2, 2, 4, 4, 4, 5, 5]

In [61]:
# Summary statistics of the tags-per-question counts.
pd.Series(counts, index=travel.index).describe()

count    19279.000000
mean         3.388869
std          1.117759
min          1.000000
25%          3.000000
50%          3.000000
75%          4.000000
max          5.000000
dtype: float64

In [73]:
from gensim.models.doc2vec import LabeledSentence
from gensim.models import Doc2Vec
from random import shuffle
from sklearn.model_selection import train_test_split

In [63]:
def tokenize_text(text):
    """Return the lower-cased word tokens of ``text``, skipping 1-character tokens.

    The text is segmented with ``nltk.sent_tokenize`` and each sentence is then
    tokenized with ``nltk.word_tokenize``; tokens shorter than two characters
    are discarded.
    """
    return [
        token.lower()
        for sentence in nltk.sent_tokenize(text)
        for token in nltk.word_tokenize(sentence)
        if len(token) >= 2
    ]

In [79]:
# One LabeledSentence per question: tokenized body as words, tokenized tag
# string as the document tags.
# NOTE(review): tags go through the word tokenizer here, whereas other cells
# split the tag string on spaces; confirm no tag is altered or dropped.
sentences = [
    LabeledSentence(
        tags=tokenize_text(question['tags']),
        words=tokenize_text(question['content']),
    )
    for _, question in travel.iterrows()
]

In [82]:
# Inspect the first two labeled sentences.
sentences[0:2]

[LabeledSentence(words=['fianc', 'looking', 'good', 'caribbean', 'cruise', 'october', 'wondering', 'islands', 'best', 'see', 'cruise', 'line', 'take', 'seems', 'like', 'lot', 'cruises', 'run', 'month', 'due', 'hurricane', 'season', 'looking', 'good', 'options', 'edit', 'travelling', '2012'], tags=['caribbean', 'cruising', 'vacations']),
 LabeledSentence(words=['one', 'definition', 'questions', 'also', 'one', 'interests', 'personally', 'find', 'guide', 'take', 'safely', 'amazon', 'jungle', 'love', 'explore', 'amazon', 'would', 'attempt', 'without', 'guide', 'least', 'first', 'time', 'prefer', 'guide', 'going', 'ambush', 'anything', 'edit', 'want', 'go', 'anywhere', 'touristy', 'start', 'end', 'points', 'open', 'trip', 'take', 'places', 'likely', 'see', 'travellers', 'tourists', 'definitely', 'require', 'good', 'guide', 'order', 'safe'], tags=['guides', 'extreme-tourism', 'amazon-river', 'amazon-jungle'])]

In [152]:
def jaccard_similarity(labels, preds):
    """Return the Jaccard index |labels ∩ preds| / |labels ∪ preds|.

    Both arguments are treated as sets, so duplicates and order are ignored.

    Parameters
    ----------
    labels : iterable
        True labels.
    preds : iterable
        Predicted labels.

    Returns
    -------
    float
        A value in [0, 1]. When both inputs are empty the union is empty and
        0.0 is returned instead of raising ``ZeroDivisionError``.
    """
    lset = set(labels)
    pset = set(preds)
    union = lset | pset
    if not union:
        # Nothing to compare: define the score as 0.0 rather than dividing by zero.
        return 0.0
    return len(lset & pset) / len(union)

def test(test_sents, model, topn=5):
    """Score ``model`` on ``test_sents`` with per-document Jaccard similarity.

    For every test sentence a vector is inferred from its words, the ``topn``
    most similar document tags are looked up in ``model.docvecs``, and the
    predicted tags are compared with the sentence's true tags.

    Parameters
    ----------
    test_sents : iterable
        Objects exposing ``.words`` and ``.tags``.
    model :
        Trained model providing ``infer_vector`` and ``docvecs.most_similar``.
    topn : int, optional
        Number of nearest tags to use as the prediction (default 5, the value
        previously hard-coded).

    Returns
    -------
    numpy.ndarray
        One Jaccard score per test sentence, in input order.
    """
    scores = []
    for sent in test_sents:
        inferred = model.infer_vector(sent.words)
        nearest = model.docvecs.most_similar([inferred], topn=topn)
        # The first element of each hit is taken as the predicted tag.
        predicted = [hit[0] for hit in nearest]
        scores.append(jaccard_similarity(sent.tags, predicted))
    return np.array(scores)

def train(sentences):
    """Train a Doc2Vec model on ``sentences`` and report a score after each epoch.

    The sentences are split 80/20 into train and test parts (fixed
    ``random_state``). The model is trained for 40 epochs with a learning rate
    that is lowered by a constant step after each epoch, starting at 0.025.
    After every epoch the mean Jaccard similarity on the test part is printed
    (labelled "Accuracy" in the output).

    Parameters
    ----------
    sentences : list
        Labeled sentences exposing ``.words`` and ``.tags``.

    Returns
    -------
    Doc2Vec
        The trained model.
    """
    model = Doc2Vec(dm=0, size=100, negative=5, hs=0, min_count=2)
    # The vocabulary is built from ALL sentences, i.e. before the train/test
    # split, so test words and tags are known to the model.
    model.build_vocab(sentences)
    train_sents, test_sents = train_test_split(sentences, test_size=0.2, random_state=42)
    alpha = 0.025
    min_alpha = 0.001
    num_epochs = 40
    alpha_delta = (alpha - min_alpha) / num_epochs

    for epoch in range(num_epochs):
        start_time = datetime.datetime.now()
        # Reorder the training examples each epoch (unseeded, so runs differ).
        shuffle(train_sents)
        # Pin both bounds to the current rate; the decay is applied manually
        # between epochs via alpha_delta below.
        model.alpha = alpha
        model.min_alpha = alpha
        model.train(train_sents)
        alpha -= alpha_delta
        end_time = datetime.datetime.now()
        # Evaluation time is deliberately excluded from the reported duration.
        accuracy = test(test_sents, model).mean()
        print("Complete epoch {}: {}; Accuracy: {}".format(epoch, end_time - start_time, accuracy))

    return model

In [153]:
model = train(sentences)

Complete epoch 0: 0:00:22.384340
Accuracy: 0.013084946980175198
Complete epoch 1: 0:00:23.021383
Accuracy: 0.029032367615095825
Complete epoch 2: 0:00:21.820058
Accuracy: 0.047092965817032206
Complete epoch 3: 0:00:21.733202
Accuracy: 0.05797962770862149
Complete epoch 4: 0:00:21.699064
Accuracy: 0.07111397204109861
Complete epoch 5: 0:00:21.770630
Accuracy: 0.08717418329710859
Complete epoch 6: 0:00:21.539986
Accuracy: 0.09841475663571098
Complete epoch 7: 0:00:21.633756
Accuracy: 0.09654856583020482
Complete epoch 8: 0:00:21.294289
Accuracy: 0.11105162846604753
Complete epoch 9: 0:00:22.929119
Accuracy: 0.11202537377329908
Complete epoch 10: 0:00:22.108524
Accuracy: 0.1142836560627017
Complete epoch 11: 0:00:22.355787
Accuracy: 0.11760614256075874
Complete epoch 12: 0:00:21.833686
Accuracy: 0.12392406392017388
Complete epoch 13: 0:00:21.347218
Accuracy: 0.13143009451360074
Complete epoch 14: 0:00:21.338783
Accuracy: 0.11166497892379634
Complete epoch 15: 0:00:27.074268
Accuracy: 0.13