In [58]:
# Importing the required libaries
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout,Conv1D,Flatten 
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
import tensorflow as tf
import string


# Loading the listings data set

listings_data = pd.read_csv('/home/rreinhaus/code/rreinhaus/listings.csv')

In [44]:
title_data = listings_data[['id', 'name']][listings_data['review_scores_rating'] > 4]
title_data

Unnamed: 0,id,name
0,13913,Holiday London DB Room Let-on going
1,15400,Bright Chelsea Apartment. Chelsea!
2,17402,Superb 3-Bed/2 Bath & Wifi: Trendy W1
4,25123,Clean big Room in London (Room 1)
5,33332,Beautiful Ensuite Richmond-upon-Thames borough
...,...,...
66399,53618204,Studio located at Goodge street.
66405,53618701,Lovely 1-bedroom in Acton
66423,53622933,"Luxury, Modern & Cosy 2 Bedroom London Apartment"
66517,53657036,DreamyApartment-CloseToHydePar-KensingtonOlympia


In [45]:
# return corpus
def corpus(text):
    text = ''.join(e for e in text if e not in string.punctuation).lower()
    
    text = text.encode('utf8').decode('ascii', 'ignore')
    return text

corpus = [corpus(str(e)) for e in title_data['name']]

In [46]:
data = " ".join(corpus)

In [47]:
def tokens(text):
    tokens = text.split()
    table = str.maketrans("", "", string.punctuation)
    tokens = [w.translate(table) for w in tokens]
    tokens = [word for word in tokens if word.isalpha()]
    tokens = [word.lower() for word in tokens]
    return tokens

tokens = tokens(data)

In [48]:
len(title_data['name'][0])

35

In [49]:
length = 5+1
lines = []

for i in range(length, len(tokens)):
    seq = tokens[i-length:i]
    line = " ".join(seq)
    lines.append(line)

print(len(lines))

253416


In [51]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(lines)
sequences = tokenizer.texts_to_sequences(lines)
sequences = np.array(sequences)

In [52]:
sequences

array([[ 377,    4,  841,    3, 7267, 2350],
       [   4,  841,    3, 7267, 2350,   17],
       [ 841,    3, 7267, 2350,   17,   72],
       ...,
       [   7, 7266,  125,   12,   10,    4],
       [7266,  125,   12,   10,    4,   26],
       [ 125,   12,   10,    4,   26,   47]])

In [53]:
X, y = sequences[:, :-1], sequences[:,-1]

In [54]:
len_ = int(0.8*len(X))

X_train = X[:len_]
X_test = X[len_:]

y_train = y[:len_]
y_test = y[len_:]

In [55]:
vocab_size = len(tokenizer.word_index) + 1

In [56]:
y_train = to_categorical(y_train, num_classes=vocab_size)
y_test = to_categorical(y_test, num_classes=vocab_size)

In [57]:
import gensim.downloader as api
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

# load a word2vec embedding
word2vec_transfer = api.load("glove-wiki-gigaword-50")

# Function to convert a sentence (list of words) into a matrix representing the words in the embedding space
def embed_sentence_with_TF(word2vec, sentence):
    embedded_sentence = []
    for word in sentence:
        if word in word2vec:
            embedded_sentence.append(word2vec[word])
        
    return np.array(embedded_sentence)

# Function that converts a list of sentences into a list of matrices
def embedding(word2vec, sentences):
    embed = []
    
    for sentence in sentences:
        embedded_sentence = embed_sentence_with_TF(word2vec, sentence)
        embed.append(embedded_sentence)
        
    return embed

# Embed the training and test sentences
X_train_embed = embedding(word2vec_transfer, X_train)
X_test_embed = embedding(word2vec_transfer, X_test)

# Pad the training and test embedded sentences
X_train_embed = pad_sequences(X_train_embed, dtype='float32', padding='pre')
X_test_embed = pad_sequences(X_test_embed, dtype='float32', padding='pre')

In [None]:
def init_cnn_model():
    model = Sequential()
    model.add(Conv1D(16, 3))
    model.add(Flatten())
    model.add(Dense(5,))
    model.add(Dense(vocab_size, activation='softmax'))

    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

model_cnn = init_cnn_model()


es = EarlyStopping(patience=5, restore_best_weights=True)

model_cnn.fit(X_train_embed, y_train, 
          epochs=20, 
          batch_size=32,
          validation_split=0.2,
          callbacks=[es]
         )


res = model_cnn.evaluate(X_test_embed, y_test, verbose=0)

print(f'The accuracy evaluated on the test set is of {res[1]*100:.3f}%')