In [1]:
import pandas as pd
import torch.nn as nn
import numpy as np

In [2]:
def process_data(data_path, clarity_path, con_path):
    x = pd.read_csv(data_path, header=None, encoding='utf-8')[:-1]
    clarity = pd.read_csv(clarity_path, header=None, encoding='utf-8')[:-1]
    con = pd.read_csv(con_path, header=None, encoding='utf-8')[:-1]
    return (x, clarity, con)

x = pd.read_csv("training/data_train.csv", header=None, encoding='utf-8')[:-1]
clarity = pd.read_csv("training/clarity_train.labels", header=None, encoding='utf-8')[:-1]
con = pd.read_csv("training/conciseness_train.labels", header=None, encoding='utf-8')[:-1]

# train_x, train_clarity, train_con = process_data("training/data_train.csv", "training/clarity_train.labels", "training/conciseness_train.labels")
# val_x, val_clarity, val_con = 

clarity.columns = ["clarity"]
con.columns = ["conciseness"]
data = x.join(clarity).join(con)

import re
# data[6].apply(lambda x: re.sub(r"<[0-9a-zA-Z/]*>", "", x).strip())
data.columns = ["country", "id", "title", "cat_1", "cat_2", "cat_3", "description", "price", "product_type", "clarity", "conciseness"]
data.fillna("", inplace=True)
data.description = data.description.apply(lambda x: re.sub(r"<[0-9a-zA-Z/]*>", "", x).strip())


In [3]:
from nltk import word_tokenize
word2id = {'<pad>':0}
id2word = {0: '<pad>'}

for title in data.title:
    words = word_tokenize(title)
    for word in words:
        word = word.lower()
        if not word in word2id:
            word2id[word] = len(word2id)
            id2word[len(word2id)-1] = word
            
for des in data.description:
    words = word_tokenize(des)
    for word in words:
        word = word.lower()
        if not word in word2id:
            word2id[word] = len(word2id)
            id2word[len(word2id)-1] = word

word2id['<unk>'] = len(word2id)
id2word[len(word2id)-1] = '<unk>'

print "vocab_size =", len(word2id)

vocab_size = 85975


In [58]:
import pickle
with open("word2id.pickle", "wb") as f:
    pickle.dump(word2id, f)

In [4]:
cat2id = {}

def update_car2id(series, cat2id):
    for cat in series:
        if not cat in cat2id:
            cat2id[cat] = len(cat2id)
    return cat2id

cat2id = update_car2id(data.cat_1, cat2id)
cat2id = update_car2id(data.cat_2, cat2id)
cat2id = update_car2id(data.cat_3, cat2id)
print "category count =", len(cat2id)

category count = 245


In [5]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

data.price = scaler.fit_transform(data[['price']])

In [6]:
data.title = data.title.apply(lambda x: word_tokenize(x))
data.description = data.description.apply(lambda x: word_tokenize(x))

data_count = len(data)

title_length = max(map(len, data.title.tolist()))
description_length = max(map(len, data.description.tolist()))
# this becomes more than 4000, let's limit this
description_length = 300

x_title = np.zeros((data_count, title_length), dtype=np.int)
x_desc = np.zeros((data_count, description_length), dtype=np.int)

for i in xrange(data_count):
    for j, token in enumerate(data.title[i]):
        token = token.lower()
        x_title[i][j] = word2id[token]
    
    for j, token in enumerate(data.description[i]):
        token = token.lower()
        x_desc[i][j] = word2id[token]
        
        # cut the description
        if j==description_length-1:
            break
x_price = np.array(data.price.tolist(), np.float32)

train_y1 = np.array(data.clarity.tolist(), dtype=np.float32)
train_y2 = np.array(data.conciseness.tolist(), dtype=np.float32)

In [29]:
import torch.nn as nn
import torch

class ClarityModel(nn.Module):
    def __init__(self, emb_size, vocab_size, t_ks, d_ks, t_length, d_length):
        super(ClarityModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, emb_size, t_length, d_length)
        self.title_cnn = nn.Conv2d(1, 1, (t_ks, emb_size))
        self.desc_cnn = nn.Conv2d(1, 1, (d_ks, emb_size))
        self.relu = nn.ReLU()
        self.linear = nn.Linear(t_length-(t_ks-1)+d_length-(d_ks-1)+1, 1) # stride is taken as 1 in this case
        self.sigmoid = nn.Sigmoid()
        
        self.t_length = t_length
        self.d_length = d_length
        self.emb_size = emb_size
        
        self.loss = nn.BCELoss()
        
    def forward(self, title_idx, desc_idx, price):
        # price should be (b_size,1)
        batch_size = title_idx.shape[0]
        t_emb = self.embedding(title_idx) #b_size, t_length, emb_size
        d_emb = self.embedding(desc_idx) #b_size, d_length, emb_size
        t_out = self.title_cnn(t_emb.view(-1,1,self.t_length,self.emb_size)).view(batch_size, -1)
        d_out = self.desc_cnn(d_emb.view(-1,1,self.d_length,self.emb_size)).view(batch_size, -1)
        t_out = self.relu(t_out)
        d_out = self.relu(d_out)
        out = torch.cat((t_out, d_out, price), dim=1)
        out = self.linear(out)
        final = self.sigmoid(out)
        return final
        
    def get_loss(self, prediction, target):
        return self.loss(prediction, target)

In [8]:
def generate_batch_idx(data_count, batch_size, i = 0):
    j = i + batch_size
    
    output = []
    
    while (i<data_count):
        output.append((i,j))
        i += batch_size
        j += batch_size
        
        if j>data_count:
            j = data_count
    
    return output

generate_batch_idx(102, 30, 20)

[(20, 50), (50, 80), (80, 102)]

In [51]:
import torch.nn.functional as F
class CNN_Text(nn.Module):
    def __init__(self, args):
        super(CNN_Text, self).__init__()
        self.args = args
        
        V = args['embed_num']
        D = args['embed_dim']
        C = args['class_num']
        Ci = 1
        Co = args['kernel_num']
        Ks = args['kernel_sizes']

        self.embed = nn.Embedding(V, D)
        # self.convs1 = [nn.Conv2d(Ci, Co, (K, D)) for K in Ks]
        self.convs1 = nn.ModuleList([nn.Conv2d(Ci, Co, (K, D)) for K in Ks])
        self.dropout = nn.Dropout(args['dropout'])
        self.fc1 = nn.Linear(len(Ks)*Co, C)

    def conv_and_pool(self, x, conv):
        x = F.relu(conv(x)).squeeze(3)  # (N, Co, W)
        x = F.max_pool1d(x, x.size(2)).squeeze(2)
        return x

    def forward(self, x):
        x = self.embed(x)  # (N, W, D)
        
#         if self.args.static:
#             x = Variable(x)

        x = x.unsqueeze(1)  # (N, Ci, W, D)
        x = [F.relu(conv(x)).squeeze(3) for conv in self.convs1]  # [(N, Co, W), ...]*len(Ks)
        x = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in x]  # [(N, Co), ...]*len(Ks)
        x = torch.cat(x, 1)
        x = self.dropout(x)  # (N, len(Ks)*Co)
        logit = self.fc1(x)  # (N, C)
        return F.sigmoid(logit)

In [63]:
import torch.optim as optim
from sklearn.metrics import mean_squared_error

# model = CNN_Text({'embed_num':len(word2id), 'embed_dim':50, 'class_num':1, 'kernel_num':3, 'kernel_sizes':(3,4,5), 'dropout':0.5})
# loss_fn = nn.BCELoss()

model = ClarityModel(50, len(word2id), 3, 3, title_length, description_length)
optimizer = optim.SGD(model.parameters(), lr=0.17, weight_decay=1e-4)

epochs = 100
batch_size = 2000

## since I don't have access to validation or test data, create a small devset using train set 80:20 split
data_count = int(len(train_y1) * 0.8)

batch_idx_list = generate_batch_idx(data_count, batch_size)
val_idx_list = generate_batch_idx(len(train_y1), batch_size, data_count)

print "Batch size: {}, Batch count:{}".format(batch_size, len(batch_idx_list))
for epoch in xrange(epochs):
    loss_val = 0
    loss_mse = 0
    
    for i,j in batch_idx_list:
        xt_batch = torch.tensor(x_title[i:j], dtype=torch.long)
        xd_batch = torch.tensor(x_desc[i:j], dtype=torch.long)
        xp_batch = torch.tensor(x_price[i:j]).view(-1, 1)
        
        
        prediction = model(xt_batch, xd_batch, xp_batch)
        target = torch.tensor(train_y2[i:j]).view(-1,1)
        loss = model.get_loss(prediction, target)
        loss_val += loss.tolist()
        loss.backward()
        optimizer.step()

        ##############
#         prediction = model(torch.cat((xt_batch, xd_batch), dim=1))
#         target = torch.tensor(train_y2[i:j]).view(-1,1)
#         loss = loss_fn(prediction, target)
#         loss_val += loss.tolist()
#         loss.backward()
#         optimizer.step()
        ##############
        
        
        mse = mean_squared_error(prediction.view(-1).tolist(), target.tolist()) * (j-i)
        loss_mse += mse

    loss_mse /= data_count
    loss_mse = loss_mse ** 0.5
    
        
    print "Training loss at epoch {}: {}".format(epoch+1, loss_val)
    print "Training set score at epoch {}: {}".format(epoch+1, loss_mse)
    
    loss_val = 0
    
    for i,j in val_idx_list:
        t_val = torch.tensor(x_title[i:j], dtype=torch.long)
        d_val = torch.tensor(x_desc[i:j], dtype=torch.long)
        p_val = torch.tensor(x_price[i:j]).view(-1, 1)

        prediction = model(t_val, d_val, p_val)
#         prediction = model(torch.cat((t_val, d_val), dim=1))
        target = train_y2[i:j]
        loss = mean_squared_error(prediction.view(-1).tolist(), target.tolist()) * (j-i)
        loss_val += loss
    
    loss_val /= (len(train_y1) - data_count)
    loss_val = loss_val ** 0.5
    print "Validation score at epoch {}: {}".format(epoch+1, loss_val)
    print "------------------\n"

Batch size: 2000, Batch count:15
Training loss at epoch 1: 39.62837255
Training set score at epoch 1: 0.5690676096
Validation score at epoch 1: 0.540638334962
------------------

Training loss at epoch 2: 29.6202298403
Training set score at epoch 2: 0.528727451185
Validation score at epoch 2: 0.504024833161
------------------

Training loss at epoch 3: 18.502684772
Training set score at epoch 3: 0.48364497401
Validation score at epoch 3: 0.483924226088
------------------

Training loss at epoch 4: 12.9414686561
Training set score at epoch 4: 0.465933511945
Validation score at epoch 4: 0.475922996279
------------------

Training loss at epoch 5: 11.1642659903
Training set score at epoch 5: 0.467059622197
Validation score at epoch 5: 0.46615489
------------------

Training loss at epoch 6: 10.2210943103
Training set score at epoch 6: 0.464437077062
Validation score at epoch 6: 0.463026592092
------------------

Training loss at epoch 7: 9.86550658941
Training set score at epoch 7: 0.4639

Training loss at epoch 55: 130.722166061
Training set score at epoch 55: 0.562080545109
Validation score at epoch 55: 0.556322102189
------------------

Training loss at epoch 56: 130.722166061
Training set score at epoch 56: 0.562080545109
Validation score at epoch 56: 0.556322102189
------------------

Training loss at epoch 57: 130.722166061
Training set score at epoch 57: 0.562080545109
Validation score at epoch 57: 0.556322102189
------------------

Training loss at epoch 58: 130.722166061
Training set score at epoch 58: 0.562080545109
Validation score at epoch 58: 0.556322102189
------------------

Training loss at epoch 59: 130.722166061
Training set score at epoch 59: 0.562080545109
Validation score at epoch 59: 0.556322102189
------------------

Training loss at epoch 60: 130.722166061
Training set score at epoch 60: 0.562080545109
Validation score at epoch 60: 0.556322102189
------------------

Training loss at epoch 61: 130.722166061
Training set score at epoch 61: 0.5620805