In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

torch.manual_seed(1)

<torch._C.Generator at 0x7f56c0016570>

In [2]:
import pandas as pd
#import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

In [4]:
import sys
sys.path.insert(0, '/home/ooo/projects/housingprice/code')
#sys.path.insert(0, '/home/gangwu/cs224n/housingprice/code')
import vocab
from vocab import get_glove

In [5]:
glove_path = '../data/glove/glove.6B.50d.txt'
embedding_size = 50
emb_matrix, word2id, id2word = get_glove(glove_path, embedding_size)

  0%|          | 7015/2196017 [00:00<00:31, 70142.54it/s]

Loading GLoVE vectors from file: ../data/glove/glove.6B.50d.txt


 18%|█▊        | 400000/2196017 [00:03<00:16, 105737.65it/s]


In [6]:
from vocab import PAD_ID, UNK_ID
import re

def split_by_whitespace(sentence):                               
    words = []                                                             
    for space_separated_fragment in sentence.strip().split():       
        words.extend(re.split(" ", space_separated_fragment))              
    return [w for w in words if w] 

def cleanup_tokens(tokens):
    words = []
    for tk in tokens:
        tmp = ''
        for c in tk:
            if not c.isalnum():
                continue
            tmp += c.lower()
        if tmp:
            words.append(tmp)
    return words

def sentence_to_token_ids(sentence, word2id):                              
    """Turns an already-tokenized sentence string into word indices        
    e.g. "i do n't know" -> [9, 32, 16, 96]
    Note any token that isn't in the word2id mapping gets mapped to the id for UNK
    """
    tokens = split_by_whitespace(sentence) # list of strings
    clean_tokens = cleanup_tokens(tokens)
    ids = [word2id.get(w, UNK_ID) for w in clean_tokens]                         
    return tokens, clean_tokens, ids

def padded(idsList, tgtLength):
    if len(idsList) > tgtLength:
        return idsList[:tgtLength]
    return idsList + [PAD_ID]*(tgtLength - len(idsList))

In [7]:
df_train = pd.read_csv('../csvFiles/clean.csv')

priceList = list()
discList = list()

for idx, row in df_train.iterrows():
    priceList.append(row['sold_price'])
    tmpDisc = row['Disc']
    tokens, clean_tokens, ids = sentence_to_token_ids(tmpDisc, word2id)
    paddedIdsList = padded(ids, 100)
    discList.append(paddedIdsList)

In [8]:
print(len(priceList))
print(len(discList))

1063
1063


In [9]:
print(priceList[5])
print(discList[5])

485000
[57343, 7877, 407, 1868, 1239, 940, 394, 4489, 6, 39, 12389, 145, 3130, 165, 19, 4233, 5, 1397, 707, 8, 2, 1, 2403, 19, 9, 2387, 1656, 5, 1888, 746, 115574, 61288, 8, 758, 7, 7212, 929, 52, 4060, 15, 15974, 7613, 6060, 1, 11194, 2213, 12661, 12, 196, 280, 7672, 33, 1158, 6, 1280, 2, 758, 865, 48, 892, 77, 47044, 165, 636, 1656, 424, 69409, 257, 24, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [55]:
class LSTMDisc(nn.Module):
    
    def __init__(self, embedding_dim, hidden_dim, vocab_size):
        super(LSTMDisc, self).__init__()
        self.hidden_dim = hidden_dim
        
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.word_embeddings.weight.data.copy_(torch.from_numpy(emb_matrix))
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)
        
        # target size is 1, as the sold price is one number
        self.hidden2tag = nn.Linear(hidden_dim, 1)
        self.hidden = self.init_hidden()
        
    def init_hidden(self):
        return (torch.zeros(1, 1, self.hidden_dim),
                torch.zeros(1, 1, self.hidden_dim))
    
    def forward(self, sentence):
        embeds = self.word_embeddings(sentence)
        lstm_out, self.hidden = self.lstm(embeds.view(len(sentence), 1, -1), self.hidden)
        #print(lstm_out)
        #print(lstm_out.shape)
        mean_lstm_out = torch.mean(lstm_out, 0)
        #print(mean_lstm_out)
        #print(mean_lstm_out.shape)
        tag_space = self.hidden2tag(mean_lstm_out)
        #tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_space

In [56]:
EMBEDDING_DIM = 50
HIDDEN_DIM = 50

In [57]:
model = LSTMDisc(EMBEDDING_DIM, HIDDEN_DIM, len(emb_matrix))

In [59]:
with torch.no_grad():
    inputs = torch.tensor(discList[5], dtype=torch.long)
    tag_scores = model(inputs)
    print(tag_scores)

tensor(1.00000e-02 *
       [[-6.4950]])


In [60]:
loss_function = nn.MSELoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

In [68]:
for epoch in range(2):
    for sentence, tags in zip(discList, priceList):

        model.zero_grad()
        model.hidden = model.init_hidden()

        in_sentence = torch.tensor(sentence, dtype=torch.long)
        tag_scores = model(in_sentence)
        
        in_tags = torch.tensor(tags, dtype=torch.float).view(1, 1)

        loss = loss_function(tag_scores, in_tags)
        print(loss)
        loss.backward()
        optimizer.step()

tensor(1.0865e+10)
tensor(1.7348e+10)
tensor(3.7285e+10)
tensor(6.7291e+11)
tensor(4.8202e+11)
tensor(4.2519e+08)
tensor(1.5543e+07)
tensor(7.6955e+08)
tensor(5.9461e+09)
tensor(7.6415e+09)
tensor(1.8671e+10)
tensor(6.8387e+09)
tensor(3.2905e+11)
tensor(1.9977e+11)
tensor(1.7355e+10)
tensor(5.3907e+10)
tensor(1.8483e+11)
tensor(7.5751e+10)
tensor(1.5562e+10)
tensor(3.4565e+10)
tensor(1.0935e+11)
tensor(1.1602e+10)
tensor(2.9714e+08)
tensor(1.1213e+07)
tensor(1.2276e+10)
tensor(5.3333e+09)
tensor(5.5183e+10)
tensor(2.4688e+09)
tensor(3.8186e+10)
tensor(4.7335e+08)
tensor(7.1575e+09)
tensor(2.0038e+11)
tensor(3.9935e+10)
tensor(2.6340e+08)
tensor(1.1445e+10)
tensor(4.3165e+07)
tensor(8.4926e+10)
tensor(8.3828e+10)
tensor(1.0888e+11)
tensor(8.8328e+09)
tensor(2.0509e+10)
tensor(8.3838e+09)
tensor(9.7714e+09)
tensor(1.8795e+10)
tensor(4.9637e+10)
tensor(3.3759e+10)
tensor(2.3831e+10)
tensor(5.7182e+09)
tensor(7.1099e+10)
tensor(5.9258e+09)
tensor(5.0622e+11)
tensor(3.0833e+11)
tensor(1.409

KeyboardInterrupt: 