In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# Seed PyTorch's global RNG so randomly initialised weights are repeatable between runs.
torch.manual_seed(1)

<torch._C.Generator at 0x7f0febd1be50>

In [3]:
import pandas as pd
#import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

In [13]:
import sys
#sys.path.insert(0, '/home/ooo/projects/housingprice/code')
# NOTE(review): machine-specific absolute path, presumably the directory that
# holds the project-local `vocab` module; the import below only works where
# this directory exists. Consider a path relative to the notebook instead.
sys.path.insert(0, '/home/gangwu/cs224n/housingprice/code')
import vocab
from vocab import get_glove

In [14]:
# Width of each GloVe vector; it has to agree with the "50d" file named below.
embedding_size = 50
glove_path = '../data/glove/glove.6B.50d.txt'

# get_glove hands back the embedding matrix together with the word -> id
# and id -> word lookup tables.
emb_matrix, word2id, id2word = get_glove(
    glove_path,
    embedding_size,
)

  0%|          | 2188/2196017 [00:00<01:40, 21875.17it/s]

Loading GLoVE vectors from file: ../data/glove/glove.6B.50d.txt


 18%|█▊        | 400000/2196017 [00:10<00:47, 37742.55it/s]


In [30]:
from vocab import PAD_ID, UNK_ID
import re

def split_by_whitespace(sentence):
    """Break a sentence into its non-empty whitespace-separated fragments.

    Leading and trailing whitespace is ignored and runs of whitespace count
    as a single separator, so the returned list never holds empty strings.
    """
    fragments = sentence.strip().split()
    # Each fragment is additionally split on single spaces, then empties dropped.
    pieces = (piece for fragment in fragments for piece in re.split(" ", fragment))
    return [piece for piece in pieces if piece]

def cleanup_tokens(tokens):
    """Normalise tokens to lower-cased alphanumeric words.

    Every character that is not alphanumeric is removed from each token and
    the remaining characters are lower-cased one by one. Tokens that end up
    empty (e.g. pure punctuation) are left out of the result.
    """
    normalized = (
        ''.join(ch.lower() for ch in filter(str.isalnum, token))
        for token in tokens
    )
    return [word for word in normalized if word]

def sentence_to_token_ids(sentence, word2id):
    """Tokenise a sentence and look up an id for every cleaned token.

    The sentence is split on whitespace, each fragment is reduced to its
    lower-cased alphanumeric characters, and the surviving words are mapped
    through ``word2id``. Words missing from the mapping get ``UNK_ID``.

    Returns a 3-tuple ``(tokens, clean_tokens, ids)``: the raw whitespace
    tokens, the cleaned words, and one id per cleaned word.
    """
    raw_tokens = split_by_whitespace(sentence)
    normalized = cleanup_tokens(raw_tokens)
    id_sequence = []
    for word in normalized:
        id_sequence.append(word2id.get(word, UNK_ID))
    return raw_tokens, normalized, id_sequence

def padded(idsList, tgtLength):
    """Return ``idsList`` cut or right-padded with ``PAD_ID`` to ``tgtLength``.

    Lists longer than the target are sliced to ``idsList[:tgtLength]``;
    shorter (or equal-length) lists get ``PAD_ID`` entries appended.
    """
    shortfall = tgtLength - len(idsList)
    if shortfall >= 0:
        return idsList + [PAD_ID] * shortfall
    return idsList[:tgtLength]

In [31]:
# Every description is truncated or PAD-padded to exactly this many token ids.
MAX_DISC_TOKENS = 100

df_train = pd.read_csv('../csvFiles/clean.csv')

# Read the columns directly instead of looping with iterrows(): iterrows()
# builds a Series per row and does not preserve per-column dtypes.
priceList = df_train['sold_price'].tolist()

# Only the id list (third element of the returned tuple) is needed here.
discList = [
    padded(sentence_to_token_ids(disc, word2id)[2], MAX_DISC_TOKENS)
    for disc in df_train['Disc']
]

In [32]:
# Sanity check: there should be one price and one padded id list per CSV row.
print(len(priceList), len(discList), sep='\n')

1063
1063


In [33]:
# Eyeball one example: its sold price followed by its padded token ids.
print(priceList[5], discList[5], sep='\n')

485000
[57343, 7877, 407, 1868, 1239, 940, 394, 4489, 6, 39, 12389, 145, 3130, 165, 19, 4233, 5, 1397, 707, 8, 2, 1, 2403, 19, 9, 2387, 1656, 5, 1888, 746, 115574, 61288, 8, 758, 7, 7212, 929, 52, 4060, 15, 15974, 7613, 6060, 1, 11194, 2213, 12661, 12, 196, 280, 7672, 33, 1158, 6, 1280, 2, 758, 865, 48, 892, 77, 47044, 165, 636, 1656, 424, 69409, 257, 24, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [67]:
class LSTMDisc(nn.Module):
    """LSTM regressor that scores a tokenised listing description.

    Token ids are embedded with a pre-trained embedding table, fed through a
    single-layer LSTM one sequence at a time (batch size 1), and every LSTM
    output step is projected to a single real number.
    """

    def __init__(self, embedding_dim, hidden_dim, vocab_size,
                 pretrained_embeddings=None):
        """Build the embedding, LSTM and output layers.

        Args:
            embedding_dim: Width of each word vector.
            hidden_dim: Size of the LSTM hidden state.
            vocab_size: Number of rows in the embedding table.
            pretrained_embeddings: Optional array-like of shape
                ``(vocab_size, embedding_dim)`` used to initialise the
                embedding weights. When omitted, the notebook-level
                ``emb_matrix`` is used, as before.
        """
        super(LSTMDisc, self).__init__()
        self.hidden_dim = hidden_dim

        if pretrained_embeddings is None:
            # Backward-compatible fallback to the global defined by the notebook.
            pretrained_embeddings = emb_matrix

        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        # copy_ casts to the parameter dtype and raises if the shapes are
        # incompatible with (vocab_size, embedding_dim).
        self.word_embeddings.weight.data.copy_(torch.as_tensor(pretrained_embeddings))
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)

        # target size is 1, as the sold price is one number
        self.hidden2tag = nn.Linear(hidden_dim, 1)
        self.hidden = self.init_hidden()

    def init_hidden(self):
        """Return a zeroed ``(h_0, c_0)`` pair for one layer and batch size 1.

        The tensors are allocated with the dtype and device of the model's
        parameters so the state stays usable after the module is moved.
        """
        reference = self.hidden2tag.weight
        return (reference.new_zeros(1, 1, self.hidden_dim),
                reference.new_zeros(1, 1, self.hidden_dim))

    def forward(self, sentence):
        """Score every token position of a single description.

        Args:
            sentence: 1-D LongTensor of token ids.

        Returns:
            Tensor of shape ``(len(sentence), 1)`` holding the raw linear
            output for each time step.
        """
        seq_len = len(sentence)
        # Start every sequence from a zero state. Reusing the state left by
        # the previous call makes results depend on call history and keeps
        # the old autograd graph attached, which breaks repeated backward().
        self.hidden = self.init_hidden()
        embeds = self.word_embeddings(sentence)
        # nn.LSTM expects (seq_len, batch, features); batch is fixed at 1.
        lstm_out, self.hidden = self.lstm(embeds.view(seq_len, 1, -1), self.hidden)
        # Return the raw regression output. A log_softmax over the size-1
        # last dimension would be identically zero and carry no gradient.
        return self.hidden2tag(lstm_out.view(seq_len, -1))

In [68]:
# Both sizes are 50. EMBEDDING_DIM has to equal the width of the pre-trained
# vectors copied into the embedding layer; HIDDEN_DIM is the LSTM state size.
EMBEDDING_DIM = HIDDEN_DIM = 50

In [69]:
# One embedding row per entry of the pre-trained matrix.
model = LSTMDisc(EMBEDDING_DIM, HIDDEN_DIM, len(emb_matrix))

# The target is a continuous sold price, so use a regression loss. NLLLoss
# expects log-probabilities and integer class targets, which does not fit a
# real-valued price.
loss_function = nn.MSELoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

In [73]:
# Smoke test: push one padded description through the untrained model.
inputs = torch.tensor(discList[5], dtype=torch.long)

# Gradients are not needed just to inspect the output shape.
with torch.no_grad():
    tag_scores = model(inputs)

print(tag_scores.shape)

torch.Size([100, 1])
