In [2]:
import torch 
import re
import pandas as pd
import numpy as np
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence
from gensim.models import Word2Vec
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import matplotlib.pyplot as plt
from tqdm import tqdm
import torch.nn.functional as F
import torch.nn.utils.rnn as rnn_utils


In [3]:
random_seed = 42

In [4]:
X = pd.read_csv('cleaned_wine_training_data.csv')
# the data is already cleaned, so we can just tokenize the words
# tokenize the description data
X['description'] = X['description'].apply(lambda x: word_tokenize(x))
X.dropna(subset= ['description'], inplace=True)
X.dropna(subset= ['price'], inplace=True)

# train a word2vec model on the description data
prelim = Word2Vec(X['description'], min_count=1, workers=3, window=3, sg=1)

In [7]:
# how many words are in my word2vec model
print(len(X['description']))
print((prelim.wv.vectors.shape))
print(len(prelim.wv))





93964
(36973, 100)
36973


In [9]:
from torch.utils.data import DataLoader

# Assuming you have already transformed reviews into matrices
# You might need to convert them into torch tensors before passing to DataLoader



In [11]:
import matplotlib.pyplot as plt

In [13]:
input_data_embeddings = []
max_length = 0

print(X['description'].head())
for review in X['description']:
    embedding_sequence = [prelim.wv[word] for word in review if word in prelim.wv]
    input_data_embeddings.append(embedding_sequence)
    max_length = max(max_length, len(embedding_sequence))

# Pad sequences to the same length
input_data_tensors = [torch.tensor(seq, dtype=torch.float32) for seq in input_data_embeddings]
input_data_tensors_padded = pad_sequence(input_data_tensors, batch_first=True)

# You can check the shape of input_data_tensors_padded to see the maximum length
print("Shape of padded input data:", input_data_tensors_padded.shape)

# Assuming X["price"] contains your target prices
target_tensors = torch.tensor(X["price"], dtype=torch.float32)

0    [rich, ripe, blackberry, cassis, leathery, aro...
2    [distinctive, dessert, wine, open, inky, dark,...
3    [great, price, chardonnay, taste, like, cost, ...
4    [grapefruit, lemon, star, anise, aroma, lead, ...
5    [made, exclusively, pinot, noir, rich, highly,...
Name: description, dtype: object


  input_data_tensors = [torch.tensor(seq, dtype=torch.float32) for seq in input_data_embeddings]


Shape of padded input data: torch.Size([93964, 75, 100])


In [18]:
#create a CNN architecture that can handle variable length sequences

from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

class CNNVariableLength2(nn.Module):
    def __init__(self, input_dim, output_dim, num_filters, kernel_sizes, embedding_dim):
        super(CNNVariableLength2, self).__init__()
        
        # self.embedding = nn.Embedding(input_dim, embedding_dim)
        
        # self.convs = nn.ModuleList([
        #     nn.Conv1d(in_channels=embedding_dim, 
        #               out_channels=num_filters, 
        #               kernel_size=kernel_size)
        #     for kernel_size in kernel_sizes
        # ])

        self.conv1 = nn.Conv2d(in_channels=1, out_channels=num_filters[0], kernel_size=kernel_sizes[0])
        self.conv2 = nn.Conv2d(in_channels=num_filters[0], out_channels=num_filters[1], kernel_size=kernel_sizes[1])
        self.conv3 = nn.Conv2d(in_channels=num_filters[1], out_channels=num_filters[2], kernel_size=kernel_sizes[2])
        # self.conv4 = nn.Conv2d(in_channels=num_filters[2], out_channels=num_filters[3], kernel_size=kernel_sizes[3])
        # self.conv5 = nn.Conv2d(in_channels=num_filters[3], out_channels=num_filters[4], kernel_size=kernel_sizes[4])
        # self.conv6 = nn.Conv2d(in_channels=num_filters[4], out_channels=num_filters[5], kernel_size=kernel_sizes[5])
        

        self.global_pool = nn.AdaptiveMaxPool2d(1)  # Global pooling
        self.relu = nn.functional.relu

        self.fc = nn.Linear(6, output_dim)

    def forward(self, x):
       
        lengths = (x.any(dim=2) != 0).sum(dim=1)
        lengths, perm_idx = lengths.sort(0, descending=True)
        
        x = x[perm_idx]
    
        x = pack_padded_sequence(x, lengths, batch_first=True)

        x, _ = rnn_utils.pad_packed_sequence(x, batch_first=True)

        x = x.unsqueeze(1)

        x = self.relu(self.conv1(x))

        x = self.relu(self.conv2(x))

        x = self.relu(self.conv3(x))

        # x = self.relu(self.conv4(x))

        # x = self.relu(self.conv5(x))

        # x = self.relu(self.conv6(x))


        x = self.global_pool(x)

        x = x.squeeze(2)
     
        x = x.squeeze(2)

        x = x.flatten(1)
       

        output = self.fc(x)  # [batch_size, output_dim]
      
        return output, x

In [19]:
from torch.nn.utils.rnn import pad_sequence

class MyDataset(torch.utils.data.Dataset):
    def __init__(self, data, labels):
        self.data = data
        self.labels = labels

    def __getitem__(self, index):
        return self.data[index], self.labels[index]

    def __len__(self):
        return len(self.data)

# Assuming `data` and `labels` are lists of tensors
# dataset = MyDataset(data, labels)

def collate_fn(batch):
    data, labels = zip(*batch)
    data = pad_sequence(data, batch_first=True)
    labels = torch.stack(labels)
    return data, labels

# data_loader = torch.utils.data.DataLoader(dataset, batch_size=32, collate_fn=collate_fn)

In [45]:
import torch.nn as nn
import numpy as np
from sklearn.utils import shuffle

class CustomLoss(nn.Module):
    def __init__(self):
        super(CustomLoss, self).__init__()

    def forward(self, inputs, targets, reps, criterion=nn.MSELoss()):
        resps_np = reps.detach().numpy()
        targets_np = targets.detach().numpy()
        resps_np, targets_np = shuffle(resps_np, targets_np, random_state=0)
        shuffled_reps = torch.tensor(resps_np, dtype=torch.float32)

        shuffled_targets = torch.tensor(targets_np, dtype=torch.float32)
        rep_loss = abs(criterion(reps, shuffled_reps) - criterion(targets, shuffled_targets))

        loss = criterion(inputs, targets) + rep_loss
        return loss.mean()

In [21]:
import torch.optim as optim
from tqdm import tqdm


def train_custom(model, data_loader, num_epochs=10, learning_rate=0.001):
    print("TRAINING, learning rate:", learning_rate)
    model.train()
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    crit = CustomLoss()


    loss_list = []

    for epoch in range(num_epochs):
        running_loss = 0.0
        for i, data in tqdm(enumerate(data_loader), total=len(data_loader)):
            inputs, labels = data
            optimizer.zero_grad()
            outputs,rep = model(inputs)
            outputs = outputs.squeeze(1)
            loss = crit(outputs, labels, rep)
        
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
     
        loss_list.append(running_loss / len(data_loader))
        print(epoch + 1, running_loss / len(data_loader))

    print('Finished Training')
    plt.plot(loss_list)
    plt.savefig('loss.png')




In [42]:
import torch.optim as optim
from tqdm import tqdm


def test(model, data_loader):
    model.eval()
    criterion = nn.MSELoss()
    # optimizer = optim.Adam(model.parameters(), lr=0.001)


    loss_list = []
    predictions = []
    true_valus = []

    running_loss = 0.0
    correct = 0
    total = 0
    correct10 = 0
    total10 = 0
    
    for i, data in tqdm(enumerate(data_loader), total=len(data_loader)):
        inputs, labels = data
        outputs,rep = model(inputs)
        outputs = outputs.squeeze(1)
        for i in range(len(outputs)):
            predictions.append(outputs[i])
            true_valus.append(labels[i])
        
        loss = criterion(outputs, labels)

        running_loss += loss.item()
        #add to correct if within 20% of the true price
        correct += sum(abs(outputs - labels) < 0.2 * labels)
        correct10 += sum(abs(outputs - labels) < 10 + labels)
        total += len(labels)



    loss_list.append(running_loss / len(data_loader))
    print("Testing : ", running_loss / len(data_loader))
    print("Correct: ", correct / total)
    print("Correct10: ", correct10 / total)

    print('Finished Testing')

    return running_loss / len(data_loader)





In [23]:
# create a dataset and dataloader using the non-padded input data
# batch_size = 32
dataset = MyDataset(input_data_tensors, target_tensors)
data_loader = torch.utils.data.DataLoader(dataset, batch_size=32, collate_fn=collate_fn)


In [25]:
X_val = pd.read_csv('cleaned_wine_validation_data.csv')
X_val['description'] = X_val['description'].apply(lambda x: word_tokenize(x))
X_val.dropna(subset= ['description'], inplace=True)
X_val.dropna(subset= ['price'], inplace=True)


input_data_embeddings = []
max_length = 0

print(X_val['description'].head())
for review in X_val['description']:
    embedding_sequence = [prelim.wv[word] for word in review if word in prelim.wv]
    input_data_embeddings.append(embedding_sequence)
    max_length = max(max_length, len(embedding_sequence))

# Pad sequences to the same length
input_data_tensors_val = [torch.tensor(seq, dtype=torch.float32) for seq in input_data_embeddings]
input_data_tensors_padded_val = pad_sequence(input_data_tensors_val, batch_first=True)

# You can check the shape of input_data_tensors_padded to see the maximum length
print("Shape of padded input data:", input_data_tensors_padded_val.shape)

# Assuming X["price"] contains your target prices
target_tensors_val = torch.tensor(X_val["price"], dtype=torch.float32)

0    [made, cool, part, napa, valley, cool, vintage...
1    [initial, aroma, boxwood, give, way, melon, cr...
2    [smoky, leathery, barrel, flavor, soft, cabern...
3    [ebullient, aroma, ripe, yellow, peach, tanger...
4    [informal, red, open, aroma, suggest, darkskin...
Name: description, dtype: object
Shape of padded input data: torch.Size([22699, 72, 100])


In [26]:
X_test = pd.read_csv('cleaned_wine_testing_data.csv')
X_test['description'] = X_test['description'].apply(lambda x: word_tokenize(x))
X_test.dropna(subset= ['description'], inplace=True)
X_test.dropna(subset= ['price'], inplace=True)


input_data_embeddings = []
max_length = 0

print(X_test['description'].head())
for review in X_test['description']:
    embedding_sequence = [prelim.wv[word] for word in review if word in prelim.wv]
    input_data_embeddings.append(embedding_sequence)
    max_length = max(max_length, len(embedding_sequence))

# Pad sequences to the same length
input_data_tensors_test = [torch.tensor(seq, dtype=torch.float32) for seq in input_data_embeddings]
input_data_tensors_padded_test = pad_sequence(input_data_tensors_test, batch_first=True)

# You can check the shape of input_data_tensors_padded to see the maximum length
print("Shape of padded input data:", input_data_tensors_padded_test.shape)

# Assuming X["price"] contains your target prices
target_tensors_test = torch.tensor(X_test["price"], dtype=torch.float32)

0    [aroma, recall, ripe, dark, berry, toast, whif...
3    [catarratto, one, sicily, widely, farmed, whit...
4    [right, starting, block, oaky, wine, dripping,...
5    [fruity, lightly, herbaceous, fine, textured, ...
6    [show, jellylike, flavor, orange, pear, earthy...
Name: description, dtype: object
Shape of padded input data: torch.Size([19983, 68, 100])


In [27]:
# create a dataset and dataloader using the non-padded input data
# batch_size = 32
dataset_val = MyDataset(input_data_tensors_val, target_tensors_val)
data_loader_val = torch.utils.data.DataLoader(dataset_val, batch_size=32, collate_fn=collate_fn)
dataset_test = MyDataset(input_data_tensors_test, target_tensors_test)
data_loader_test = torch.utils.data.DataLoader(dataset_test, batch_size=32, collate_fn=collate_fn)


In [44]:
learning_rates = [ .001,.01,.1,1,10,100]

val_losses = []
best_model = None
best_loss = 1000000
learn = 0

for i in learning_rates:
    model = CNNVariableLength2(input_dim=prelim.wv.vectors.shape[0], output_dim=1, num_filters=[10,8,6], kernel_sizes=[2,3,4], embedding_dim=prelim.wv.vectors.shape[1])
    
    optimizer = optim.Adam(model.parameters(), lr=i)
    criterion = nn.MSELoss()
    train_custom(model, data_loader, num_epochs=3, learning_rate=i)


    l = test(model, data_loader_val)
    if l < best_loss:
        best_loss = l
        best_model = model
        learn = i
    val_losses.append(l)

print("Losses over Learning Rates: ", val_losses)
print("Best Learning Rate = ", learn)
print("Best Loss = ", best_loss)
print("Best Model: ")
print(best_model)

TRAINING, learning rate: 0.001


 84%|████████▎ | 2458/2937 [01:44<00:20, 23.60it/s]


KeyboardInterrupt: 

In [41]:
x = test(best_model, data_loader_val)
print(x)

100%|██████████| 710/710 [00:49<00:00, 14.28it/s]


Testing :  1366.3885808810383
Correct:  tensor(0.1938)
Correct10:  tensor(0.8916)
Finished Testing
<class 'list'>
1366.3885808810383
