In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/car-description-dataset-summarized/car_Summarized_Description.csv


In [2]:
data = pd.read_csv('/kaggle/input/car-description-dataset-summarized/car_Summarized_Description.csv')
data.head(2)

Unnamed: 0.1,Unnamed: 0,car type,car name,year,url,Pros,Cons,Full Description,rating,current price Min,current price Max,base msrp,engine,drivetrain,dimensions,num_words,Full Name,GPT-2 Summarization,num_words_summarized
0,0,jeep,jeep wrangler,1998,https://www.edmunds.com/jeep/wrangler/1998/rev...,"['Unmatched off-road capability, overflowing w...","['Soft top is fun, but still a pain in the $%!...",Jeep has improved off-road capability by incre...,4.3 out of 5 stars,"$2,423","$4,074",N/A \n,"Inline 4 cylinder \n Horsepower: 120 hp @ 5,40...",Type: four wheel drive \n Transmission: 5-spee...,Length: 147.7 in. / Height: 69.6 in. \n Overal...,427,jeep wrangler 1998,Jeep has improved off-road capability by incre...,427
1,1,toyota,toyota 4runner,2005,https://www.edmunds.com/toyota/4runner/2005/re...,"['Powerful engine lineup, well mannered on pav...","[""Cargo capacity isn't much more than what man...",The base V6 now comes with a five-speed automa...,4.8 out of 5 stars,"$4,364","$6,983","$27,795 \n","V6 cylinder \n Horsepower: 245 hp @ 5,200 rpm ...",Type: rear wheel drive \n Transmission: 5-spee...,Length: 189.0 in. / Height: 68.5 in. \n Overal...,110,toyota 4runner 2005,The base V6 now comes with a five-speed automa...,110


In [3]:
data = data[['GPT-2 Summarization', 'current price Min', 'current price Max']]
data['current price Min'] = data['current price Min'].str.replace('$', '')
data['current price Max'] = data['current price Max'].str.replace('$', '')
data['current price Min'] = data['current price Min'].str.replace(',', '.')
data['current price Max'] = data['current price Max'].str.replace(',', '.')
data = data.dropna(subset=['current price Min', 'current price Max'])
data = data[data['current price Min'] != 'Not Available']
data['current price Min'] = pd.to_numeric(data['current price Min'])
data['current price Max'] = pd.to_numeric(data['current price Max'])


  data['current price Min'] = data['current price Min'].str.replace('$', '')
  data['current price Max'] = data['current price Max'].str.replace('$', '')


In [4]:
data

Unnamed: 0,GPT-2 Summarization,current price Min,current price Max
0,Jeep has improved off-road capability by incre...,2.423,4.074
1,The base V6 now comes with a five-speed automa...,4.364,6.983
2,"SR5 models have new 16-inch wheels, and a limi...",3.324,5.612
3,The 1998 four-wheel-drive Tacomas receive fres...,2.884,4.518
4,No changes for the Ford Ranger.,3.675,5.757
...,...,...,...
3149,The Toyota Camry receives a thorough overhaul ...,10.991,22.991
3150,"For 2016, the Chevrolet Equinox receives revis...",10.944,23.990
3151,"For 2014, the Chevy Suburban sees the heavy-du...",13.760,27.590
3152,The 2010 GMC Savana Cargo gets a remote vehicl...,7.876,10.648


In [5]:
data = data.reset_index(drop=True)

In [6]:
data.index

RangeIndex(start=0, stop=3043, step=1)

In [9]:
data.dtypes

GPT-2 Summarization     object
current price Min      float64
current price Max      float64
dtype: object

In [7]:
from tqdm.auto import tqdm

In [20]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoModel

# define the dataset class
class CustomDataset(Dataset):
    def __init__(self, data, tokenizer):
        self.tokenizer = tokenizer
        self.text = data['GPT-2 Summarization']
        self.labels = data[['current price Min', 'current price Max']].values.astype('float32')
    
    def __len__(self):
        return len(self.text)
    
    def __getitem__(self, idx):        text = self.text[idx]
        label = torch.tensor(self.labels[idx])
        
        
        encoding = self.tokenizer(text, max_length=512, truncation=True, padding='max_length', return_tensors='pt')
        input_ids = encoding['input_ids'][0]
        attention_mask = encoding['attention_mask'][0]
        return input_ids, attention_mask, label

# define the model class
class CustomModel(nn.Module):
    def __init__(self, model_name, num_labels):
        super(CustomModel, self).__init__()
        self.bert = AutoModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(0.1)
        self.linear1 = nn.Linear(self.bert.config.hidden_size, 64)
        self.linear2 = nn.Linear(64, num_labels)
        
    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs[1]
        pooled_output = self.dropout(pooled_output)
        pooled_output = self.linear1(pooled_output)
        logits = self.linear2(pooled_output)
        return logits

# define the training function
def train(model, train_loader, val_loader, device, num_epochs, lr):
    # define the optimizer and loss function
    optimizer = optim.AdamW(model.parameters(), lr=lr)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', patience=2, factor=0.1, verbose=True)
    criterion = nn.MSELoss()
    model.to(device)
    
    
    best_val_loss = float('inf')
    early_stopping_rounds = 3  # Stop training if validation loss doesn't improve after 3 epochs
    early_stopping_count = 0
    
    for epoch in tqdm(range(num_epochs)):
        # train for one epoch
        model.train()
        train_loss = 0
        for input_ids, attention_mask, labels in tqdm(train_loader):
            optimizer.zero_grad()
            logits = model(input_ids.to(device), attention_mask.to(device))
            loss = criterion(logits, labels.to(device))
            loss.backward()
            optimizer.step()
            train_loss += loss.item()

        # validate after each epoch
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for input_ids, attention_mask, labels in val_loader:
                logits = model(input_ids.to(device), attention_mask.to(device))
                loss = criterion(logits, labels.to(device))
                val_loss += loss.item()
        print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss/len(train_loader):.4f}, Val Loss: {val_loss/len(val_loader):.4f}")
        scheduler.step(val_loss)
        
        if val_loss >= best_val_loss:
            print("early_stopping_count: ", early_stopping_count)
            early_stopping_count += 1
        else:
            best_val_loss = val_loss
            early_stopping_count = 0
        
        # If the early stopping count exceeds the allowed rounds, stop training
        if early_stopping_count >= early_stopping_rounds:
            print(f'Validation loss did not improve for {early_stopping_rounds} epochs. Stopping training.')
            break


In [21]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [22]:
from sklearn.model_selection import train_test_split

train_data, test_data = train_test_split(data, test_size=0.1)

In [23]:
train_data = train_data.reset_index(drop=True)

In [24]:
test_data = test_data.reset_index(drop=True)

In [25]:
train_data.index

RangeIndex(start=0, stop=2738, step=1)

In [17]:
train_data.loc[719]

GPT-2 Summarization    Optional color-coordinated fender flares are a...
current price Min                                                  3.052
current price Max                                                  4.952
Name: 719, dtype: object

In [67]:
data.loc[2515]['GPT-2 Summarization']



In [26]:
# create the tokenizer and encode the data
#tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
train_dataset = CustomDataset(train_data, tokenizer)
test_dataset = CustomDataset(test_data, tokenizer)

# create the data loaders
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16)

# create the model and train
model = CustomModel('bert-base-uncased', num_labels=2)



Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [29]:
model.bert.config.max_position_embeddings

512

In [27]:
train(model, train_loader, test_loader, device, num_epochs=30, lr=2e-5)

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/172 [00:00<?, ?it/s]

Epoch 1/30, Train Loss: 738.9391, Val Loss: 385.3502


  0%|          | 0/172 [00:00<?, ?it/s]

Epoch 2/30, Train Loss: 521.2735, Val Loss: 231.1024


  0%|          | 0/172 [00:00<?, ?it/s]

Epoch 3/30, Train Loss: 399.3268, Val Loss: 162.2168


  0%|          | 0/172 [00:00<?, ?it/s]

Epoch 4/30, Train Loss: 343.6678, Val Loss: 120.6380


  0%|          | 0/172 [00:00<?, ?it/s]

Epoch 5/30, Train Loss: 301.2258, Val Loss: 99.3240


  0%|          | 0/172 [00:00<?, ?it/s]

Epoch 6/30, Train Loss: 278.7175, Val Loss: 82.3713


  0%|          | 0/172 [00:00<?, ?it/s]

Epoch 7/30, Train Loss: 255.4355, Val Loss: 68.1969


  0%|          | 0/172 [00:00<?, ?it/s]

Epoch 8/30, Train Loss: 239.6569, Val Loss: 63.4840


  0%|          | 0/172 [00:00<?, ?it/s]

Epoch 9/30, Train Loss: 225.1690, Val Loss: 57.3607


  0%|          | 0/172 [00:00<?, ?it/s]

Epoch 10/30, Train Loss: 218.1841, Val Loss: 59.8566
early_stopping_count:  0


  0%|          | 0/172 [00:00<?, ?it/s]

Epoch 11/30, Train Loss: 207.3363, Val Loss: 47.9358


  0%|          | 0/172 [00:00<?, ?it/s]

Epoch 12/30, Train Loss: 198.3892, Val Loss: 47.3931


  0%|          | 0/172 [00:00<?, ?it/s]

Epoch 13/30, Train Loss: 193.2616, Val Loss: 54.0904
early_stopping_count:  0


  0%|          | 0/172 [00:00<?, ?it/s]

Epoch 14/30, Train Loss: 191.9912, Val Loss: 48.5815
early_stopping_count:  1


  0%|          | 0/172 [00:00<?, ?it/s]

Epoch 15/30, Train Loss: 187.6062, Val Loss: 39.1048


  0%|          | 0/172 [00:00<?, ?it/s]

Epoch 16/30, Train Loss: 181.4841, Val Loss: 43.8389
early_stopping_count:  0


  0%|          | 0/172 [00:00<?, ?it/s]

Epoch 17/30, Train Loss: 1157.6082, Val Loss: 47.0047
early_stopping_count:  1


  0%|          | 0/172 [00:00<?, ?it/s]

Epoch 18/30, Train Loss: 174.0543, Val Loss: 42.3312
Epoch 00018: reducing learning rate of group 0 to 2.0000e-06.
early_stopping_count:  2
Validation loss did not improve for 3 epochs. Stopping training.


In [40]:
# model.save_pretrained('FineTune_bert_text_description_regression')

In [38]:
# define the file path and name for the saved model
model_path = 'FineTune_bert_text_description_regression.pt'
# save the model
torch.save(model.state_dict(), model_path)

In [39]:
# create a new instance of the model
new_model = CustomModel('bert-base-uncased', num_labels=2)

# load the saved model
new_model.load_state_dict(torch.load("FineTune_bert_text_description_regression.pt"))


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


<All keys matched successfully>

In [41]:
%cd /kaggle/working

/kaggle/working


In [42]:
from IPython.display import FileLink
FileLink(r'FineTune_bert_text_description_regression.pt')

In [32]:
def predict(model, tokenizer, text):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.eval()
    model.to(device)
    encoding = tokenizer(text, max_length=512, truncation=True, padding='max_length', return_tensors='pt').to(device)
    input_ids = encoding['input_ids']
    attention_mask = encoding['attention_mask']
    with torch.no_grad():
        logits = model(input_ids, attention_mask)
        predictions = logits.cpu().numpy()
    return predictions


In [35]:
data.loc[1000]

GPT-2 Summarization    Highlights include a longer standard features ...
current price Min                                                  12.49
current price Max                                                  21.59
Name: 1000, dtype: object

In [36]:
predict(model, tokenizer, data.loc[1000]['GPT-2 Summarization'])

array([[11.503025, 17.170656]], dtype=float32)