# Chat Based Recommendation System

In [1]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.33.2-py3-none-any.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m44.8 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.15.1 (from transformers)
  Downloading huggingface_hub-0.17.2-py3-none-any.whl (294 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.9/294.9 kB[0m [31m25.1 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m88.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m59.3 MB/s[0m eta [36m0:00:0

In [19]:
import pandas as pd
from transformers import GPT2LMHeadModel, GPT2Tokenizer, AdamW, get_linear_schedule_with_warmup
import torch

def load_and_preprocess_data(file_path):
    df = pd.read_csv(file_path)

    selected_columns = ['Product', 'Main Category', 'Sub Category', 'Actual Price' ]

    text_data = df[selected_columns].apply(lambda row: ' , '.join(map(str, row)), axis=1)

    text_data = "\n".join(text_data)

    return text_data


data_path = "/content/sports.csv"
text_data = load_and_preprocess_data(data_path)

model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)


max_seq_length = 256
input_ids = tokenizer.encode(text_data, return_tensors='pt', max_length=max_seq_length, truncation=True)


learning_rate = 2e-4
epochs = 15
batch_size = 8

optimizer = AdamW(model.parameters(), lr=learning_rate)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(input_ids) * epochs)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

for epoch in range(epochs):
    model.train()
    for i in range(0, len(input_ids), batch_size):
        batch_input_ids = input_ids[i:i+batch_size].to(device)
        optimizer.zero_grad()

        outputs = model(input_ids=batch_input_ids, labels=batch_input_ids)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        scheduler.step()

        if i % 100 == 0:
            print(f"Epoch [{epoch+1}/{epochs}] - Batch [{i}/{len(input_ids)}] - Loss: {loss.item()}")




Epoch [1/15] - Batch [0/1] - Loss: 4.297889232635498
Epoch [2/15] - Batch [0/1] - Loss: 3.2711050510406494
Epoch [3/15] - Batch [0/1] - Loss: 2.777663230895996
Epoch [4/15] - Batch [0/1] - Loss: 2.349323034286499
Epoch [5/15] - Batch [0/1] - Loss: 1.9780586957931519
Epoch [6/15] - Batch [0/1] - Loss: 1.654369831085205
Epoch [7/15] - Batch [0/1] - Loss: 1.3463140726089478
Epoch [8/15] - Batch [0/1] - Loss: 1.194441795349121
Epoch [9/15] - Batch [0/1] - Loss: 0.8933115601539612
Epoch [10/15] - Batch [0/1] - Loss: 0.7515425086021423
Epoch [11/15] - Batch [0/1] - Loss: 0.6275886297225952
Epoch [12/15] - Batch [0/1] - Loss: 0.5562211871147156
Epoch [13/15] - Batch [0/1] - Loss: 0.45950597524642944
Epoch [14/15] - Batch [0/1] - Loss: 0.4424612522125244
Epoch [15/15] - Batch [0/1] - Loss: 0.38575735688209534


In [20]:
model.save_pretrained("gpt2-trained-sports")

print("Trained model saved at the default location")


Trained model saved at the default location


In [24]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

model_path = "gpt2-trained-sports"

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained(model_path)

def generate_recommendation(user_query, max_response_length=50):
    input_ids = tokenizer.encode(user_query, return_tensors="pt")

    with torch.no_grad():
        output_ids = model.generate(input_ids, max_length=max_response_length, num_return_sequences=1)

    generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return generated_text

input_text = "best cricket kit for boys?"
restaurant_recommendation = generate_recommendation(input_text)
print(restaurant_recommendation)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


best cricket kit for boys?

I think it's a good idea.

I think it's a good idea.

I think it's a good idea.

I think it's a good idea.

I think
