In [11]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [12]:
data = pd.read_csv('/kaggle/input/dataset/train_dataset.csv')
X = data['instruction'].tolist()
y = data['intent'].tolist()
categories = data['category'].tolist()

In [13]:
le_intent = LabelEncoder()
y_encoded = le_intent.fit_transform(y)

In [14]:
intent=set(y)

In [None]:
X_train, X_val, y_train, y_val, cat_train, cat_val = train_test_split(
    X, y_encoded, categories, test_size=0.2, random_state=42)

# Intent classifier model --bert model

In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(le_intent.classes_))

In [None]:
def encode_data(texts, categories, labels, max_length=128):
    input_ids = []
    attention_masks = []
    for text, category in zip(texts, categories):
        # Combine category and text
        combined_text = f"{category} [SEP] {text}"
        encoded = tokenizer.encode_plus(
            combined_text,
            add_special_tokens=True,
            max_length=max_length,
            padding='max_length',
            return_attention_mask=True,
            truncation=True,
            return_tensors='pt'
        )
        input_ids.append(encoded['input_ids'])
        attention_masks.append(encoded['attention_mask'])

    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)
    labels = torch.tensor(labels)

    return input_ids, attention_masks, labels

In [None]:
train_inputs, train_masks, train_labels = encode_data(X_train, cat_train, y_train)
val_inputs, val_masks, val_labels = encode_data(X_val, cat_val, y_val)


In [None]:
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_loader = DataLoader(train_data, batch_size=128, shuffle=True)
val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_loader = DataLoader(val_data, batch_size=128, shuffle=False)


In [8]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
optimizer = AdamW(model.parameters(), lr=2e-5)




In [None]:
epochs = 3
for epoch in range(epochs):
    model.train()
    for batch in train_loader:
        batch = tuple(t.to(device) for t in batch)
        inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'labels': batch[2]}
        outputs = model(**inputs)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    # Validation
    model.eval()
    val_accuracy = 0
    for batch in val_loader:
        batch = tuple(t.to(device) for t in batch)
        inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'labels': batch[2]}
        with torch.no_grad():
            outputs = model(**inputs)
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        val_accuracy += (predictions == inputs['labels']).float().mean()

    val_accuracy /= len(val_loader)
    print(f"Epoch {epoch+1}/{epochs}, Validation Accuracy: {val_accuracy:.4f}")


**load intent classifier model from hugging face hub   --checking model and evaluating**

In [None]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("sammanamgain/customer_intent_classifier")
model = AutoModelForSequenceClassification.from_pretrained("sammanamgain/customer_intent_classifier")

In [6]:
def predict_intent(text, category):
    combined_text = f"{category} [SEP] {text}"
    inputs = tokenizer(combined_text, return_tensors='pt', truncation=True, padding=True, max_length=128)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)
    predicted_class = torch.argmax(outputs.logits, dim=-1).item()
    return le_intent.inverse_transform([predicted_class])[0]

In [16]:
test_text = "what do i need to do to correct my shippign address"
test_category = "SHIPPING"  # Using the actual category name
predicted_intent = predict_intent(test_text, test_category)
print(f"Predicted intent: {predicted_intent}")


Predicted intent: change_shipping_address


# Customer Response model training --gpt-2****

In [None]:
pip install huggingface_hub


In [None]:
from huggingface_hub import notebook_login

notebook_login()

In [None]:
model.push_to_hub("customer_intent_classifier")

In [None]:
tokenizer.push_to_hub("customer_intent_classifier")

In [2]:
pip install assemblyai

Collecting assemblyai
  Downloading assemblyai-0.30.0-py3-none-any.whl.metadata (26 kB)
Downloading assemblyai-0.30.0-py3-none-any.whl (70 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m70.2/70.2 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: assemblyai
Successfully installed assemblyai-0.30.0
Note: you may need to restart the kernel to use updated packages.


***audio to text converter***

In [4]:

import assemblyai as aai

aai.settings.api_key = "4585e5f674ca4ab5996c1d007f7fc6d0"
transcriber = aai.Transcriber()

transcript = transcriber.transcribe("https://soundcloud.com/user-948661042/crazy-noisy-bizarre-town?in=user-276538827/sets/random-audio-clips&utm_source=clipboard&utm_medium=text&utm_campaign=social_sharing")


print(transcript.text)

None


In [None]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")

In [1]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForCausalLM, AdamW

In [None]:
tokenizer.pad_token = tokenizer.eos_token

In [None]:
df = pd.read_csv('/kaggle/input/dataset/train_dataset.csv')

In [None]:
df['input'] = df['instruction'] + " " + df['category'] + " " + df['intent']
df = df[['input', 'response']]

In [None]:
# Split the dataset into training and validation sets
train_df = df.sample(frac=0.9, random_state=42)
val_df = df.drop(train_df.index)

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [None]:
# Create a custom dataset class
class CustomDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.input = dataframe.input
        self.response = dataframe.response
        self.max_len = max_len

    def __len__(self):
        return len(self.input)

    def __getitem__(self, index):
        input_text = self.input.iloc[index]
        response_text = self.response.iloc[index]
        inputs = self.tokenizer.encode_plus(
            input_text,
            response_text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors="pt"
        )
        return {
            'input_ids': inputs.input_ids.flatten(),
            'attention_mask': inputs.attention_mask.flatten(),
            'labels': inputs.input_ids.flatten()  # Use input_ids as labels for causal LM
        }

In [None]:
MAX_LEN = 512
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 8
EPOCHS = 5
LEARNING_RATE = 2e-5

# Create DataLoader
train_dataset = CustomDataset(train_df, tokenizer, MAX_LEN)
val_dataset = CustomDataset(val_df, tokenizer, MAX_LEN)

train_loader = DataLoader(train_dataset, batch_size=TRAIN_BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=VALID_BATCH_SIZE)

# Optimizer
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)

In [None]:
for epoch in range(EPOCHS):
    model.train()
    for batch in train_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()


    model.eval()
    val_accuracy = 0
    for batch in val_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        val_accuracy += (predictions == batch['labels']).float().mean()

    val_accuracy /= len(val_loader)
    print(f"Epoch {epoch+1}/{EPOCHS}, Validation Accuracy: {val_accuracy:.4f}")

In [None]:
import matplotlib.pyplot as plt

train_losses = []
val_losses = []

for epoch in range(EPOCHS):
    model.train()
    total_train_loss = 0
    for batch in train_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        total_train_loss += loss.item()
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    avg_train_loss = total_train_loss / len(train_loader)
    train_losses.append(avg_train_loss)
    
    # Validation
    model.eval()
    total_val_loss = 0
    with torch.no_grad():
        for batch in val_loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss
            total_val_loss += loss.item()

    avg_val_loss = total_val_loss / len(val_loader)
    val_losses.append(avg_val_loss)

    print(f"Epoch {epoch+1}/{EPOCHS}, Training Loss: {avg_train_loss:.4f}, Validation Loss: {avg_val_loss:.4f}")

# Plotting the losses
plt.plot(range(EPOCHS), train_losses, label='Training Loss')
plt.plot(range(EPOCHS), val_losses, label='Validation Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.title('Training and Validation Loss')
plt.show()


# Uploading model to hugging Face Hub


In [None]:
model.push_to_hub("callcenter_response")

In [None]:
tokenizer.push_to_hub("callcenter_response")

In [4]:
instruction = "how to open a account"
category = "ACCOUNT"
intent = "create_account"


input_text = f"{instruction} {category} {intent} [SEP]"


inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True).to("cuda")


model.config.pad_token_id = model.config.eos_token_id


generated_ids = model.generate(
    inputs['input_ids'], 
    attention_mask=inputs['attention_mask'], 
    max_new_tokens=200,  
    do_sample=True, 
    top_k=50,  
    top_p=0.95,  
    temperature=0.7, 
    pad_token_id=model.config.eos_token_id  # Explicitly set pad_token_id
)


result = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

response = result.split('[SEP]')[-1].strip()
print(response)


indeed! I'm here to guide you through the process of opening a {{Account Type}} account. To get started, you can visit our website and click on the "Sign Up" or "Create Account" button. You'll be directed to a registration page where you'll need to provide your email address and password. Once you've filled in the required information, review your account settings, and then click on the "Create Account" button. This will allow you to select the {{Account Type}} account option from the available options. If you encounter any difficulties or have any questions along the way, please don't hesitate to reach out. We're always here to assist you and ensure a smooth account creation experience.


# text to audio converter


In [None]:
pip install gtts


In [None]:
from gtts import gTTS

# Your text
text = '''fantastic! I'm here to assist you with opening a {{Account Type}} account. To get started, please visit our website and look for the "Sign Up" or "Create Account" button. Click on it, and you'll be directed to a registration page. Fill in your personal information, such as your name, email address, and password, and choose the account type you'd like to open. Once you've completed the registration process, you'll start receiving our personalized sign-up form, which will provide you with the necessary information, such as your contact details and payment information. If you encounter any difficulties or have any questions along the way, don't hesitate to reach out to our customer support team. They are available {{Customer Support Hours}} at {{Customer Support Phone Number}} or through the Live Chat on our website at {{Website URL}}. We're here to ensure a smooth and hassle-free account creation process for you!"'''

# Create a gTTS object
tts = gTTS(text, lang='en')

# Save the audio file
tts.save("output1.mp3")
