In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found
for dirname, _, filenames in os.walk('/kaggle/input'):
    for path in (os.path.join(dirname, filename) for filename in filenames):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Upgrade pip itself before installing the pinned packages below.
%pip install --upgrade pip
# PyTorch stack, pinned to exact versions; pip's own version check is disabled and output is quiet.
%pip install --disable-pip-version-check \
    torch==1.13.1 \
    torchdata==0.5.1 --quiet

# NLP / fine-tuning libraries, each pinned to an exact version so the environment is reproducible.
%pip install \
    transformers==4.27.2 \
    datasets==2.11.0 \
    evaluate==0.4.0 \
    rouge_score==0.1.2 \
    loralib==0.1.1 \
    peft==0.3.0 --quiet

In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, GenerationConfig, TrainingArguments, Trainer
import torch
import time
import pandas as pd
import numpy as np
import string
from nltk.corpus import stopwords
import pandas as pd 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfTransformer,TfidfVectorizer
from sklearn.pipeline import Pipeline


Importing the dataset

In [None]:
# The dialog file is tab-separated with no header row, so the column names are supplied here
columns = ['question', 'answer']
df = pd.read_csv(
    '/kaggle/input/simple-dialogs-for-chatbot/dialogs.txt',
    sep='\t',
    header=None,
    names=columns,
)

In [None]:

# Preview the first ten question/answer pairs
df.head(n=10)

EDA
Check for missing values.
Check the data types and get an overview of the data.

In [None]:
# Column dtypes first, then the number of missing values in each column
print(df.dtypes)
print(df.isna().sum())

Convert the object columns to the pandas string dtype

In [None]:
# Cast both text columns to the pandas "string" dtype in one call, then confirm the new dtypes
df = df.astype({'question': "string", 'answer': "string"})
print(df.dtypes)


Word frequency analysis

In [None]:
import re
from collections import Counter
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
import nltk

# Make sure the NLTK stopword corpus is available locally
nltk.download('stopwords')

# English stopwords as a set, used for membership tests during text cleaning
english_stopwords = stopwords.words('english')
stop_words = set(english_stopwords)

In [None]:
def preprocess_text(text):
    """Turn raw text into a list of lower-cased, non-stopword tokens.

    Every run of non-word characters is replaced by a single space before the
    text is split on whitespace; tokens found in the module-level
    ``stop_words`` collection are discarded.

    Args:
        text: The string to tokenize.

    Returns:
        list[str]: The remaining tokens, in their original order.
    """
    normalized = re.sub(r'\W+', ' ', text.lower())
    kept = []
    for token in normalized.split():
        if token in stop_words:
            continue  # drop stopwords
        kept.append(token)
    return kept


In [None]:
# Flatten the per-row token lists into one flat list per column.
# A comprehension is linear in the number of tokens and always yields a list, whereas
# Series.sum() re-copies the growing list for every row and returns the integer 0 for
# an empty Series (which the Counter built from these lists cannot consume).
all_questions = [
    token
    for tokens in df['question'].dropna().apply(preprocess_text)
    for token in tokens
]
all_answers = [
    token
    for tokens in df['answer'].dropna().apply(preprocess_text)
    for token in tokens
]


In [None]:
# Tally how often each token occurs on the question side and on the answer side
question_freq, answer_freq = Counter(all_questions), Counter(all_answers)

# Report the ten most frequent tokens for each side
print("Most Common Words in Questions:", question_freq.most_common(10))
print("Most Common Words in Answers:", answer_freq.most_common(10))


In [None]:
def plot_wordcloud(word_freq, title):
    """Render a word cloud from a token -> count mapping and display it.

    Args:
        word_freq: Mapping of word to frequency (for example a ``Counter``).
        title: Text shown above the figure.
    """
    cloud = WordCloud(width=800, height=400, background_color='white')
    cloud = cloud.generate_from_frequencies(word_freq)

    # The axes are hidden because the image carries no meaningful coordinates
    _, ax = plt.subplots(figsize=(10, 5))
    ax.imshow(cloud, interpolation="bilinear")
    ax.axis("off")
    ax.set_title(title)
    plt.show()

# One word cloud per side of the dialogue
plot_wordcloud(word_freq=question_freq, title="Most Common Words in Questions")
plot_wordcloud(word_freq=answer_freq, title="Most Common Words in Answers")


Using spaCy for lemmatization

In [None]:
!pip install spacy
import spacy


In [None]:
!python -m spacy download en_core_web_sm
nlp = spacy.load("en_core_web_sm")


In [None]:
def clean_text(text):
    """Normalise a piece of text and return its lemmatised, stopword-free form.

    The text is lower-cased, runs of non-word characters become a space and
    digits are removed. Whitespace is then collapsed and trimmed before the
    text reaches spaCy: stripping punctuation and digits leaves leading and
    doubled spaces, which the pipeline would otherwise emit as whitespace
    tokens that end up as stray blanks in the result.

    Args:
        text: The string to clean.

    Returns:
        str: The lemmas of all tokens not present in the module-level
        ``stop_words`` collection, joined by single spaces.
    """
    text = text.lower()  # Lowercase
    text = re.sub(r'\W+', ' ', text)  # Remove special characters
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = " ".join(text.split())  # Collapse/trim the whitespace left behind by the removals
    doc = nlp(text)  # Process text with spaCy
    words = [token.lemma_ for token in doc if token.text not in stop_words]
    return " ".join(words)

# Add lemmatised copies of both text columns; rows with a missing value are skipped
# by dropna() and therefore stay missing in the new columns
df = df.assign(
    clean_question=df['question'].dropna().apply(clean_text),
    clean_answer=df['answer'].dropna().apply(clean_text),
)


Convert the cleaned questions to sentence embeddings

In [None]:
from sentence_transformers import SentenceTransformer

# Sentence-embedding model used to vectorise the cleaned questions
model = SentenceTransformer("all-MiniLM-L6-v2")

# `clean_question` is missing wherever the raw question was missing (it was built after a
# dropna()), and the encoder expects strings, so missing entries are encoded as empty text.
# Filling instead of dropping keeps exactly one embedding per DataFrame row.
question_embeddings = model.encode(df['clean_question'].fillna("").tolist())


Retrieving the top-matching answers with FAISS and generating a response with an LLM.

In [None]:
!pip install faiss-cpu sentence-transformers transformers


In [None]:
import faiss
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from transformers import pipeline

# Sentence-embedding model that maps questions to vectors for similarity search
embedder = SentenceTransformer(model_name_or_path="all-MiniLM-L6-v2")


In [None]:


# Encode all questions into embeddings.
# Missing questions are encoded as empty text rather than dropped, so that row i of the
# index keeps pointing at row i of `df` (answers are later looked up positionally via .iloc).
question_embeddings = embedder.encode(df['question'].fillna("").tolist())

# FAISS operates on C-contiguous float32 matrices; make that explicit instead of relying
# on the encoder's output type.
question_embeddings = np.ascontiguousarray(question_embeddings, dtype=np.float32)

# Build an exact (brute-force) L2-distance index over the question vectors
dimension = question_embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(question_embeddings)


In [None]:
import torch
from transformers import pipeline

# Load the Falcon-7B-Instruct model for text generation.
# On GPU the weights are loaded in half precision: 7B parameters take roughly 28 GB in
# float32, which does not fit on a single 16 GB accelerator. On CPU float32 is kept.
llm = pipeline(
    "text-generation",
    model="tiiuae/falcon-7b-instruct",
    device=0 if torch.cuda.is_available() else -1,
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
)
def get_similar_answer(user_question, k=3):
    """Answer a question by retrieving similar stored answers and letting the LLM respond.

    The question is embedded, the FAISS index is searched for the ``k`` nearest
    stored questions, and the answers at those row positions are placed in a
    prompt for the text-generation pipeline.

    Args:
        user_question: The question typed by the user.
        k: Number of nearest neighbours to retrieve.

    Returns:
        str: The text generated by the LLM (without the prompt).
    """
    # Convert user question to an embedding
    user_embedding = embedder.encode([user_question])

    # Search FAISS index for top K similar questions
    _, idx = index.search(np.asarray(user_embedding, dtype="float32"), k)

    # FAISS pads the result with -1 when fewer than k neighbours exist; .iloc[-1] would
    # silently return the last row, so those placeholders are skipped.
    top_answers = [df['answer'].iloc[i] for i in idx[0] if i >= 0]

    # Combine retrieved answers for LLM input
    prompt = f"User Question: {user_question}\nRetrieved Answers: {' '.join(top_answers)}\nProvide a helpful response based on the retrieved answers."

    # Generate a response using LLM.
    # max_new_tokens bounds only the generated part (max_length would also count the
    # prompt and can leave no room to generate); return_full_text=False keeps the prompt
    # out of the returned response.
    response = llm(
        prompt,
        max_new_tokens=100,
        num_return_sequences=1,
        return_full_text=False,
    )[0]["generated_text"]

    return response

# Smoke-test the retrieval + generation chatbot with a sample question
user_input = "What is machine learning?"
print(get_similar_answer(user_question=user_input))


Fine-tuning with LoRA

In [None]:
pip install peft transformers datasets


In [None]:
import torch
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
from peft import LoraConfig, get_peft_model
from datasets import Dataset
from transformers import Trainer, TrainingArguments

# Checkpoint to fine-tune: Falcon 7B (instruct variant)
model_name = "tiiuae/falcon-7b-instruct"

# Causal-LM weights and the matching tokenizer come from the same checkpoint
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# LoRA Configuration
config = LoraConfig(
    r=16,  # rank of the low-rank update matrices (hyperparameter, adjust as needed)
    lora_alpha=32,  # scaling factor applied to the LoRA update
    # Falcon fuses the query/key/value projections into a single linear layer named
    # "query_key_value"; it has no "q_proj"/"v_proj" modules, so targeting those makes
    # get_peft_model fail because no target module is found.
    target_modules=["query_key_value"],
    lora_dropout=0.1,  # dropout applied on the LoRA branch
    task_type="CAUSAL_LM",  # wrap the model as a causal language model
)

# Apply LoRA to the model
model = get_peft_model(model, config)

# Collect the question and answer columns as plain Python lists keyed by column name
data = {column: df[column].tolist() for column in ('question', 'answer')}

# Wrap the lists in a Hugging Face dataset
dataset = Dataset.from_dict(data)

# Tokenize the dataset
def tokenize_data(example):
    """Build "Question/Answer" prompts and tokenize them for causal-LM training.

    Works for a single row (string fields) as well as for a batch as passed by
    ``Dataset.map(..., batched=True)`` (list fields): in batched mode every
    question is paired with its own answer instead of formatting the whole
    lists into one prompt.

    Args:
        example: Mapping with ``'question'`` and ``'answer'`` entries, each
            either a string or a list of strings.

    Returns:
        The tokenizer output padded/truncated to 512 tokens, plus a ``labels``
        entry that copies ``input_ids`` with padded positions set to -100 so
        they are ignored by the loss.
    """
    # Padding requires a pad token; fall back to EOS when the tokenizer defines none
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    questions, answers = example['question'], example['answer']
    is_batch = isinstance(questions, (list, tuple))
    if not is_batch:
        questions, answers = [questions], [answers]

    prompts = [f"Question: {q}\nAnswer: {a}" for q, a in zip(questions, answers)]
    encoded = tokenizer(prompts, truncation=True, padding="max_length", max_length=512)

    # Without labels the model returns no loss and training cannot start
    encoded["labels"] = [
        [token_id if keep else -100 for token_id, keep in zip(ids, mask)]
        for ids, mask in zip(encoded["input_ids"], encoded["attention_mask"])
    ]

    if not is_batch:
        # Unwrap the one-element batch so a single row yields flat lists
        return {key: value[0] for key, value in encoded.items()}
    return encoded

# Run the tokenizer over the whole dataset, one batch of rows per call
tokenized_dataset = dataset.map(function=tokenize_data, batched=True)

# Training configuration: three epochs, evaluation after every epoch,
# checkpoints and logs written below ./results
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    evaluation_strategy="epoch",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    learning_rate=2e-5,
    weight_decay=0.01,
)

# Bind the model, its training configuration and the data into a Trainer.
# The same tokenized dataset is used for training and evaluation; pass a
# separate validation split as eval_dataset when one is available.
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=tokenized_dataset,
    train_dataset=tokenized_dataset,
)

# Fine-tune the model
trainer.train()

# Save the fine-tuned LoRA adapter together with the tokenizer.
# save_pretrained on the PEFT-wrapped model writes the adapter weights and their config.
model.save_pretrained("./fine_tuned_model")
tokenizer.save_pretrained("./fine_tuned_model")

# Use the trained in-memory model for inference instead of reloading it from disk:
# the saved directory holds only the adapter, so loading it as a standalone causal LM
# depends on the installed transformers version, and a reload would also bring a second
# copy of the 7B base weights into memory. Merging folds the LoRA update into the base
# weights and returns the plain base-model class.
model = model.merge_and_unload()
model.eval()

# LLM pipeline for text generation after fine-tuning
llm = pipeline("text-generation", model=model, tokenizer=tokenizer, device=0 if torch.cuda.is_available() else -1)

def get_similar_answer(user_question, k=3):
    """Answer a question by retrieving similar stored answers and letting the fine-tuned LLM respond.

    The question is embedded, the FAISS index is searched for the ``k`` nearest
    stored questions, and the answers at those row positions are placed in a
    prompt for the text-generation pipeline.

    Args:
        user_question: The question typed by the user.
        k: Number of nearest neighbours to retrieve.

    Returns:
        str: The text generated by the LLM (without the prompt).
    """
    # Convert user question to an embedding
    user_embedding = embedder.encode([user_question])

    # Search FAISS index for top K similar questions
    _, idx = index.search(np.asarray(user_embedding, dtype="float32"), k)

    # FAISS pads the result with -1 when fewer than k neighbours exist; .iloc[-1] would
    # silently return the last row, so those placeholders are skipped.
    top_answers = [df['answer'].iloc[i] for i in idx[0] if i >= 0]

    # Combine retrieved answers for LLM input
    prompt = f"User Question: {user_question}\nRetrieved Answers: {' '.join(top_answers)}\nProvide a helpful response based on the retrieved answers."

    # Generate a response using fine-tuned LLM.
    # max_new_tokens bounds only the generated part (max_length would also count the
    # prompt and can leave no room to generate); return_full_text=False keeps the prompt
    # out of the returned response.
    response = llm(
        prompt,
        max_new_tokens=100,
        num_return_sequences=1,
        return_full_text=False,
    )[0]["generated_text"]

    return response

# Smoke-test the chatbot again, now backed by the fine-tuned model
user_input = "What is machine learning?"
print(get_similar_answer(user_question=user_input))
