In [None]:
import re
from datetime import datetime
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import emoji
import string
import sys

In [None]:
def preprocess_text(text):
    """Normalise a raw chat string.

    Emoji are removed, every ASCII punctuation character other than ``.``
    and ``?`` is deleted, and runs of whitespace are collapsed to single
    spaces with no leading or trailing whitespace.

    Args:
        text: The raw message text.

    Returns:
        The cleaned text.
    """
    without_emoji = emoji.replace_emoji(text, '')
    # Punctuation to delete: everything in string.punctuation except '.' and '?'
    unwanted = ''.join(mark for mark in string.punctuation if mark not in '.?')
    filtered = without_emoji.translate(str.maketrans('', '', unwanted))
    # split() with no argument already discards leading/trailing whitespace
    tokens = filtered.split()
    return ' '.join(tokens)

In [None]:
def _parse_chat_lines(lines, pattern):
    """Convert an iterable of raw export lines into a list of message dicts.

    Args:
        lines: Iterable of text lines (e.g. an open text file).
        pattern: Compiled regex whose three groups are timestamp, sender
            and message text.

    Returns:
        A list of dicts with the keys 'timestamp', 'sender' and 'message'.
    """
    messages = []
    current_message = None  # message being assembled; None while nothing is open

    for line in lines:
        stripped = line.strip()
        match = pattern.match(stripped)
        if match:
            # A new timestamped header closes whatever message was in progress.
            if current_message is not None:
                messages.append(current_message)
            timestamp, sender, message = match.groups()
            if message.strip() == '<Media omitted>':
                # Media placeholders carry no text; leaving current_message
                # unset also discards any continuation lines that follow.
                current_message = None
                continue
            current_message = {
                'timestamp': timestamp,
                'sender': sender.strip(),
                'message': preprocess_text(message.strip()),
            }
        elif current_message is not None:
            # Line without a header: continuation of a multi-line message.
            continuation = preprocess_text(stripped)
            if continuation:
                # Skipping empty continuations keeps the stored text free of
                # doubled/trailing spaces, matching preprocess_text's output.
                current_message['message'] = (
                    current_message['message'] + ' ' + continuation
                ).strip()

    if current_message is not None:
        messages.append(current_message)
    return messages


def parse_whatsapp_txt(file_path):
    """Parse WhatsApp chat from a txt file into a structured format.

    The file is read as UTF-8 first; if decoding fails, parsing restarts from
    scratch with latin-1 so that no partially parsed messages are kept or
    duplicated. Both attempts apply the same rules, including skipping
    '<Media omitted>' placeholders.

    Args:
        file_path: Path to the exported chat text file.

    Returns:
        A pandas DataFrame with one row per message and the columns
        'timestamp', 'sender' and 'message'.

    Raises:
        SystemExit: With exit code 1 (after printing an error) when the file
            does not exist or contains no parsable messages.
    """
    pattern = re.compile(
        r'(\d{1,2}/\d{1,2}/\d{2,4},\s\d{1,2}:\d{2}(?:\s?[AaPpMm]{2})?)\s-\s([^:]+):\s(.+)'
    )
    messages = []

    try:
        for encoding in ('utf-8', 'latin-1'):
            try:
                with open(file_path, 'r', encoding=encoding) as file:
                    # The helper builds a fresh list on every attempt, so a
                    # decode error midway never leaks partial results.
                    messages = _parse_chat_lines(file, pattern)
                break
            except UnicodeDecodeError:
                # Fall through to the next encoding; latin-1 can decode any byte.
                continue
    except FileNotFoundError:
        print(f"Error: File '{file_path}' not found.")
        sys.exit(1)

    if not messages:
        print("Error: No valid messages found in the chat file.")
        sys.exit(1)

    return pd.DataFrame(messages)

In [None]:
class WhatsAppChatbot:
    """Retrieval-style chatbot that replays one sender's replies from a chat log.

    Training records, for every message sent by the target user, the
    (lower-cased) message that immediately preceded it. Responding looks the
    incoming text up among those preceding messages: first by exact match,
    then by TF-IDF cosine similarity, then by shared words, and finally falls
    back to the target user's most frequent reply.
    """

    def __init__(self, similarity_threshold=0.3):
        """Create an untrained chatbot.

        Args:
            similarity_threshold: Minimum cosine similarity (exclusive) for a
                trained prompt to count as a match in `respond`.
        """
        self.vectorizer = TfidfVectorizer(max_features=1000, stop_words='english', ngram_range=(1, 2))
        self.classifier = MultinomialNB()
        # prompt (lower-cased previous message) -> list of distinct replies
        self.responses = {}
        # reply -> number of times the target user sent it
        self.common_responses = {}
        # prompts in training order; row i of self.X is the vector of entry i
        self.trained_messages = []
        self.similarity_threshold = similarity_threshold
        # TF-IDF matrix of trained_messages; None until training succeeds
        self.X = None

    def train(self, chat_df, target_user):
        """Train the chatbot on WhatsApp chat data for a specific user.

        Args:
            chat_df: DataFrame with 'sender' and 'message' columns, in
                conversation order.
            target_user: Sender name whose replies the bot should imitate.

        Calling this more than once accumulates pairs; the TF-IDF matrix is
        refitted on every prompt seen so far so its rows stay aligned with
        `trained_messages`.
        """
        messages = chat_df['message'].tolist()
        senders = chat_df['sender'].tolist()
        new_pairs = 0

        # Pair each message with the one directly before it.
        for prev_raw, current_message, current_sender in zip(messages, messages[1:], senders[1:]):
            if current_sender != target_user:
                continue

            prev_message = prev_raw.lower()
            self.trained_messages.append(prev_message)
            new_pairs += 1

            replies = self.responses.setdefault(prev_message, [])
            if current_message not in replies:
                replies.append(current_message)

            self.common_responses[current_message] = self.common_responses.get(current_message, 0) + 1

        if not new_pairs:
            print(f"Warning: No messages from {target_user} found for training.")
            return

        try:
            self.X = self.vectorizer.fit_transform(self.trained_messages)
        except ValueError:
            # TfidfVectorizer raises ValueError when no terms survive
            # tokenisation/stop-word removal (empty vocabulary). Exact-match
            # and keyword lookups in respond() still work without the matrix.
            self.X = None
            print("Warning: Could not build a vocabulary from the chat; similarity matching is disabled.")
            return

        # Every label is 1, so the classifier only ever learns a single class;
        # respond() does not consult it.
        self.classifier.fit(self.X, [1] * len(self.trained_messages))

    def respond(self, message):
        """Generate a response to a given message.

        Args:
            message: The incoming text.

        Returns:
            A reply string chosen from the trained data, or a fixed fallback
            sentence when nothing suitable is available.
        """
        message = preprocess_text(message.lower())

        # Direct match
        if message in self.responses:
            return np.random.choice(self.responses[message])

        try:
            # Cosine similarity-based matching (only once a matrix exists)
            if self.X is not None:
                message_vector = self.vectorizer.transform([message])
                similarities = cosine_similarity(message_vector, self.X)[0]
                best_match_idx = int(np.argmax(similarities))

                if similarities[best_match_idx] > self.similarity_threshold:
                    best_match = self.trained_messages[best_match_idx]
                    return np.random.choice(self.responses[best_match])

            # Fallback to keyword matching: any whole word in common
            message_words = set(message.split())
            for trained_message, replies in self.responses.items():
                if not message_words.isdisjoint(trained_message.split()):
                    return np.random.choice(replies)

            # Default to most common response
            if self.common_responses:
                return max(self.common_responses, key=self.common_responses.get)
            return "I don’t have enough data to respond properly."

        except (ValueError, AttributeError):
            # Best-effort: a vectoriser/similarity failure yields a stock reply
            # rather than interrupting the chat.
            return "Hmm, I’m not sure what to say to that."

In [None]:
# Load the exported chat, let the user pick whose replies to imitate, and train.
# Surrounding whitespace is stripped from both answers: parsed sender names are
# already stripped, so an untrimmed name could never match one.
file_path = input("Enter the path to your WhatsApp chat file: ").strip()  # e.g., 'whatsapp_chat.txt'
chat_df = parse_whatsapp_txt(file_path)
print("Available senders:", chat_df['sender'].unique())
target_user = input("Enter the target user’s name: ").strip()
chatbot = WhatsAppChatbot()
chatbot.train(chat_df, target_user)
# train() prints its own warning when it finds nothing to learn from, so only
# report success when at least one prompt/reply pair was recorded.
if chatbot.responses:
    print("Chatbot trained successfully!")

In [None]:
# Interactive chat loop: relay each typed line to the bot until the user quits.
print("Start chatting! Type 'exit' to quit.")
while True:
    try:
        message = input("You: ")
    except (EOFError, KeyboardInterrupt):
        # Closed stdin or an interrupted kernel ends the chat cleanly instead
        # of leaving a traceback in the cell output.
        print("\nBot: Goodbye!")
        break
    # Ignore surrounding whitespace so "exit " or " Exit" still quits.
    if message.strip().lower() == 'exit':
        print("Bot: Goodbye!")
        break
    response = chatbot.respond(message)
    print(f"{target_user}(Bot): {response}")