In [None]:
import re
from datetime import datetime
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
import numpy as np
import emoji
import string

def preprocess_text(text):
    """Normalise a chat message for matching.

    Emoji are removed, every ASCII punctuation character other than ``.``
    and ``?`` is dropped, and runs of whitespace are collapsed to a single
    space with no leading or trailing whitespace.
    """
    # Translation table that deletes all punctuation we do not want to keep.
    unwanted = ''.join(mark for mark in string.punctuation if mark not in '.?')
    deletion_table = str.maketrans('', '', unwanted)

    without_emoji = emoji.replace_emoji(text, '')
    without_punctuation = without_emoji.translate(deletion_table)

    # split() with no argument discards leading/trailing whitespace and
    # collapses internal runs, so the joined result is already stripped.
    return ' '.join(without_punctuation.split())

# One exported message header: "<date>, <time>[ AM/PM] - <sender>: <text>".
_WHATSAPP_MESSAGE_RE = re.compile(
    r'(\d{1,2}/\d{1,2}/\d{2,4},\s\d{1,2}:\d{2}(?:\s?[AaPpMm]{2})?)\s-\s([^:]+):\s(.+)'
)


def _parse_whatsapp_lines(lines):
    """Group raw export lines into a list of message records.

    A line matching the message header pattern starts a new record; any other
    line is treated as a continuation of the record currently being built.
    Lines that appear before the first header are ignored.
    """
    messages = []
    current = None

    for raw_line in lines:
        line = raw_line.strip()
        match = _WHATSAPP_MESSAGE_RE.match(line)
        if match:
            # A new header closes the record that was being built.
            if current is not None:
                messages.append(current)
            timestamp, sender, text = match.groups()
            current = {
                'timestamp': timestamp,
                'sender': sender.strip(),
                'message': preprocess_text(text.strip()),
            }
        elif current is not None:
            continuation = preprocess_text(line)
            # Skip blank continuation lines so the stored message does not
            # pick up stray spaces that would prevent exact matching later.
            if continuation:
                current['message'] = (current['message'] + ' ' + continuation).strip()

    if current is not None:
        messages.append(current)

    return messages


def parse_whatsapp_txt(file_path):
    """Parse a WhatsApp chat export (.txt) into a DataFrame.

    Args:
        file_path: Path of the exported chat text file.

    Returns:
        pandas.DataFrame with the columns ``timestamp``, ``sender`` and
        ``message`` (one row per message, multi-line messages joined with a
        space). The columns are present even when no message was found.

    Raises:
        OSError: If the file cannot be opened.

    The file is read as UTF-8 (a leading byte-order mark is tolerated). If it
    is not valid UTF-8 the whole file is re-read as latin-1; results from the
    failed attempt are discarded so no message is returned twice.
    """
    try:
        # 'utf-8-sig' behaves like 'utf-8' but drops a leading BOM, which
        # would otherwise stop the first line from matching the pattern.
        with open(file_path, 'r', encoding='utf-8-sig') as file:
            messages = _parse_whatsapp_lines(file)
    except UnicodeDecodeError:
        # Start over from scratch with a permissive single-byte encoding.
        with open(file_path, 'r', encoding='latin-1') as file:
            messages = _parse_whatsapp_lines(file)

    return pd.DataFrame(messages, columns=['timestamp', 'sender', 'message'])

class WhatsAppChatbot:
    """Chatbot that imitates one participant of a parsed WhatsApp chat.

    Training records, for every message sent by the target user, the message
    that immediately preceded it (the "prompt") and the user's message (the
    "reply"). Responding tries, in order: an exact prompt match, the Naive
    Bayes prediction when it is confident enough, a substring / shared-word
    match against known prompts, and finally the most frequent reply.

    Attributes:
        vectorizer: TF-IDF vectorizer fitted on the prompts.
        classifier: Multinomial Naive Bayes model mapping prompts to replies.
        responses: dict mapping a lower-cased prompt to the list of distinct
            replies observed for it.
        common_responses: dict mapping a reply to how often it was observed.
        confidence_threshold: minimum predicted probability required before
            the classifier's reply is used.
    """

    # Returned when nothing learned so far can answer a message.
    DEFAULT_RESPONSE = "I'm not sure how to respond to that."

    def __init__(self, confidence_threshold=0.2):
        """Create an untrained chatbot.

        Args:
            confidence_threshold: Probability the classifier's best reply
                must exceed to be returned by ``respond``.
        """
        self.vectorizer = TfidfVectorizer(
            max_features=1000,
            stop_words='english',
            ngram_range=(1, 2)
        )
        self.classifier = MultinomialNB()
        self.responses = {}
        self.common_responses = {}
        self.confidence_threshold = confidence_threshold

    def train(self, chat_df, target_user):
        """Learn reply patterns of ``target_user`` from a parsed chat.

        Args:
            chat_df: DataFrame with ``sender`` and ``message`` columns, in
                chronological order. Frames with fewer than two rows are
                ignored (there is no prompt/reply pair to learn from).
            target_user: Sender name whose replies should be imitated.

        Pairs whose prompt or reply is empty (for example messages that
        consisted only of emoji before preprocessing) are skipped: an empty
        prompt would match every incoming message and an empty reply would
        make the bot answer with nothing.
        """
        if len(chat_df) < 2:
            return

        # Plain lists avoid the per-row overhead of DataFrame.iloc.
        messages = chat_df['message'].tolist()
        senders = chat_df['sender'].tolist()

        prompts = []
        replies = []

        for i in range(1, len(messages)):
            if senders[i] != target_user:
                continue

            # Normalise whitespace so keys compare equal to respond() input.
            prompt = ' '.join(messages[i - 1].lower().split())
            reply = messages[i].strip()
            if not prompt or not reply:
                continue

            prompts.append(prompt)
            replies.append(reply)

            # Distinct replies seen for this prompt, in first-seen order.
            known_replies = self.responses.setdefault(prompt, [])
            if reply not in known_replies:
                known_replies.append(reply)

            # Overall reply frequency, used as the last-resort answer.
            self.common_responses[reply] = self.common_responses.get(reply, 0) + 1

        if prompts:
            features = self.vectorizer.fit_transform(prompts)
            # Fit on the actual replies. Fitting on a constant label makes
            # predict_proba always return 1.0 and carries no information.
            self.classifier.fit(features, replies)

    def respond(self, message):
        """Generate a reply to ``message``.

        Args:
            message: Incoming text.

        Returns:
            A reply string learned during training, or a fixed fallback
            sentence when nothing has been learned.
        """
        message = preprocess_text(message.lower())

        # An empty query carries no information; skip straight to fallbacks.
        if message:
            # 1. Exact prompt seen during training.
            if message in self.responses:
                return np.random.choice(self.responses[message])

            # 2. Classifier prediction, if the model is fitted and confident.
            try:
                features = self.vectorizer.transform([message])
                probabilities = self.classifier.predict_proba(features)[0]
            except (ValueError, AttributeError):
                # Raised when the vectorizer/classifier has not been fitted.
                probabilities = None

            if probabilities is not None and len(probabilities):
                best = int(np.argmax(probabilities))
                if probabilities[best] > self.confidence_threshold:
                    return self.classifier.classes_[best]

            # 3. Loose match: substring either way, or any shared word.
            words = set(message.split())
            for trained_message, replies in self.responses.items():
                if not trained_message:
                    # '' is a substring of everything; never match on it.
                    continue
                if (message in trained_message
                        or trained_message in message
                        or words & set(trained_message.split())):
                    return np.random.choice(replies)

        # 4. Most frequent reply overall, or a fixed sentence if untrained.
        if self.common_responses:
            return max(self.common_responses.items(), key=lambda x: x[1])[0]
        return self.DEFAULT_RESPONSE

def create_chatbot_from_txt(file_path, target_user):
    """Build a WhatsAppChatbot trained on an exported WhatsApp chat.

    The chat file at ``file_path`` is parsed with ``parse_whatsapp_txt`` and
    the resulting frame is used to train a new bot on the messages sent by
    ``target_user``. The trained bot is returned.
    """
    bot = WhatsAppChatbot()
    bot.train(parse_whatsapp_txt(file_path), target_user)
    return bot

# Demo: train a bot on an exported chat and print its replies to a few prompts.
if __name__ == "__main__":
    # Point this at your own exported chat file.
    file_path = 'whatsapp_chat.txt'
    chatbot = create_chatbot_from_txt(file_path, "Rohan Sadaphule")

    # Send each sample prompt to the bot and show the exchange.
    test_messages = ["hii", "my new no.", "okay"]
    for message in test_messages:
        response = chatbot.respond(message)
        print(f"Message: {message}", f"Response: {response}\n", sep="\n")

Message: hii
Response: Aata Kay msg kartoy

Message: my new no.
Response: Okay

Message: okay
Response: download hotay thamb



In [None]:
!pip install emoji

Collecting emoji
  Downloading emoji-2.14.0-py3-none-any.whl.metadata (5.7 kB)
Downloading emoji-2.14.0-py3-none-any.whl (586 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/586.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m583.7/586.9 kB[0m [31m18.3 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m586.9/586.9 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: emoji
Successfully installed emoji-2.14.0


In [None]:
# Chat with the bot interactively; type 'exit' to stop.
while True:
    message = input("Enter your msg: ")
    # Check for the exit command first so it is not sent to the bot and
    # answered like an ordinary message.
    if message.strip().lower() == 'exit':
        break
    response = chatbot.respond(message)
    print(response)

Enter your msg: Hello
Kasa gela interview
Enter your msg: Konta interview
Cusrow Wadia
Enter your msg: Clg la yetoy ka

Enter your msg: Udya jaycha ka?
Lavkar sang ticket kadhaycha kaltay
Enter your msg: Exit
Thik aahe
