<a href="https://colab.research.google.com/github/omnimanwani/DesignProject/blob/main/intentPrompt.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import json
import pandas as pd
import logging
from tqdm import tqdm
import re
from google.colab import files

In [2]:
# Configure root logging for the notebook and create the module-level logger.
# basicConfig() does nothing when the root logger already has handlers (which
# is typical in hosted notebook kernels), so the INFO level and format would be
# silently ignored. force=True (Python 3.8+) removes existing root handlers
# first so this configuration always takes effect, including on cell re-runs.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    force=True,
)
logger = logging.getLogger(__name__)

In [3]:
class RedialProcessor:
    """Rule-based intent and entity annotator for dialogue messages.

    Intents are scored by counting keyword phrases per category; entities are
    pulled out with regular expressions. ``process_dataset`` applies both to
    every message of a JSON Lines file and writes the result as a CSV.
    """

    # Token that clean_text() substitutes for "@<digits>" movie references.
    MOVIE_PLACEHOLDER = 'MOVIE_MENTION'

    def __init__(self):
        """Initialize the processor with intent categories and entity patterns."""
        # Keyword phrases per intent. Matching is case-insensitive (see
        # identify_intent), so the casing used here is purely for readability.
        self.intent_categories = {
            "movie recommendation": [
                "can you recommend", "what movies", "suggest", "looking for",
                "what should I watch", "similar to", "like movies about"
            ],
            "asking for preferences": [
                "do you like", "have you seen", "what do you think about",
                "are you into", "what's your favorite", "what kind of movies"
            ],
            "providing opinion": [
                "I think", "I like", "I love", "I enjoyed", "I didn't like",
                "it was", "in my opinion", "I felt"
            ],
            "requesting information": [
                "what is", "who stars in", "when was", "tell me about",
                "how long is", "where can I watch"
            ],
            "acknowledgment": [
                "okay", "thanks", "thank you", "got it", "I see",
                "that's helpful", "good to know"
            ],
            "greeting": [
                "hi", "hello", "hey", "good morning", "good evening",
                "how are you"
            ],
            "farewell": [
                "bye", "goodbye", "see you", "take care", "thanks again",
                "have a good"
            ],
            "chitchat": [
                "how's your day", "what's up", "nice weather",
                "that's interesting", "really"
            ]
        }

        # Regular expressions per entity type; every match of every pattern is
        # reported, so overlapping patterns can yield overlapping entities.
        self.entity_patterns = {
            'PERSON': [
                r'(?:Mr\.|Mrs\.|Ms\.|Dr\.|Prof\.) [A-Z][a-z]+(?:\s[A-Z][a-z]+)+',
                r'[A-Z][a-z]+(?:\s[A-Z][a-z]+)+',
                r'(?:actor|actress|director|producer)\s+[A-Z][a-z]+(?:\s[A-Z][a-z]+)+'
            ],
            'WORK_OF_ART': [
                r'(?:The\s|A\s|An\s)?[A-Z][^.!?]*?(?:movie|film|series|show)',
                r'(?:The\s|A\s|An\s)?[A-Z][^.!?]*?\(\d{4}\)',
                r'"[^"]+"|\'[^\']+\'',
                r'[A-Z][a-z0-9]+(?:\s+[A-Z][a-z0-9]+)*:\s[A-Z][a-z0-9]+(?:\s+[A-Z][a-z0-9]+)*'
            ],
            'ORG': [
                r'(?:Disney|Netflix|Amazon|HBO|Universal|Paramount|MGM|Sony|Warner Bros\.|Marvel|Pixar)',
                r'[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\s+(?:Studios|Pictures|Entertainment|Productions|Films)',
                r'[A-Z]{2,}(?:\s+[A-Z][a-z]+)*'
            ]
        }

    def clean_text(self, text):
        """Clean and preprocess the message text.

        Replaces every ``@<digits>`` reference with ``MOVIE_PLACEHOLDER`` and
        collapses all runs of whitespace (including leading/trailing) to
        single spaces.
        """
        text = re.sub(r'@\d+', self.MOVIE_PLACEHOLDER, text)
        text = ' '.join(text.split())
        return text

    @staticmethod
    def _contains_phrase(lowered_text, phrase):
        """Return True if ``phrase`` occurs in ``lowered_text`` as whole words.

        The phrase is lower-cased before matching, and the lookarounds stop
        short keywords from firing inside longer words (e.g. "hi" in "this").
        """
        pattern = r'(?<!\w)' + re.escape(phrase.lower()) + r'(?!\w)'
        return re.search(pattern, lowered_text) is not None

    def identify_intent(self, text):
        """Identify intent based on keyword matching and patterns.

        Returns:
            (intent, confidence): ``confidence`` is the fraction of the
            winning category's phrases found in ``text``. When nothing matches
            the result is ``("chitchat", 0)``. Ties keep the category that
            appears first in ``self.intent_categories``.
        """
        lowered = text.lower()
        max_confidence = 0
        identified_intent = "chitchat"

        for intent, patterns in self.intent_categories.items():
            # Compare lower-cased text against lower-cased phrases; comparing
            # against the mixed-case phrases would never match e.g. "I think".
            matches = sum(
                1 for pattern in patterns
                if self._contains_phrase(lowered, pattern)
            )
            if matches > 0:
                confidence = matches / len(patterns)
                if confidence > max_confidence:
                    max_confidence = confidence
                    identified_intent = intent

        return identified_intent, max_confidence

    def extract_entities(self, text):
        """Extract entities using regex patterns.

        Returns a list of ``(matched_text, entity_type)`` tuples, ordered by
        entity type, then pattern, then position in ``text``. Matches that lie
        entirely inside a ``MOVIE_PLACEHOLDER`` token are dropped: they are
        artifacts of clean_text() (the all-caps ORG pattern would otherwise
        report "MOVIE" and "MENTION"), not entities from the message.
        """
        placeholder_spans = [
            found.span()
            for found in re.finditer(re.escape(self.MOVIE_PLACEHOLDER), text)
        ]
        entities = []

        for entity_type, patterns in self.entity_patterns.items():
            for pattern in patterns:
                for match in re.finditer(pattern, text):
                    start, end = match.span()
                    if any(lo <= start and end <= hi for lo, hi in placeholder_spans):
                        continue
                    entities.append((match.group(), entity_type))

        return entities

    def extract_entities_and_intents(self, text):
        """Extract entities and classify intent from text.

        Returns a dict with keys ``'entities'``, ``'intent'`` and
        ``'confidence'``. Any exception is logged and converted into the
        fallback ``{'entities': [], 'intent': 'unknown', 'confidence': 0.0}``
        so one bad message does not abort a whole run.
        """
        try:
            cleaned_text = self.clean_text(text)
            entities = self.extract_entities(cleaned_text)
            intent, confidence = self.identify_intent(cleaned_text)

            return {
                'entities': entities,
                'intent': intent,
                'confidence': confidence
            }

        except Exception as e:
            logger.error(f"Error processing text: '{text}'. Error: {str(e)}")
            return {'entities': [], 'intent': 'unknown', 'confidence': 0.0}

    def process_dataset(self, input_path, output_filename):
        """Process the ReDial dataset and create augmented CSV.

        Reads ``input_path`` as JSON Lines (one dialogue object per line),
        annotates every message, writes one CSV row per message to
        ``output_filename`` and triggers ``files.download`` on it.

        Each row holds the conversation/message ids, sender role, original and
        cleaned text, intent and confidence, plus ``entity_<n>_text`` /
        ``entity_<n>_type`` columns for each extracted entity.

        Raises:
            Exception: any error is logged and re-raised unchanged.
        """
        try:
            data = []

            # Read as UTF-8 explicitly rather than relying on the platform's
            # default encoding, which can mis-decode non-ASCII message text.
            with open(input_path, 'r', encoding='utf-8') as file:
                # First pass only counts lines so tqdm can show a total.
                num_lines = sum(1 for _ in file)
                file.seek(0)

                for line in tqdm(file, total=num_lines, desc="Processing dialogues"):
                    # Blank lines (e.g. a trailing newline) are not JSON.
                    if not line.strip():
                        continue
                    dialogue = json.loads(line)

                    for message in dialogue['messages']:
                        text = message['text']
                        result = self.extract_entities_and_intents(text)

                        message_data = {
                            'conversation_id': dialogue['conversationId'],
                            'message_id': message['messageId'],
                            'sender_type': 'initiator' if message['senderWorkerId'] == dialogue['initiatorWorkerId'] else 'respondent',
                            'original_text': text,
                            'processed_text': self.clean_text(text),
                            'intent': result['intent'],
                            'intent_confidence': result['confidence']
                        }

                        # Entities become numbered column pairs; messages with
                        # fewer entities get NaN in the unused columns.
                        for idx, (entity_text, entity_type) in enumerate(result['entities']):
                            message_data[f'entity_{idx+1}_text'] = entity_text
                            message_data[f'entity_{idx+1}_type'] = entity_type

                        data.append(message_data)

            df = pd.DataFrame(data)

            df.to_csv(output_filename, index=False)
            logger.info(f"Successfully processed dataset and saved to {output_filename}")
            files.download(output_filename)

        except Exception as e:
            logger.error(f"Error processing dataset: {str(e)}")
            raise

In [6]:
def main(input_path='/content/test_data.jsonl', output_filename='augmented_redial_data.csv'):
    """Main execution function.

    Builds a ``RedialProcessor`` and runs it over one dataset file.

    Args:
        input_path: Path of the JSON Lines dialogue file to read. Defaults to
            the location used in the original Colab session.
        output_filename: Path of the CSV file to write.

    Raises:
        Exception: any failure is logged and re-raised so the cell still
            shows the traceback.
    """
    try:
        processor = RedialProcessor()
        processor.process_dataset(input_path, output_filename)

        logger.info("Processing completed successfully!")

    except Exception as e:
        logger.error(f"Error in main execution: {str(e)}")
        raise


if __name__ == "__main__":
    main()


Processing dialogues: 100%|██████████| 1342/1342 [00:03<00:00, 381.59it/s]


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>