In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from indicnlp.tokenize import indic_tokenize
from indicnlp.normalize.indic_normalize import IndicNormalizerFactory
import re
from collections import defaultdict
from nltk.tokenize import sent_tokenize
from typing import Dict, List, Any

In [2]:
# Sample conversation
conversation = """
RA: नमस्ते श्री कुमार, मैं एक्स वाई जेड फाइनेंस से बोल रहा हूं । आपके लोन के बारे में बात करनी थी ।
RA: सर, आपका पिछले महीने का EMI अभी तक नहीं आया है । क्या कोई समस्या है?
RA: ओह , यह तो बरुा हुआ। लेकिन सर , आपको समझना होगा कि लोन का भगुतान समय पर करना बहुत जरूरी है ।
RA: हम समझते हैं आपकी स्थिति । क्या आप अगले हफ्ते तक कुछ भगुतान कर सकते है ?
RA: ठीक है , आधा भगुतान अगले हफ्ते तक कर दीजिए । बाकी का क्या प्लान है आपका ?
RA: ठीक है । तो हम ऐसा करते हैं आप अगले हफ्ते तक आधा EMI जमा कर दीजिए, और अगले महीने के 15 तारीख तक बाकी का भगुतान कर दीजि ए। क्या यह आपको स्वीकार है ?
RA: बहुत अच्छा । मैं आपको एक SMS भेज रहा हूं जिसमें भगुतान की डिटेल्स होंगी । कृपया इसका पालन करें और समय पर भगुतान करें ।
RA: आपका स्वागत है । अगर कोई और सवाल हो तो मुझे बताइएगा। अलविदा ।
B: हां, बोलिए। क्या बात है?
B: हां, थोड़ी दिक्कत है। मेरी नौकरी चली गई है और मैं नया काम ढूंढ रहा हूं।
B: मैं समझता हूं, लेकिन अभी मेरे पसै पैसे नहीं हैं। क्या कुछ समय मिल सकता है?
B: मैं कोशिश करूंगा, लेकिन परूा EMI नहीं दे पाऊंगा। क्या आधा भगुतान चलेगा?
B: मझु उम्मीद है कि अगले महीने तक मझु नया काम मिल जाएगा। तब मैं बाकी बकाया चकु दंगू।
B: हां, यह ठीक रहेगा। मैं इस प्लान का पालन करनेकी परूी कोशि  करूंगा।
B: ठीक है, धन्यवाद आपके समझने के लिए।
B: अलविदा।
"""

In [3]:

additional_stops = {
    "मैं", "हम", "आप", "यह", "वह", "मुझु", "मेरी", "आपके", 
    "है", "हूँ", "था", "हैं", "होगी", "कर सकते", 
    "और", "लेकिन", "या", "तो", "कि", 
    "में", "पर", "तक", "का", "की", "से", "के", 
    "यदि", "फिर", "जब", "ही", "जो", "अब", "क्या","हां","हूं",
    ",", "।", "?"
}

categories = {
    'payment': {'EMI', 'भुगतान', 'पैसे', 'रुपये', 'जमा'},
    'deadline': {'तारीख', 'समय', 'दिन', 'महीने', 'हफ्ते'},
    'problem': {'समस्या', 'दिक्कत', 'परेशानी', 'मुश्किल'},
    'agreement': {'ठीक', 'सहमत', 'स्वीकार', 'मंजूर'}
}
sentiment_lexicon = {
    'positive': {
        'ठीक', 'अच्छा', 'बढ़िया', 'सहमत', 'धन्यवाद', 'स्वीकार',
        'मदद', 'कृपया', 'शुक्रिया'
    },
    'negative': {
        'समस्या', 'दिक्कत', 'परेशानी', 'मुश्किल', 'नहीं', 'बकाया',
        'देरी', 'गलत', 'माफ़'
    },
    'neutral': {
        'है', 'हूं', 'था', 'थी', 'और', 'या', 'में', 'पर'
    }
}

In [4]:
# Initialize existing components
factory = IndicNormalizerFactory()
normalizer = factory.get_normalizer("hi")

In [5]:
def custom_tokenizer( text: str) -> List[str]:
    normalized_text = normalizer.normalize(text)
    tokens = indic_tokenize.trivial_tokenize(normalized_text)
    return [
        word for word in tokens 
        if word not in additional_stops
    ]

In [6]:
tfidf = TfidfVectorizer(
    tokenizer=custom_tokenizer,
    ngram_range=(1, 2),
    max_features=100
)

In [7]:
def preprocess_text( text):
    normalized_text = normalizer.normalize(text)
    tokens = custom_tokenizer(normalized_text)
    return " ".join(tokens)

In [8]:
def extract_dialogues( text):
    dialogues = []
    lines = text.strip().split('\n')
    
    for line in lines:
        line = line.strip()
        if line.startswith('RA:'):
            speaker = 'Recovery_Agent'
            content = line[3:].strip()
        elif line.startswith('B:'):
            speaker = 'Borrower'
            content = line[2:].strip()
        else:
            continue
        processed_content = preprocess_text(content)
        
        dialogues.append({
            'speaker': speaker,
            'original_content': content,
            'processed_content': processed_content
        })
    
    return pd.DataFrame(dialogues)

In [9]:
def analyze_sentiment(text):
    """Analyze sentiment of Hindi text."""
    tokens = custom_tokenizer(text)
    
    # Count sentiment words
    sentiment_counts = {
        'positive': sum(1 for token in tokens if token in sentiment_lexicon['positive']),
        'negative': sum(1 for token in tokens if token in sentiment_lexicon['negative']),
        'neutral': sum(1 for token in tokens if token in sentiment_lexicon['neutral'])
    }
    
    total_sentiment_words = sum(sentiment_counts.values())
    if total_sentiment_words == 0:
        sentiment_score = 0
    else:
        sentiment_score = (sentiment_counts['positive'] - sentiment_counts['negative']) / total_sentiment_words
    
    # Determine sentiment label
    if sentiment_score > 0.1:
        sentiment = 'positive'
    elif sentiment_score < -0.1:
        sentiment = 'negative'
    else:
        sentiment = 'neutral'
        
    return {
        'sentiment': sentiment,
        'score': sentiment_score,
    }

In [10]:
def generate_summary(dialogue_df: pd.DataFrame):
    total_turns = len(dialogue_df)
    speaker_counts = dialogue_df['speaker'].value_counts()
    
    _, importance_scores = analyze_with_tfidf(dialogue_df['processed_content'])
    top_topics = importance_scores.head(3)['term'].tolist()
    
    sentiment_by_speaker = {}
    for speaker in dialogue_df['speaker'].unique():
        speaker_content = dialogue_df[dialogue_df['speaker'] == speaker]['original_content']
        speaker_sentiments = [analyze_sentiment(content)['score'] for content in speaker_content]
        avg_sentiment = sum(speaker_sentiments) / len(speaker_sentiments)
        
        if avg_sentiment > 0.2:
            sentiment_by_speaker[speaker] = "Positive"
        elif avg_sentiment < -0.2:
            sentiment_by_speaker[speaker] = "Negative"
        else:
            sentiment_by_speaker[speaker] = "Neutral"

    summary_points = []
    for idx, row in dialogue_df.iterrows():
        sentiment = analyze_sentiment(row['original_content'])
        if sentiment['score'] > 0.2 or sentiment['score'] < -0.2:
            summary_points.append({
                'speaker': row['speaker'],
                'content': row['original_content'],
                'sentiment': sentiment['sentiment']
            })
    
    return {
        'statistics': {
            'total_turns': total_turns,
            'speaker_distribution': speaker_counts.to_dict(),
            'main_topics': top_topics
        },
        'sentiment_by_speaker': sentiment_by_speaker,
        'key_points': summary_points[:5]
    }


In [11]:
def analyze_with_tfidf( texts):
    tfidf_matrix = tfidf.fit_transform(texts)
    
    feature_names = tfidf.get_feature_names_out()
    
    tfidf_df = pd.DataFrame(
        tfidf_matrix.toarray(),
        columns=feature_names
    )
    importance_scores = pd.DataFrame({
        'term': feature_names,
        'score': tfidf_matrix.sum(axis=0).A1
    })
    importance_scores = importance_scores.sort_values('score', ascending=False)
    return tfidf_df, importance_scores

In [12]:
def categorize_terms( importance_scores):
    categorized_terms = defaultdict(list)
    
    for _, row in importance_scores.iterrows():
        term = row['term']
        score = row['score']
    
        for category, keywords in categories.items():
            if any(keyword in term for keyword in keywords):
                categorized_terms[category].append({
                    'term': term,
                    'score': score
                })
                break
        
    return categorized_terms

In [13]:
def extract_key_phrases( importance_scores, threshold=0.1):
    key_phrases = []
    
    for _, row in importance_scores.iterrows():
        if row['score'] > threshold:
            key_phrases.append({
                'phrase': row['term'],
                'importance': row['score']
            })
    return key_phrases

In [24]:
def analyze_conversation( conversation_text: str) -> Dict[str, Any]:
    df = extract_dialogues(conversation_text)
    tfidf_df, importance_scores = analyze_with_tfidf(df['processed_content'])
    categorized_terms = categorize_terms(importance_scores)
    key_phrases = extract_key_phrases(importance_scores)
    summary = generate_summary(df)
    sentiment_analysis = []
    for _, row in df.iterrows():
        sentiment = analyze_sentiment(row['original_content'])
        sentiment_analysis.append({
            'speaker': row['speaker'],
            'content': row['original_content'],
            'sentiment': sentiment
        })
    return {
        'dialogue_analysis': df.to_dict('records'),
        'categorized_terms': categorized_terms,
        'key_phrases': key_phrases,
        'summary': summary,
        'sentiment_analysis': sentiment_analysis
    }

In [25]:
results = analyze_conversation(conversation)



## Summary

In [26]:
def generate_comprehensive_summary(results):
    categorized_terms = results.get('categorized_terms', {})
    key_phrases = results.get('key_phrases', [])
    sentiment_analysis = results.get('sentiment_analysis', [])
    summary_stats = results.get('summary', {}).get('statistics', {})
    dialogue_analysis = results.get('dialogue_analysis', [])
    summary_parts = []
    total_turns = summary_stats.get('total_turns', 0)
    speaker_distribution = summary_stats.get('speaker_distribution', {})
    if total_turns > 0:
        summary_parts.append(f"The conversation is a {total_turns}-turn dialogue between a recovery agent and a borrower.")
        if speaker_distribution:
            for speaker, turns in speaker_distribution.items():
                participation_percentage = (turns / total_turns) * 100
                summary_parts.append(f"{speaker} contributed to {participation_percentage:.2f}% of the conversation.")
    
    if 'payment' in categorized_terms:
        payment_terms = categorized_terms['payment']
        summary_parts.append("The primary focus was on resolving outstanding financial obligations.")
        
        payment_phrases = [term['term'] for term in payment_terms]
        if payment_phrases:
            summary_parts.append(f"Key payment-related terms included: {', '.join(payment_phrases)}.")
    
    if 'problem' in categorized_terms:
        problem_terms = categorized_terms['problem']
        problem_phrases = [term['term'] for term in problem_terms]
        summary_parts.append("The conversation revealed significant financial challenges.")
        
        if problem_phrases:
            summary_parts.append(f"Specific problem indicators: {', '.join(problem_phrases)}.")
    
    negative_sentiments = [turn for turn in sentiment_analysis if turn['sentiment']['sentiment'] == 'negative']
    positive_sentiments = [turn for turn in sentiment_analysis if turn['sentiment']['sentiment'] == 'positive']
    
    if negative_sentiments or positive_sentiments:
        sentiment_summary = "The conversation's emotional landscape was characterized by "
        sentiment_components = []
        
        if negative_sentiments:
            sentiment_components.append(f"{len(negative_sentiments)} moments of tension or concern")
        
        if positive_sentiments:
            sentiment_components.append(f"{len(positive_sentiments)} instances of constructive dialogue")
        
        summary_parts.append(sentiment_summary + " and ".join(sentiment_components) + ".")
    
    if 'deadline' in categorized_terms:
        deadline_terms = categorized_terms['deadline']
        deadline_phrases = [term['term'] for term in deadline_terms]
        summary_parts.append("Significant emphasis was placed on timing and scheduled resolutions.")
        
        if deadline_phrases:
            summary_parts.append(f"Key timing-related terms: {', '.join(deadline_phrases)}.")
    
    if 'agreement' in categorized_terms:
        agreement_terms = categorized_terms['agreement']
        agreement_phrases = [term['term'] for term in agreement_terms]
        summary_parts.append("The dialogue culminated in a mutually acceptable resolution strategy.")
        
        if agreement_phrases:
            summary_parts.append(f"Resolution indicators: {', '.join(agreement_phrases)}.")
    
    # Key Phrases Interpretation
    if key_phrases:
        top_phrases = [phrase['phrase'] for phrase in key_phrases[:3]]
        summary_parts.append(f"Critical discussion anchors: {', '.join(top_phrases)}.")
    
    # Concluding Insight
    final_summary = " ".join(summary_parts)
    final_summary += " The interaction exemplifies a nuanced approach to financial negotiation, balancing institutional requirements with individual circumstances."
    
    return final_summary

comprehensive_summary = generate_comprehensive_summary(results)
print(comprehensive_summary)

The conversation is a 16-turn dialogue between a recovery agent and a borrower. Recovery_Agent contributed to 50.00% of the conversation. Borrower contributed to 50.00% of the conversation. The primary focus was on resolving outstanding financial obligations. Key payment-related terms included: पसै पैसे, पैसे नहीं, पैसे. The conversation revealed significant financial challenges. Specific problem indicators: समस्या. The conversation's emotional landscape was characterized by 5 moments of tension or concern and 5 instances of constructive dialogue. Significant emphasis was placed on timing and scheduled resolutions. Key timing-related terms: अगले हफ्ते, हफ्ते, समय, महीने, अगले महीने, हफ्ते कुछ, समय मिल, पिछले महीने, समय करना, भगुतान समय, हफ्ते कर, समय भगुतान, हफ्ते आधा. The dialogue culminated in a mutually acceptable resolution strategy. Resolution indicators: ठीक, स्वीकार. Critical discussion anchors: अलविदा, भगुतान, ठीक. The interaction exemplifies a nuanced approach to financial neg

## Key points

In [27]:
print("\nKey Points:")
for point in results['summary']['key_points']:
    print(f"- {point['speaker']}: {point['content']}")


Key Points:
- Recovery_Agent: सर, आपका पिछले महीने का EMI अभी तक नहीं आया है । क्या कोई समस्या है?
- Recovery_Agent: ठीक है , आधा भगुतान अगले हफ्ते तक कर दीजिए । बाकी का क्या प्लान है आपका ?
- Recovery_Agent: ठीक है । तो हम ऐसा करते हैं आप अगले हफ्ते तक आधा EMI जमा कर दीजिए, और अगले महीने के 15 तारीख तक बाकी का भगुतान कर दीजि ए। क्या यह आपको स्वीकार है ?
- Recovery_Agent: बहुत अच्छा । मैं आपको एक SMS भेज रहा हूं जिसमें भगुतान की डिटेल्स होंगी । कृपया इसका पालन करें और समय पर भगुतान करें ।
- Borrower: हां, थोड़ी दिक्कत है। मेरी नौकरी चली गई है और मैं नया काम ढूंढ रहा हूं।


## Sentiment analysis

In [28]:

print("\nSentiment Analysis:")
for turn in results['sentiment_analysis']:
    print(f"{turn['speaker']}: {turn['content']} -> {turn['sentiment']['sentiment']}")
print(f"Overall sentiment: {results['summary']['sentiment_by_speaker']}")


Sentiment Analysis:
Recovery_Agent: नमस्ते श्री कुमार, मैं एक्स वाई जेड फाइनेंस से बोल रहा हूं । आपके लोन के बारे में बात करनी थी । -> neutral
Recovery_Agent: सर, आपका पिछले महीने का EMI अभी तक नहीं आया है । क्या कोई समस्या है? -> negative
Recovery_Agent: ओह , यह तो बरुा हुआ। लेकिन सर , आपको समझना होगा कि लोन का भगुतान समय पर करना बहुत जरूरी है । -> neutral
Recovery_Agent: हम समझते हैं आपकी स्थिति । क्या आप अगले हफ्ते तक कुछ भगुतान कर सकते है ? -> neutral
Recovery_Agent: ठीक है , आधा भगुतान अगले हफ्ते तक कर दीजिए । बाकी का क्या प्लान है आपका ? -> positive
Recovery_Agent: ठीक है । तो हम ऐसा करते हैं आप अगले हफ्ते तक आधा EMI जमा कर दीजिए, और अगले महीने के 15 तारीख तक बाकी का भगुतान कर दीजि ए। क्या यह आपको स्वीकार है ? -> positive
Recovery_Agent: बहुत अच्छा । मैं आपको एक SMS भेज रहा हूं जिसमें भगुतान की डिटेल्स होंगी । कृपया इसका पालन करें और समय पर भगुतान करें । -> positive
Recovery_Agent: आपका स्वागत है । अगर कोई और सवाल हो तो मुझे बताइएगा। अलविदा । -> neutral
Borrower: हां, बोलिए। क्य