In [1]:
# notebooks/03_scenario_analysis.ipynb

import pandas as pd
import os
import sys
import matplotlib.pyplot as plt
import seaborn as sns

# Add the 'scripts' directory to the Python path
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), os.pardir, 'scripts')))
from scripts.utils import preprocess_text # For manual keyword checks

# Configure plotting style
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (10, 6)
plt.rcParams['font.size'] = 12

# Load the analyzed data
processed_data_dir = os.path.join(os.path.abspath(''), os.pardir, 'data', 'processed')
input_filepath = os.path.join(processed_data_dir, 'fintech_app_reviews_analyzed.csv')

if not os.path.exists(input_filepath):
    print(f"Error: Analyzed data file not found at {input_filepath}. Please run 02_sentiment_thematic_analysis.ipynb first.")
    df_analyzed = pd.DataFrame() # Create an empty DataFrame to avoid errors
else:
    df_analyzed = pd.read_csv(input_filepath)
    print(f"Loaded {len(df_analyzed)} analyzed reviews.")
    
    # Ensure 'Processed_Reviews_Tokens' and 'Extracted_Keywords' are parsed as lists
    # They were stored as stringified lists in CSV, need to convert back
    import ast # For safely evaluating string representations of lists
    df_analyzed['Processed_Reviews_Tokens'] = df_analyzed['Processed_Reviews_Tokens'].apply(lambda x: ast.literal_eval(x) if pd.notna(x) else [])
    df_analyzed['Extracted_Keywords'] = df_analyzed['Extracted_Keywords'].apply(lambda x: ast.literal_eval(x) if pd.notna(x) else [])


print("\n--- Scenario 1: Retaining Users (Slow Loading during Transfers) ---")
# CBE has 4.4, BOA 2.8, Dashen 4.0
# Users complain about slow loading during transfers. Analyze if this is a broader issue.

# Keywords related to slow loading/transfers
slow_transfer_keywords = ['slow', 'loading', 'lag', 'transfer', 'send money', 'delay', 'time', 'network']

# Filter reviews containing these keywords (case-insensitive search in original text)
slow_transfer_reviews = df_analyzed[
    df_analyzed['Review Text'].str.contains('|'.join(slow_transfer_keywords), case=False, na=False)
].copy()

if not slow_transfer_reviews.empty:
    print(f"\nTotal reviews mentioning slow loading/transfers: {len(slow_transfer_reviews)}")

    # Sentiment distribution for these reviews
    plt.figure(figsize=(8, 5))
    sns.countplot(data=slow_transfer_reviews, x='Sentiment', palette='coolwarm', order=['Negative', 'Neutral', 'Positive'])
    plt.title('Sentiment of "Slow Loading/Transfer" Related Reviews')
    plt.xlabel('Sentiment')
    plt.ylabel('Number of Reviews')
    plt.show()

    # Breakdown by bank
    print("\nSlow Loading/Transfer Complaints by Bank:")
    bank_complaints = slow_transfer_reviews.groupby('Bank/App Name')['Sentiment'].value_counts().unstack(fill_value=0)
    bank_complaints['Total'] = bank_complaints.sum(axis=1)
    bank_complaints['Negative_Ratio'] = bank_complaints['Negative'] / bank_complaints['Total']
    print(bank_complaints.sort_values(by='Negative_Ratio', ascending=False))

    plt.figure(figsize=(10, 6))
    sns.barplot(data=bank_complaints.reset_index(), x='Bank/App Name', y='Negative_Ratio', palette='viridis')
    plt.title('Ratio of Negative Sentiment for "Slow Loading/Transfer" Reviews by Bank')
    plt.xlabel('Bank/App Name')
    plt.ylabel('Ratio of Negative Reviews')
    plt.show()

    # Insights and Recommendations for Scenario 1
    print("\n**Scenario 1 Insights & Recommendations:**")
    print("The analysis of 'slow loading/transfer' complaints shows:")
    print(f"- Overall, {slow_transfer_reviews['Sentiment'].value_counts().get('Negative', 0)} out of {len(slow_transfer_reviews)} reviews mentioning these issues are negative, indicating a significant pain point.")
    print("- While CBE has a high overall rating, it still receives complaints about this issue, suggesting underlying technical challenges that affect user experience even for its generally satisfied users.")
    print("- BOA shows the highest ratio of negative sentiment related to this theme, aligning with its lower overall rating, implying this is a critical area for improvement.")
    print("\nRecommendations:")
    print("1. **Deep Technical Audit:** All banks, especially BOA, should conduct a deep technical audit of their transaction processing systems, API response times, and network infrastructure, particularly during peak hours.")
    print("2. **Targeted Performance Optimization:** Focus on optimizing the performance of critical paths like money transfers and balance inquiries. Implement caching mechanisms and optimize database queries.")
    print("3. **Proactive Communication:** Inform users about known performance issues and when fixes are deployed. Implement in-app progress indicators for transfers to manage user expectations.")
else:
    print("No reviews found mentioning 'slow loading' or 'transfer' related issues.")

print("\n--- Scenario 2: Enhancing Features ---")
# Extract desired features (e.g., transfer, fingerprint login, faster loading times)
# Recommend how each bank can stay competitive.

# Filter for reviews mentioning feature requests (positive sentiment for new features)
# Or reviews that directly ask for a feature (e.g., "I wish it had...")
feature_request_keywords = ['feature', 'add', 'wish', 'want', 'suggest', 'improve', 'need', 'require', 'update']

# Analyze sentiment for reviews where 'Identified_Theme' is 'Feature Requests'
feature_reviews = df_analyzed[df_analyzed['Identified_Theme'].str.contains('Feature Requests', na=False)].copy()

if not feature_reviews.empty:
    print(f"\nTotal reviews categorized as Feature Requests: {len(feature_reviews)}")

    # Extract common keywords/n-grams from feature request reviews
    all_feature_request_words = [word for sublist in feature_reviews['Processed_Reviews_Tokens'] for word in sublist]
    feature_word_freq = pd.Series(all_feature_request_words).value_counts().head(20)

    print("\nTop 20 Keywords in Feature Request Reviews:")
    print(feature_word_freq)

    plt.figure(figsize=(12, 7))
    sns.barplot(x=feature_word_freq.index, y=feature_word_freq.values, palette='crest')
    plt.title('Top Keywords in Feature Request Reviews')
    plt.xlabel('Keyword')
    plt.ylabel('Frequency')
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.show()

    # Breakdown of desired features by bank
    print("\nCommon Feature Mentions by Bank:")
    
    # We can refine this by looking at specific feature-related keywords within each bank's 'Feature Requests'
    desired_features_map = {
        'fingerprint login': ['fingerprint', 'biometric', 'faceid', 'login'],
        'faster loading': ['fast', 'speed', 'loading', 'quick'],
        'bill payment': ['bill', 'pay', 'utility'],
        'P2P transfer': ['p2p', 'peer to peer'],
        'budgeting tools': ['budget', 'expense', 'track', 'spending']
    }

    feature_bank_counts = {}
    for bank in df_analyzed['Bank/App Name'].unique():
        bank_feature_counts = {}
        bank_feature_reviews = df_analyzed[(df_analyzed['Bank/App Name'] == bank) & (df_analyzed['Identified_Theme'].str.contains('Feature Requests', na=False))]
        
        bank_text = ' '.join(bank_feature_reviews['Review Text'].dropna().tolist())
        
        for feature, keywords in desired_features_map.items():
            count = sum(bank_text.lower().count(kw) for kw in keywords)
            if count > 0:
                bank_feature_counts[feature] = count
        feature_bank_counts[bank] = bank_feature_counts

    feature_comparison_df = pd.DataFrame(feature_bank_counts).fillna(0).T
    print(feature_comparison_df)

    # Insights and Recommendations for Scenario 2
    print("\n**Scenario 2 Insights & Recommendations:**")
    print("Common feature requests across all banks include fingerprint login, faster loading, and budgeting tools.")
    print("Recommendations:")
    print("1. **Prioritize Biometric Login:** Given the prevalence of 'fingerprint' keywords, all banks should prioritize implementing or improving biometric login (fingerprint/face ID) for enhanced security and convenience.")
    print("2. **Focus on Speed:** 'Faster loading' is a recurring request. This reinforces the need for ongoing performance optimization, as discussed in Scenario 1.")
    print("3. **Innovation for Competitive Edge:**")
    print("   - **CBE:** Leverage its strong user base. Consider introducing advanced features like in-app budgeting tools or personalized financial advice to deepen engagement and retention.")
    print("   - **BOA:** Focus on core functionality first, ensuring existing features are robust and performant. Then, add highly requested features like biometric login to address immediate user needs and build trust.")
    print("   - **Dashen Bank:** Explore niche features like QR code payments or specific bill payment integrations if user feedback points to these, to differentiate itself in the market.")
else:
    print("No reviews found categorized as 'Feature Requests'.")

print("\n--- Scenario 3: Managing Complaints ---")
# Cluster and track complaints (e.g., “login error”) to guide AI chatbot integration.

# Focus on negative reviews for complaint clustering
negative_reviews = df_analyzed[df_analyzed['Sentiment'] == 'Negative'].copy()

if not negative_reviews.empty:
    print(f"\nTotal Negative Reviews for Complaint Analysis: {len(negative_reviews)}")

    # Use 'Identified_Theme' as initial clusters for complaints
    print("\nTop Complaint Themes (from Negative Reviews):")
    complaint_theme_counts = negative_reviews['Identified_Theme'].value_counts()
    print(complaint_theme_counts)

    plt.figure(figsize=(10, 6))
    sns.barplot(x=complaint_theme_counts.index, y=complaint_theme_counts.values, palette='Reds_d')
    plt.title('Top Complaint Themes in Negative Reviews')
    plt.xlabel('Complaint Theme')
    plt.ylabel('Number of Negative Reviews')
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.show()

    # Identify top keywords in critical themes, e.g., 'Account Access Issues'
    if 'Account Access Issues' in complaint_theme_counts.index:
        access_issue_reviews = negative_reviews[negative_reviews['Identified_Theme'] == 'Account Access Issues']
        all_access_words = [word for sublist in access_issue_reviews['Processed_Reviews_Tokens'] for word in sublist]
        access_word_freq = pd.Series(all_access_words).value_counts().head(10)
        print("\nTop Keywords in 'Account Access Issues' (Negative Reviews):")
        print(access_word_freq)
        
        plt.figure(figsize=(8, 5))
        sns.barplot(x=access_word_freq.index, y=access_word_freq.values, palette='Blues_d')
        plt.title('Top Keywords in Negative "Account Access Issues"')
        plt.xlabel('Keyword')
        plt.ylabel('Frequency')
        plt.xticks(rotation=45, ha='right')
        plt.tight_layout()
        plt.show()

    # Insights and Recommendations for Scenario 3
    print("\n**Scenario 3 Insights & Recommendations:**")
    print("The dominant complaint themes, especially 'Account Access Issues' and 'Transaction Performance', indicate areas ripe for support automation.")
    print("\nRecommendations for AI Chatbot Integration and Faster Support:")
    print("1. **Chatbot Training Data:** Use the identified complaint themes and their associated negative review texts (e.g., reviews related to 'login error', 'transaction failed', 'app crash') as direct training data for the AI chatbot's NLU (Natural Language Understanding) module.")
    print("2. **Automated FAQs & Troubleshooting:** Develop automated responses and step-by-step troubleshooting guides for common, high-volume issues like 'login error' or 'OTP not received'. The chatbot should guide users through these processes.")
    print("3. **Smart Routing for Complex Issues:** For themes like 'Complex transaction disputes' or 'Security concerns' (which might emerge from more advanced clustering), the chatbot should be trained to identify these and immediately escalate to a human agent, providing the agent with the chat transcript and relevant user data.")
    print("4. **Proactive Issue Detection:** Monitor real-time incoming reviews for spikes in specific complaint keywords. If a sudden increase in 'login error' complaints is detected, it could signal a system-wide issue, prompting immediate investigation by engineering teams.")
else:
    print("No negative reviews found for complaint analysis.")

ModuleNotFoundError: No module named 'scripts.utils'