In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk every directory under the Kaggle input mount.
# NOTE(review): the joined path is built and immediately discarded, so this loop
# has no visible effect even though the comment above says it lists the files;
# wrap the expression in print() if a listing is actually wanted.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        os.path.join(dirname, filename)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
%pip install -U openai-whisper

In [None]:
%pip install jiwer

# Niranjan's 

## OpenAI - Whisper Model Loading and Transcribing

In [None]:
import whisper
import torch
import os
import pandas as pd
import numpy as np
from jiwer import wer
from transformers import MarianMTModel, MarianTokenizer
import re

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Load the Whisper checkpoint directly onto the selected device.
model = whisper.load_model("large", device=device)

# Dataset location and output file
audio_folder = "/kaggle/input/voice1-niranjan/CodeSwitchDataset/CodeSwitch_Dataset_wav"  # Change to actual path
output_csv = "transcriptions.csv"
data = []  # One [SNo, FileName, Transcription] row per audio file

# Filter to .wav files *before* numbering so SNo is contiguous (1..N) even when the
# folder also contains other entries; sorting keeps the row order deterministic.
wav_files = sorted(f for f in os.listdir(audio_folder) if f.endswith(".wav"))

# Request half precision only when running on CUDA; full precision is used on CPU.
use_fp16 = device == "cuda"

for idx, file in enumerate(wav_files, start=1):
    audio_path = os.path.join(audio_folder, file)

    # Transcribe audio
    result = model.transcribe(audio_path, fp16=use_fp16)
    transcribed_text = result["text"]

    # Append to list for CSV
    data.append([idx, file, transcribed_text])
    print(f"Processed {file}.")

# Save all transcriptions to CSV
whisper_df = pd.DataFrame(data, columns=["SNo", "FileName", "Transcription"])
whisper_df.to_csv(output_csv, index=False)
print("All transcriptions saved to CSV.")

In [None]:
# Reload the saved Whisper output from disk and echo each transcription on its own line.
transcribe_df = pd.read_csv("/kaggle/working/transcriptions.csv")
for transcription in transcribe_df["Transcription"]:
    print(transcription)

## COMPARING TRANSCRIPTIONS AND CALCULATING WER

In [None]:
# Load the reference transcriptions that ship with the dataset and echo each one.
test_df = pd.read_csv("/kaggle/input/voice1-niranjan/CodeSwitchDataset/CodeSwitch_36.csv")
for reference_text in test_df["Transcription"]:
    print(reference_text)

In [None]:
# ============== PART 2: COMPARING TRANSCRIPTIONS AND CALCULATING WER ==============

# Compiled once; captures the digits that follow "Voice" (optionally separated by "_" or whitespace).
_VOICE_NUMBER_PATTERN = re.compile(r'Voice[_\s]?(\d+)')


def _voice_number(file_name):
    """Return the voice number embedded in ``file_name``, or ``file_name`` itself when none is found."""
    match = _VOICE_NUMBER_PATTERN.search(file_name)
    return match.group(1) if match else file_name


# Load the original (reference) transcriptions
original_df = pd.read_csv("/kaggle/input/voice1-niranjan/CodeSwitchDataset/CodeSwitch_36.csv")

# Derive the join key on both sides. whisper_df gains the column in place, as before,
# because later cells write whisper_df (including this column) back to disk.
whisper_df['Voice_Number'] = whisper_df['FileName'].apply(_voice_number)
original_df['Voice_Number'] = original_df['FileName'].apply(_voice_number)

# Merge the two dataframes based on Voice Number
merged_df = pd.merge(
    whisper_df,
    original_df,
    on="Voice_Number",
    suffixes=('_whisper', '_original'),
    how="inner"
)
# The inner join silently drops unmatched files, so report how many survived.
print(f"Matched {len(merged_df)} of {len(whisper_df)} Whisper transcriptions to a reference.")

# A missing or blank reference cannot be scored, so drop those rows up front
# instead of letting wer() fail part-way through the loop.
reference_text = merged_df["Transcription_original"].fillna("").astype(str)
has_reference = reference_text.str.strip().ne("")
if not has_reference.all():
    print(f"Skipping {int((~has_reference).sum())} row(s) with an empty reference transcription.")
    merged_df = merged_df[has_reference].reset_index(drop=True)

# A missing hypothesis is treated as an empty transcription.
# NOTE(review): assumes jiwer accepts an empty hypothesis string; confirm for the installed version.
merged_df["Transcription_whisper"] = merged_df["Transcription_whisper"].fillna("").astype(str)

# Calculate WER for each transcription (reference first, hypothesis second)
wer_scores = [
    wer(str(original_trans), whisper_trans)
    for original_trans, whisper_trans in zip(
        merged_df["Transcription_original"], merged_df["Transcription_whisper"]
    )
]

# Add WER to the merged dataframe
merged_df["WER"] = wer_scores

# Calculate average WER (a fraction, not a percentage); NaN when nothing matched
avg_wer = np.mean(wer_scores) if wer_scores else float("nan")
print(f"Average Word Error Rate: {avg_wer:.4f}")
print(f"Word Error Rate: {wer_scores}")
# Save the comparison results
merged_df.to_csv("transcription_comparison.csv", index=False)
print("Comparison saved to transcription_comparison.csv")

In [None]:
# Create visualization graphs for WER results
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd

# Set aesthetic style for better visualization
plt.style.use('fivethirtyeight')
sns.set_palette("viridis")

# WER values are stored as fractions (this notebook multiplies them by 100 wherever it
# reports percentages), so reference lines are drawn at the fractional mean and only the
# label text is converted to a percentage.
mean_wer = merged_df["WER"].mean()
mean_wer_label = f'Avg WER: {mean_wer * 100:.2f}%'

# Create a figure with multiple subplots
fig = plt.figure(figsize=(20, 16))
fig.suptitle('Whisper Transcription Analysis: Word Error Rate (WER)', fontsize=24, y=0.95)

# 1. Distribution of WER scores
ax1 = fig.add_subplot(2, 2, 1)
sns.histplot(merged_df["WER"], bins=15, kde=True, ax=ax1)
ax1.set_title('Distribution of Word Error Rates', fontsize=18)
ax1.set_xlabel('WER', fontsize=14)
ax1.set_ylabel('Frequency', fontsize=14)
ax1.axvline(mean_wer, color='red', linestyle='--', linewidth=2, label=mean_wer_label)
ax1.legend()

# 2. WER scores by transcription length
ax2 = fig.add_subplot(2, 2, 2)
merged_df['Original_Length'] = merged_df['Transcription_original'].apply(lambda x: len(x.split()))
sns.scatterplot(x='Original_Length', y='WER', data=merged_df, alpha=0.7, ax=ax2)
ax2.set_title('WER vs Original Transcription Length', fontsize=18)
ax2.set_xlabel('Number of Words in Original Transcription', fontsize=14)
ax2.set_ylabel('WER', fontsize=14)

# Add trend line (first-degree least-squares fit)
z = np.polyfit(merged_df['Original_Length'], merged_df['WER'], 1)
p = np.poly1d(z)
ax2.plot(merged_df['Original_Length'], p(merged_df['Original_Length']),
         color='red', linestyle='--', linewidth=2)

# 3. Top 10 highest WER samples
ax3 = fig.add_subplot(2, 2, 3)
top_wer = merged_df.sort_values('WER', ascending=False).head(10)
# Size the palette to the rows actually present (there may be fewer than 10 samples).
colors = sns.color_palette("viridis", len(top_wer))
bars = sns.barplot(x=top_wer['Voice_Number'], y=top_wer['WER'], palette=colors, ax=ax3)
ax3.set_title('Top 10 Samples with Highest WER', fontsize=18)
ax3.set_xlabel('Voice Sample Number', fontsize=14)
ax3.set_ylabel('WER', fontsize=14)
ax3.axhline(mean_wer, color='red', linestyle='--', linewidth=2, label=mean_wer_label)
ax3.legend()

# Rotate x labels for better readability
plt.setp(ax3.get_xticklabels(), rotation=45)

# 4. WER Heatmap: Create categories based on transcription length and WER
ax4 = fig.add_subplot(2, 2, 4)

# Create length categories
# NOTE(review): qcut with fixed labels raises if heavy ties make quantile edges coincide.
merged_df['Length_Category'] = pd.qcut(merged_df['Original_Length'],
                                      q=5,
                                      labels=['Very Short', 'Short', 'Medium', 'Long', 'Very Long'])

# Create WER categories
merged_df['WER_Category'] = pd.qcut(merged_df['WER'],
                                   q=5,
                                   labels=['Excellent', 'Good', 'Average', 'Poor', 'Very Poor'])

# Create a cross-tabulation and normalize by row
heatmap_data = pd.crosstab(merged_df['Length_Category'],
                           merged_df['WER_Category'],
                           normalize='index')

# Create heatmap
sns.heatmap(heatmap_data, annot=True, cmap='YlGnBu', fmt='.2f', ax=ax4)
ax4.set_title('WER Performance by Transcription Length Category', fontsize=18)
ax4.set_xlabel('WER Category', fontsize=14)
ax4.set_ylabel('Transcription Length', fontsize=14)

# Save the 2x2 grid *before* opening the next figure: pyplot's tight_layout/savefig act on
# the current figure, which previously meant only the progression plot reached the PNG.
fig.tight_layout(rect=(0, 0, 1, 0.93))  # leave headroom for the suptitle
fig.savefig('wer_analysis_results.png', dpi=300, bbox_inches='tight')

# Additional insightful graph: WER progression across voice samples.
# Order by numeric voice number so "10" follows "9" rather than "1"; keys without a
# number sort last and keep their relative order.
progression_df = merged_df.sort_values(
    'Voice_Number',
    key=lambda col: pd.to_numeric(col, errors='coerce'),
    kind='stable',
)
progression_fig, progression_ax = plt.subplots(figsize=(18, 6))
progression_ax.plot(progression_df['Voice_Number'], progression_df['WER'],
                    marker='o', linestyle='-', color='blue', alpha=0.7)
progression_ax.axhline(y=mean_wer, color='r', linestyle='--',
                       label=f'Average WER: {mean_wer * 100:.2f}%')
progression_ax.set_title('WER Progression Across Voice Samples', fontsize=20)
progression_ax.set_xlabel('Voice Sample Number', fontsize=14)
progression_ax.set_ylabel('Word Error Rate (WER)', fontsize=14)
progression_ax.grid(True, alpha=0.3)
progression_ax.legend()

# Adjust layout for better spacing and save the progression plot to its own file
progression_fig.tight_layout()
progression_fig.savefig('wer_progression.png', dpi=300, bbox_inches='tight')
plt.show()

# Create an interactive visualization using plotly
try:
    import plotly.express as px
    import plotly.graph_objects as go
    from plotly.subplots import make_subplots

    # WER values are fractions, so the reference line is placed at the fractional mean.
    mean_wer = merged_df['WER'].mean()
    avg_line_style = dict(color="Red", width=2, dash="dash")

    # Create a figure with subplots
    fig = make_subplots(
        rows=2, cols=2,
        subplot_titles=(
            'WER Distribution',
            'WER vs Transcription Length',
            'Top 10 Highest WER Samples',
            'WER by Voice Sample'
        )
    )

    # Add WER distribution histogram
    fig.add_trace(
        go.Histogram(x=merged_df['WER'], nbinsx=20, name='WER Distribution'),
        row=1, col=1
    )

    # Add scatter plot for WER vs transcription length
    fig.add_trace(
        go.Scatter(
            x=merged_df['Original_Length'],
            y=merged_df['WER'],
            mode='markers',
            marker=dict(size=8, color=merged_df['WER'], colorscale='Viridis', showscale=True),
            name='WER vs Length',
            text=merged_df['Voice_Number']
        ),
        row=1, col=2
    )

    # Add bar chart for top 10 highest WER
    fig.add_trace(
        go.Bar(
            x=top_wer['Voice_Number'],
            y=top_wer['WER'],
            marker=dict(color=top_wer['WER'], colorscale='Viridis'),
            name='Top 10 Highest WER'
        ),
        row=2, col=1
    )

    # Add line chart for WER by sample, ordered by numeric voice number so "10"
    # follows "9" instead of "1" (keys without a number sort last).
    sorted_df = merged_df.sort_values(
        'Voice_Number',
        key=lambda col: pd.to_numeric(col, errors='coerce'),
        kind='stable',
    )
    fig.add_trace(
        go.Scatter(
            x=sorted_df['Voice_Number'],
            y=sorted_df['WER'],
            mode='lines+markers',
            name='WER by Sample'
        ),
        row=2, col=2
    )

    # Update layout
    fig.update_layout(
        title_text='Interactive Whisper Transcription Analysis',
        height=800,
        width=1200,
        showlegend=False
    )

    # Mark the average WER on every panel. The histogram has WER on its x axis, so it
    # gets a vertical line; the other three panels have WER on y and get a horizontal
    # one. add_vline/add_hline resolve the subplot axis names themselves (the previous
    # hand-built refs such as "x11"/"y22" pointed at axes that do not exist in this grid).
    fig.add_vline(x=mean_wer, line=avg_line_style, row=1, col=1)
    for row, col in ((1, 2), (2, 1), (2, 2)):
        fig.add_hline(y=mean_wer, line=avg_line_style, row=row, col=col)

    # Save as HTML file
    fig.write_html('interactive_wer_analysis.html')
    print("Interactive visualization saved as interactive_wer_analysis.html")

    # Show the plot
    fig.show()

except ImportError:
    print("Plotly not available. Skipping interactive visualization.")

# Additional analysis: Language detection and WER correlation
try:
    import langdetect
    from langdetect import DetectorFactory, detect_langs

    # Seed the detector so repeated runs produce the same language guesses.
    DetectorFactory.seed = 0

    def detect_multilingual(text):
        """Guess whether ``text`` mixes languages.

        Returns a ``(is_multilingual, languages)`` pair: the flag is True when
        ``detect_langs`` reports more than one candidate, and ``languages`` holds the
        string form of each candidate. Any detection failure (including non-string
        input) yields ``(False, ["unknown"])``.
        """
        try:
            langs = detect_langs(text)
        except Exception:
            # Catch detection errors only; a bare except would also swallow KeyboardInterrupt.
            return False, ["unknown"]
        return len(langs) > 1, [str(lang) for lang in langs]

    # Apply language detection; built as two lists so an empty frame does not break unpacking
    detections = [detect_multilingual(text) for text in merged_df['Transcription_original']]
    merged_df['Is_Multilingual'] = [flag for flag, _ in detections]
    merged_df['Languages'] = [languages for _, languages in detections]

    # Create plot showing WER by language composition
    plt.figure(figsize=(12, 6))
    sns.boxplot(x='Is_Multilingual', y='WER', data=merged_df)
    plt.title('WER Comparison: Monolingual vs Multilingual Content', fontsize=18)
    plt.xlabel('Contains Multiple Languages', fontsize=14)
    plt.ylabel('Word Error Rate (WER)', fontsize=14)
    plt.savefig('wer_by_language_composition.png', dpi=300, bbox_inches='tight')
    plt.show()

    print("Language analysis completed and visualization saved.")

except ImportError:
    print("Language detection packages not available. Skipping language analysis.")

# Generate statistics table for summary.
# Every WER figure is a fraction in merged_df, so each one (including the mean) is
# multiplied by 100 for the "(%)" rows, and the "above average" count compares
# fractions against the fractional mean.
wer_column = merged_df['WER']
mean_wer = wer_column.mean()

stats_df = pd.DataFrame({
    'Metric': ['Sample Count', 'Average WER (%)', 'Median WER (%)', 'Min WER (%)', 'Max WER (%)',
              'Std Dev WER (%)', 'Samples Above Avg WER'],
    'Value': [
        len(merged_df),
        round(mean_wer * 100, 2),
        round(np.median(wer_column) * 100, 2),
        round(wer_column.min() * 100, 2),
        round(wer_column.max() * 100, 2),
        round(np.std(wer_column) * 100, 2),
        int((wer_column > mean_wer).sum())
    ]
})

# Print summary statistics
print("\n=== WER Analysis Summary ===")
print(stats_df.to_string(index=False))

# Save all results
merged_df.to_csv("wer_analysis_complete.csv", index=False)
print("\nComplete analysis results saved to wer_analysis_complete.csv")

## English Words to Tamil

In [None]:
# ============== PART 3: ENGLISH TO TAMIL CONVERSION ==============
import re
import torch
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer

# Load the English-to-Tamil translation checkpoint and its tokenizer through the
# M2M100 classes, and move the model to `device`.
# NOTE(review): `device` is not defined in this cell; it must already exist from the
# Whisper cell, so this cell fails on a fresh kernel unless that cell ran first.
model_name = "suriya7/English-to-Tamil"
tokenizer = M2M100Tokenizer.from_pretrained(model_name)
translator = M2M100ForConditionalGeneration.from_pretrained(model_name).to(device)

# Source language for every subsequent tokenizer call; the target language is chosen
# per generate() call inside convert_english_to_tamil.
tokenizer.src_lang = "en"

# Technical terms that should be preserved (can be expanded)
technical_terms = ["battery life", "laptop", "CPU", "GPU", "RAM", "SSD", "HDD", "Samsung", "Apple"]

# A run of ASCII letters delimited by word boundaries is treated as an English word.
_ENGLISH_WORD_PATTERN = re.compile(r'\b[a-zA-Z]+\b')
# English words are translated in groups of this size.
_WORDS_PER_CHUNK = 5


def _mask_technical_terms(text):
    """Swap each technical term found in ``text`` for a placeholder.

    Returns ``(masked_text, {placeholder: term})``. A term only matches when it is not
    glued to other ASCII letters, so "RAM" is left alone inside "program" while a term
    followed directly by non-Latin characters still matches.
    """
    preserved_terms = {}
    for term in technical_terms:
        placeholder = f"__TECH_TERM_{len(preserved_terms)}__"
        pattern = re.compile(
            r'(?<![A-Za-z])' + re.escape(term) + r'(?![A-Za-z])', re.IGNORECASE
        )
        text, hits = pattern.subn(placeholder, text)
        if hits:
            preserved_terms[placeholder] = term
    return text, preserved_terms


def _translate_chunk(chunk):
    """Translate one English phrase to Tamil; return the phrase unchanged if that fails."""
    try:
        encoded = tokenizer(chunk, return_tensors="pt").to(device)
        with torch.no_grad():
            translated = translator.generate(**encoded, forced_bos_token_id=tokenizer.get_lang_id("ta"))
        return tokenizer.batch_decode(translated, skip_special_tokens=True)[0]
    except Exception as e:
        print(f"Error translating chunk '{chunk}': {e}")
        return chunk  # Keep original if translation fails


def _build_translation_map(english_words, chunks, tamil_chunks):
    """Map each English word to Tamil text taken from the chunk that contains it.

    When a chunk and its translation have the same number of words the word at the same
    position is used; otherwise the whole translated chunk is used.
    NOTE(review): in the unequal-length case every word of the chunk maps to the full
    translated phrase, so the phrase is repeated once per English word in the output.
    """
    translation_map = {}
    for en_word in english_words:
        if en_word in translation_map:
            continue
        needle = en_word.lower()
        for chunk, tamil_chunk in zip(chunks, tamil_chunks):
            en_tokens = chunk.lower().split()
            # Whole-token test: a substring test could pick an earlier chunk that merely
            # contains the word inside a longer one (e.g. "a" inside "have").
            if needle not in en_tokens:
                continue
            ta_tokens = tamil_chunk.split()
            if len(en_tokens) == len(ta_tokens):
                translation_map[en_word] = ta_tokens[en_tokens.index(needle)]
            else:
                translation_map[en_word] = tamil_chunk
            break
    return translation_map


# Function to detect English words and convert them to Tamil
def convert_english_to_tamil(text):
    """Replace the English words in ``text`` with Tamil translations.

    Terms listed in ``technical_terms`` are masked first and restored (in their listed
    spelling) at the end, so they are never translated.

    Returns:
        ``(converted_text, translation_map)`` where ``translation_map`` maps each
        translated English word to the Tamil text that replaced it. The map is empty
        when nothing was translated or when translation failed. Non-string input is
        returned unchanged with an empty map.
    """
    if not isinstance(text, str):
        return text, {}

    text, preserved_terms = _mask_technical_terms(text)
    english_words = _ENGLISH_WORD_PATTERN.findall(text)
    translation_map = {}

    if english_words:
        try:
            # Group words into phrases for better translation
            chunks = [
                " ".join(english_words[start:start + _WORDS_PER_CHUNK])
                for start in range(0, len(english_words), _WORDS_PER_CHUNK)
            ]
            tamil_chunks = [_translate_chunk(chunk) for chunk in chunks]
            translation_map = _build_translation_map(english_words, chunks, tamil_chunks)

            # Single pass over the text: a replacement that itself contains Latin
            # letters can no longer be rewritten by a later substitution.
            text = _ENGLISH_WORD_PATTERN.sub(
                lambda match: translation_map.get(match.group(0), match.group(0)), text
            )
        except Exception as e:
            print(f"Translation error: {e}")
            translation_map = {}

    # Restore preserved technical terms on every path, including the one where the text
    # holds no other English words (placeholders used to leak into the output there).
    for placeholder, term in preserved_terms.items():
        text = text.replace(placeholder, term)

    return text, translation_map

# Run the converter over every Whisper transcription, keeping results in row order.
conversions = [convert_english_to_tamil(trans_text) for trans_text in whisper_df["Transcription"]]

# Split the (converted text, word map) pairs into the two columns to be stored.
tamil_transcriptions = [converted for converted, _ in conversions]
english_words_found = [
    ", ".join([f"{eng} → {tam}" for eng, tam in word_map.items()])
    for _, word_map in conversions
]

# Attach both columns to the Whisper frame
whisper_df["Tamil_Transcription"] = tamil_transcriptions
whisper_df["English_Words_Converted"] = english_words_found

# Persist the augmented frame
whisper_df.to_csv("tamil_transcriptions.csv", index=False)
print("Tamil conversions saved to tamil_transcriptions.csv")

In [None]:
# Reload the converted transcriptions from disk and echo each Tamil version.
tamil_converted_df = pd.read_csv('/kaggle/working/tamil_transcriptions.csv')
for tamil_line in tamil_converted_df['Tamil_Transcription']:
    print(tamil_line)

## WAV2VEC2

In [None]:
import soundfile as sf
import torch
import librosa
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
import pandas as pd
from tqdm.auto import tqdm
import os

# Load the pretrained Tamil wav2vec2 processor and CTC model once, up front, so
# transcribe_audio can reuse them for every file.
# NOTE(review): this rebinds the global name `model`, which the Whisper cell also uses.
# NOTE(review): no .to(device) call is made here, so the model presumably stays on the
# CPU even when a GPU is available; confirm whether that is intended.
processor = Wav2Vec2Processor.from_pretrained("addy88/wav2vec2-tamil-stt")
model = Wav2Vec2ForCTC.from_pretrained("addy88/wav2vec2-tamil-stt")

def transcribe_audio(audio_path):
    """Transcribe one audio file with the globally loaded wav2vec2 model.

    Args:
        audio_path: Path to the audio file to transcribe.

    Returns:
        The decoded transcription, or a string of the form ``"Error: <message>"`` when
        any step fails. Callers must check for that prefix, since failures are reported
        in-band rather than raised.
    """
    try:
        # Sampling rate the audio is resampled to before it reaches the processor
        target_sampling_rate = 16000

        # Load audio with librosa, resampling to the target rate
        audio_input, _ = librosa.load(audio_path, sr=target_sampling_rate)

        # Process audio with the correct sampling rate
        input_values = processor(audio_input, sampling_rate=target_sampling_rate, return_tensors="pt").input_values

        # Inference only: skip autograd bookkeeping, and run the inputs on whichever
        # device the model currently lives on.
        with torch.no_grad():
            logits = model(input_values.to(model.device)).logits
        predicted_ids = torch.argmax(logits, dim=-1)

        # Decode to text
        transcription = processor.decode(predicted_ids[0], skip_special_tokens=True)
        return transcription
    except Exception as e:
        return f"Error: {str(e)}"

# Folder holding the .wav files (same dataset the Whisper cell reads)
audio_folder = "/kaggle/input/voice1-niranjan/CodeSwitchDataset/CodeSwitch_Dataset_wav"
# Sorted so the CSV row order is deterministic and matches the Whisper cell's ordering.
filenames = sorted(f for f in os.listdir(audio_folder) if f.endswith(".wav"))

# Process all files and save transcriptions
results = []
for file in tqdm(filenames, desc="Transcribing files"):
    file_path = os.path.join(audio_folder, file)
    transcription = transcribe_audio(file_path)
    results.append({"Filename": file, "Transcription": transcription})

# Explicit columns keep the CSV schema stable even when no .wav file was found.
df_results = pd.DataFrame(results, columns=["Filename", "Transcription"])

# transcribe_audio reports failures in-band as "Error: ..." strings; surface the count
# so failed files are not mistaken for transcriptions.
failed_count = int(df_results["Transcription"].astype(str).str.startswith("Error: ").sum())
if failed_count:
    print(f"Warning: {failed_count} file(s) failed to transcribe; see rows starting with 'Error: '.")

# Save to CSV
output_csv = "wav2vec2_tamil_transcriptions.csv"
df_results.to_csv(output_csv, index=False)

print(f"Transcriptions saved to {output_csv}")
# Display sample results
df_results.head()

In [None]:
# Reload the wav2vec2 output from disk and echo each transcription on its own line.
wav2vec2_df = pd.read_csv('/kaggle/working/wav2vec2_tamil_transcriptions.csv')
for wav2vec2_line in wav2vec2_df['Transcription']:
    print(wav2vec2_line)

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots
# Import seaborn without the warning
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
import seaborn as sns

# For demonstration, let's create some sample data
# NOTE(review): this rebinds `merged_df` and `avg_wer`, replacing the real Whisper
# results computed in earlier cells with seeded random values. Everything plotted below
# in this cell is therefore synthetic. Also note `avg_wer` is a percentage here
# (mean * 100), whereas the WER cell stores it as a fraction.
np.random.seed(42)
sample_size = 100
voice_numbers = [f"Voice_{i}" for i in range(1, sample_size+1)]
wer_values = np.random.beta(2, 5, sample_size)  # Generate WER values between 0 and 1
transcription_lengths = np.random.randint(10, 200, sample_size)
avg_wer = np.mean(wer_values) * 100

# Same column names the real merged_df uses, so the plotting code below runs unchanged.
# 'Transcription_original' is filler text; 'Original_Length' is the random length itself.
merged_df = pd.DataFrame({
    'Voice_Number': voice_numbers,
    'WER': wer_values,
    'Transcription_original': [f"Sample text {i}" * (length // 10) for i, length in enumerate(transcription_lengths)],
    'Original_Length': transcription_lengths
})

# Create length categories (five equal-frequency bins)
merged_df['Length_Category'] = pd.qcut(merged_df['Original_Length'], 
                                      q=5, 
                                      labels=['Very Short', 'Short', 'Medium', 'Long', 'Very Long'])

# Create WER categories (five equal-frequency bins; lowest WER is 'Excellent')
merged_df['WER_Category'] = pd.qcut(merged_df['WER'], 
                                   q=5, 
                                   labels=['Excellent', 'Good', 'Average', 'Poor', 'Very Poor'])

# Get top 10 highest WER samples
top_wer = merged_df.sort_values('WER', ascending=False).head(10)

# Colour palette shared by every trace, shape and annotation in the dashboard below
primary_color = '#3a86ff'  # Vibrant blue
secondary_color = '#ff006e'  # Neon pink
accent_color = '#8338ec'  # Purple
success_color = '#06d6a0'  # Teal
warning_color = '#ffbe0b'  # Amber

# Modern light background
bg_color = '#ffffff'
grid_color = '#f8f9fa'
text_color = '#212529'

# Dashboard grid: 3 rows x 2 columns. Rows 1-2 hold four panels; row 3 is a single
# heatmap spanning both columns (hence the None placeholder for its second cell).
# subplot_titles lists the five titles in row-major order.
fig = make_subplots(
    rows=3, cols=2,
    specs=[
        [{"type": "histogram"}, {"type": "scatter"}],
        [{"type": "bar"}, {"type": "scatter"}],
        [{"colspan": 2, "type": "heatmap"}, None],
    ],
    subplot_titles=(
        '<span style="font-family: \'Segoe UI\', Arial, sans-serif; font-weight: 300;">WER Distribution</span>', 
        '<span style="font-family: \'Segoe UI\', Arial, sans-serif; font-weight: 300;">WER vs Transcription Length</span>',
        '<span style="font-family: \'Segoe UI\', Arial, sans-serif; font-weight: 300;">Top 10 Highest WER Samples</span>',
        '<span style="font-family: \'Segoe UI\', Arial, sans-serif; font-weight: 300;">WER by Voice Sample</span>',
        '<span style="font-family: \'Segoe UI\', Arial, sans-serif; font-weight: 300;">WER Performance Matrix</span>'
    ),
    vertical_spacing=0.12,
    horizontal_spacing=0.08
)

# 1. WER distribution histogram (row 1, col 1): WER on x, bin counts on y
hist_data = go.Histogram(
    x=merged_df['WER'],
    nbinsx=20,
    name='WER Distribution',
    marker=dict(
        color=merged_df['WER'],
        colorscale=[[0, primary_color], [1, secondary_color]],
        opacity=0.8,
        line=dict(color='white', width=0.5)
    ),
    hovertemplate='<span style="font-family: \'Segoe UI\', sans-serif;">WER: %{x:.4f}<br>Count: %{y}</span><extra></extra>'
)
fig.add_trace(hist_data, row=1, col=1)

# Add KDE curve - calculate manually to avoid seaborn warning
# The density is evaluated on 100 evenly spaced points from 0 to the largest WER.
from scipy import stats
kde_x = np.linspace(0, merged_df['WER'].max(), 100)
kde_y = stats.gaussian_kde(merged_df['WER'])(kde_x)
fig.add_trace(
    go.Scatter(
        x=kde_x,
        # NOTE(review): count/5 is a fixed heuristic. Matching a count histogram exactly
        # would need count * bin_width, and the bin width is not set explicitly here
        # (nbinsx is passed instead), so the curve height is only approximate.
        y=kde_y * (merged_df['WER'].count() / 5),  # Scale to match histogram
        mode='lines',
        line=dict(
            color=secondary_color,
            width=3,
            shape='spline',
            smoothing=1.3
        ),
        name='Density',
        hoverinfo='skip'
    ),
    row=1, col=1
)

# 2. Scatter plot of WER against transcription length (row 1, col 2), coloured by WER
fig.add_trace(
    go.Scatter(
        x=merged_df['Original_Length'],
        y=merged_df['WER'],
        mode='markers',
        marker=dict(
            size=10,
            color=merged_df['WER'],
            colorscale=[
                [0, success_color],
                [0.5, warning_color],
                [1, secondary_color]
            ],
            showscale=True,
            colorbar=dict(
                # The title font is nested under `title`; the separate colorbar
                # `titlefont` attribute is deprecated and rejected by newer Plotly releases.
                title=dict(
                    text="<span style='font-family: \"Segoe UI\", sans-serif; font-weight: 300;'>WER</span>",
                    font=dict(family="Segoe UI, sans-serif"),
                ),
                thickness=15,
                len=0.5,
                y=0.8,
                yanchor="top",
                tickfont=dict(family="Segoe UI, sans-serif"),
            ),
            opacity=0.8,
            symbol='circle'
        ),
        name='WER vs Length',
        text=merged_df['Voice_Number'],
        hovertemplate='<span style="font-family: \'Segoe UI\', sans-serif; font-weight: 300;"><b>%{text}</b><br>Length: %{x} words<br>WER: %{y:.4f}</span><extra></extra>'
    ),
    row=1, col=2
)

# Add trend line: first-degree least-squares fit, evaluated on 100 evenly spaced lengths
z = np.polyfit(merged_df['Original_Length'], merged_df['WER'], 1)
p = np.poly1d(z)
x_range = np.linspace(merged_df['Original_Length'].min(), merged_df['Original_Length'].max(), 100)
fig.add_trace(
    go.Scatter(
        x=x_range,
        y=p(x_range),
        mode='lines',
        line=dict(
            color=accent_color,
            width=3,
            dash='solid'
        ),
        name='Trend',
        hoverinfo='skip'
    ),
    row=1, col=2
)

# 3. Bar chart of the ten highest-WER samples (row 2, col 1).
# `top_wer` is already sorted by WER descending; bars are coloured by their WER value.
fig.add_trace(
    go.Bar(
        x=top_wer['Voice_Number'],
        y=top_wer['WER'],
        marker=dict(
            color=top_wer['WER'],
            colorscale=[
                [0, warning_color],
                [0.5, secondary_color],
                [1, '#ff0000']  # Bright red for highest values
            ],
            line=dict(color='white', width=0.5),
            opacity=0.9
        ),
        name='Top 10 Highest WER',
        hovertemplate='<span style="font-family: \'Segoe UI\', sans-serif;"><b>%{x}</b><br>WER: %{y:.4f}</span><extra></extra>'
    ),
    row=2, col=1
)

# 4. Line chart of WER per voice sample with area fill (row 2, col 2).
# Voice_Number values are strings, so a plain sort would place "Voice_10" before
# "Voice_2"; order by the first run of digits instead (labels without digits go last).
sorted_df = merged_df.sort_values(
    'Voice_Number',
    key=lambda col: col.astype(str).str.extract(r'(\d+)', expand=False).astype(float),
    kind='stable',
)
fig.add_trace(
    go.Scatter(
        x=sorted_df['Voice_Number'],
        y=sorted_df['WER'],
        mode='lines',
        line=dict(
            color=primary_color,
            width=3,
            shape='spline',
            smoothing=1.3
        ),
        fill='tozeroy',
        # Converts the "#rrggbb" primary colour into an "rgba(r, g, b, 0.2)" string
        fillcolor=f'rgba{tuple(int(primary_color.lstrip("#")[i:i+2], 16) for i in (0, 2, 4)) + (0.2,)}',
        name='WER by Sample',
        hovertemplate='<span style="font-family: \'Segoe UI\', sans-serif;"><b>%{x}</b><br>WER: %{y:.4f}</span><extra></extra>'
    ),
    row=2, col=2
)

# Add markers on top of the line
fig.add_trace(
    go.Scatter(
        x=sorted_df['Voice_Number'],
        y=sorted_df['WER'],
        mode='markers',
        marker=dict(
            size=6,
            color=primary_color,
            line=dict(color='white', width=1)
        ),
        showlegend=False,
        hoverinfo='skip'
    ),
    row=2, col=2
)

# 5. Heatmap of WER category by transcription-length category (row 3, spanning both columns).
# normalize='index' makes each row (length category) sum to 1, so every cell is the
# share of that length category falling into the given WER category.
heatmap_data = pd.crosstab(
    merged_df['Length_Category'], 
    merged_df['WER_Category'], 
    normalize='index'
)

fig.add_trace(
    go.Heatmap(
        z=heatmap_data.values,
        x=heatmap_data.columns,
        y=heatmap_data.index,
        colorscale=[
            [0, success_color],
            [0.5, warning_color],
            [1, secondary_color]
        ],
        # Cell labels: the same proportions, rounded to two decimals
        text=np.around(heatmap_data.values, 2),
        texttemplate="%{text:.2f}",
        textfont=dict(family="Segoe UI, sans-serif", size=12, color='white'),
        hovertemplate='<span style="font-family: \'Segoe UI\', sans-serif;"><b>Length: %{y}</b><br>WER Category: %{x}<br>Proportion: %{z:.2f}</span><extra></extra>'
    ),
    row=3, col=1
)

# Mark the average WER on the four panels in rows 1-2.
# `avg_wer` is a percentage in this cell while the plotted WER values are fractions,
# hence the division by 100.
mean_wer_fraction = avg_wer / 100
avg_line_style = dict(color=secondary_color, width=2, dash="dot")
avg_label_style = dict(
    text=f"Avg WER: {avg_wer:.2f}%",
    showarrow=False,
    font=dict(family="Segoe UI, sans-serif", color=secondary_color, size=12),
    bgcolor="rgba(255, 255, 255, 0.9)",
    bordercolor=secondary_color,
    borderwidth=1,
    borderpad=4,
    opacity=0.9,
    align="center"
)

for i in range(1, 3):
    for j in range(1, 3):
        # Plotly names the first axis pair "x"/"y" and the rest "x2", "x3", ... in row-major order
        xref = "x" if i == 1 and j == 1 else f"x{(i-1)*2+j}"
        yref = "y" if i == 1 and j == 1 else f"y{(i-1)*2+j}"

        if i == 1 and j == 1:
            # Histogram: WER is on the x axis (y is a count), so the average is a
            # vertical line spanning the full height of the panel.
            fig.add_shape(
                type="line",
                x0=mean_wer_fraction,
                y0=0,
                x1=mean_wer_fraction,
                y1=1,
                xref=xref,
                yref=f"{yref} domain",
                line=avg_line_style,
            )
            fig.add_annotation(
                x=mean_wer_fraction,
                y=0.95,
                xref=xref,
                yref=f"{yref} domain",
                xanchor="left",
                **avg_label_style
            )
        else:
            # Other panels: WER is on the y axis, so draw a horizontal line across the panel.
            fig.add_shape(
                type="line",
                x0=0,
                y0=mean_wer_fraction,
                x1=1,
                y1=mean_wer_fraction,
                xref=f"{xref} domain",
                yref=yref,
                line=avg_line_style,
            )
            # The label's x is a fraction of the panel width, so it is referenced to the
            # axis *domain* (it was previously read as a data coordinate).
            fig.add_annotation(
                x=0.95 if j == 1 else 0.05,
                y=mean_wer_fraction,
                xref=f"{xref} domain",
                yref=yref,
                xanchor="right" if j == 1 else "left",
                **avg_label_style
            )

# Summary banner text shown above the plots (avg_wer is already a percentage in this
# cell; the other statistics are fractions and are scaled by 100 here)
stats_text = (
    f"<span style='font-family: \"Segoe UI\", sans-serif; font-weight: 300; color: {text_color};'>"
    f"<b>WER Analysis Summary</b><br>"
    f"Sample Count: <b>{len(merged_df)}</b> | "
    f"Average WER: <b style='color: {secondary_color}'>{avg_wer:.2f}%</b> | "
    f"Median WER: <b>{np.median(merged_df['WER'])*100:.2f}%</b> | "
    f"Min WER: <b style='color: {success_color}'>{merged_df['WER'].min()*100:.2f}%</b> | "
    f"Max WER: <b style='color: #ff0000'>{merged_df['WER'].max()*100:.2f}%</b> | "
    f"Std Dev: <b>{np.std(merged_df['WER'])*100:.2f}%</b>"
    f"</span>"
)

# Update layout with futuristic styling
fig.update_layout(
    title={
        'text': "<span style='font-family: \"Segoe UI\", Arial, sans-serif; font-weight: 200; letter-spacing: 1px;'>"
                "WHISPER TRANSCRIPTION ANALYSIS DASHBOARD</span>",
        'y':0.98,
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'top',
        'font': {'size': 28, 'color': text_color}
    },
    height=1000,
    width=1200,
    paper_bgcolor=bg_color,
    plot_bgcolor=bg_color,
    showlegend=False,
    margin=dict(t=120, b=50, l=50, r=50),
    font=dict(
        family="Segoe UI, Arial, sans-serif",
        size=12,
        color=text_color
    )
)

# Append the banner with add_annotation. Passing annotations=[...] to update_layout
# replaces the figure's existing annotation list, which holds the subplot titles created
# by make_subplots and any labels added earlier with add_annotation.
fig.add_annotation(
    text=stats_text,
    x=0.5,
    y=1.02,
    xref="paper",
    yref="paper",
    showarrow=False,
    font=dict(size=14),
    bgcolor="rgba(255, 255, 255, 0.9)",
    bordercolor=primary_color,
    borderwidth=1,
    borderpad=8,
    xanchor="center",
    align="center"
)

# Apply one shared style to every x and y axis in the figure.
# Calling update_xaxes/update_yaxes without row/col targets all axes, which covers the
# same five populated subplots the previous per-cell loop visited (the sixth grid cell
# has no axes). The loop also computed subplot_idx/xref values that were never used.
axis_style = dict(
    showgrid=True,
    gridcolor='#f0f0f0',
    zeroline=False,
    showline=True,
    linecolor='#e0e0e0',
    linewidth=1,
    tickfont=dict(family="Segoe UI, sans-serif", size=10),
    title_font=dict(family="Segoe UI, sans-serif", size=12, color=text_color),
)
fig.update_xaxes(**axis_style)
fig.update_yaxes(**axis_style)

# Axis captions, one entry per subplot: (row, col, x-axis title, y-axis title).
axis_captions = [
    (1, 1, "WER", "Frequency"),
    (1, 2, "Transcription Length (words)", "WER"),
    (2, 1, "Voice Sample", "WER"),
    (2, 2, "Voice Sample", "WER"),
    (3, 1, "WER Category", "Transcription Length"),
]

# Every caption is wrapped in the same light-weight "Segoe UI" span.
caption_markup = "<span style='font-family: \"Segoe UI\", sans-serif; font-weight: 300;'>{}</span>"

for row_no, col_no, x_caption, y_caption in axis_captions:
    fig.update_xaxes(title_text=caption_markup.format(x_caption), row=row_no, col=col_no)
    fig.update_yaxes(title_text=caption_markup.format(y_caption), row=row_no, col=col_no)

# Range slider under the "WER by Voice Sample" plot (row 2, col 2)
slider_settings = dict(
    visible=True,
    thickness=0.05,
    bgcolor='#f8f9fa',
    bordercolor='#e0e0e0'
)
fig.update_xaxes(rangeslider=slider_settings, row=2, col=2)

# Decorative shapes; everything is positioned in paper coordinates so it does
# not depend on any subplot's data range.

# Faint tint behind the title strip: primary colour at 5% alpha.
title_hex = primary_color.lstrip('#')
tint_r, tint_g, tint_b = (int(title_hex[pos:pos + 2], 16) for pos in (0, 2, 4))
fig.add_shape(
    type="rect",
    x0=0,
    y0=0.95,
    x1=1,
    y1=1,
    xref="paper",
    yref="paper",
    fillcolor=f"rgba({tint_r}, {tint_g}, {tint_b}, 0.05)",
    line_width=0,
    layer="below"
)

# Small triangular accent in each corner of the figure. Each triangle runs
# 0.05 paper units inward along both edges from its corner.
inward_step = {0: 0.05, 1: -0.05}
for corner_y in (0, 1):
    for corner_x in (0, 1):
        step_x = inward_step[corner_x]
        step_y = inward_step[corner_y]
        fig.add_shape(
            type="path",
            path=f"M {corner_x} {corner_y} L {corner_x + step_x} {corner_y} L {corner_x} {corner_y + step_y} Z",
            xref="paper",
            yref="paper",
            fillcolor=primary_color,
            line_width=0,
            opacity=0.3,
            layer="below"
        )

# Save as HTML file with additional configuration.
# `config` is passed to fig.to_html(): it shows the mode bar, adds drawing
# tools and sets the defaults for the "download as PNG" button.
config = {
    'displayModeBar': True,
    'modeBarButtonsToAdd': ['drawline', 'drawopenpath', 'eraseshape'],
    'toImageButtonOptions': {
        'format': 'png',
        'filename': 'futuristic_wer_dashboard',
        'height': 1000,
        'width': 1200,
        'scale': 2
    },
    'responsive': True
}

# Add custom CSS for a more futuristic look
custom_css = """
<style>
    .js-plotly-plot {
        font-family: 'Segoe UI', Arial, sans-serif !important;
    }
    .js-plotly-plot .plotly .modebar {
        background-color: rgba(255, 255, 255, 0.8) !important;
        border-radius: 20px !important;
        box-shadow: 0 4px 20px rgba(0, 0, 0, 0.1) !important;
    }
    .js-plotly-plot .plotly .modebar-btn {
        color: #3a86ff !important;
    }
    .js-plotly-plot .plotly .modebar-btn:hover {
        color: #ff006e !important;
    }
    .js-plotly-plot .plotly .main-svg {
        border-radius: 10px !important;
        box-shadow: 0 10px 30px rgba(0, 0, 0, 0.05) !important;
    }
    .js-plotly-plot .plotly .subplot {
        transition: all 0.3s ease !important;
    }
    .js-plotly-plot .plotly .subplot:hover {
        filter: brightness(1.02) !important;
    }
    /* Add futuristic glow effects */
    .js-plotly-plot .plotly .main-svg:hover {
        box-shadow: 0 15px 35px rgba(58, 134, 255, 0.1) !important;
    }
    /* Add subtle animation */
    @keyframes pulse {
        0% { opacity: 0.9; }
        50% { opacity: 1; }
        100% { opacity: 0.9; }
    }
    .js-plotly-plot .plotly .annotation-text {
        animation: pulse 3s infinite ease-in-out;
    }
</style>
"""

# Render the complete page first so the stylesheet can be placed inside it.
dashboard_html = fig.to_html(
    config=config,
    include_plotlyjs='cdn',
    include_mathjax='cdn',
    full_html=True
)

# Writing <style> ahead of a full HTML document puts content before <html>,
# which is malformed markup. Insert it just before </head> instead; if the
# rendered page unexpectedly has no head, fall back to prepending it.
if "</head>" in dashboard_html:
    dashboard_html = dashboard_html.replace("</head>", custom_css + "</head>", 1)
else:
    dashboard_html = custom_css + dashboard_html

# Explicit utf-8 so the file does not depend on the platform default encoding.
with open('futuristic_wer_dashboard.html', 'w', encoding='utf-8') as f:
    f.write(dashboard_html)

print("Futuristic modern light-themed dashboard saved as futuristic_wer_dashboard.html")

# Display the figure
fig.show()

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots
# Import seaborn without the warning
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
import seaborn as sns

# Synthetic demo data.
# NOTE: this rebinds `merged_df` and `avg_wer`. If an earlier cell stored real
# transcription results under those names, they are replaced by the random
# sample below for the rest of the session.
np.random.seed(42)
sample_size = 100

# Draw order matters for reproducibility: WER values first, then lengths.
wer_values = np.random.beta(2, 5, sample_size)  # Beta(2, 5) draws lie in [0, 1]
transcription_lengths = np.random.randint(10, 200, sample_size)
voice_numbers = ["Voice_" + str(n) for n in range(1, sample_size + 1)]
avg_wer = wer_values.mean() * 100  # stored as a percentage

merged_df = pd.DataFrame({
    'Voice_Number': voice_numbers,
    'WER': wer_values,
    'Transcription_original': [
        f"Sample text {idx}" * (n_words // 10)
        for idx, n_words in enumerate(transcription_lengths)
    ],
    'Original_Length': transcription_lengths
})

# Quintile buckets: source column -> (new column, labels from lowest to highest)
quintile_columns = {
    'Original_Length': ('Length_Category', ['Very Short', 'Short', 'Medium', 'Long', 'Very Long']),
    'WER': ('WER_Category', ['Excellent', 'Good', 'Average', 'Poor', 'Very Poor']),
}
for source_col, (bucket_col, bucket_labels) in quintile_columns.items():
    merged_df[bucket_col] = pd.qcut(merged_df[source_col], q=5, labels=bucket_labels)

# The ten samples with the highest WER, worst first
top_wer = merged_df.sort_values('WER', ascending=False).iloc[:10]

# Futuristic color palette
primary_color, secondary_color = '#3a86ff', '#ff006e'  # Vibrant blue, neon pink
accent_color = '#8338ec'   # Purple
success_color = '#06d6a0'  # Teal
warning_color = '#ffbe0b'  # Amber

# Modern light background
bg_color, grid_color, text_color = '#ffffff', '#f8f9fa', '#212529'

# Dashboard grid: 3 rows x 2 columns, with the heatmap spanning the full
# width of the bottom row (hence the trailing None in its spec row).
panel_specs = [
    [{"type": "histogram"}, {"type": "scatter"}],
    [{"type": "bar"}, {"type": "scatter"}],
    [{"colspan": 2, "type": "heatmap"}, None],
]

# Panel headings in subplot order; each one gets the same styled <span>.
panel_heading_markup = '<span style="font-family: \'Segoe UI\', Arial, sans-serif; font-weight: 300;">{}</span>'
panel_headings = (
    'WER Distribution',
    'WER vs Transcription Length',
    'Top 10 Highest WER Samples',
    'WER by Voice Sample',
    'WER Performance Matrix',
)

fig = make_subplots(
    rows=3, cols=2,
    specs=panel_specs,
    subplot_titles=tuple(panel_heading_markup.format(heading) for heading in panel_headings),
    vertical_spacing=0.12,
    horizontal_spacing=0.08
)

# 1. Futuristic WER distribution histogram with gradient
from scipy import stats

wer_series = merged_df['WER']
wer_count = wer_series.count()
wer_max = wer_series.max()

# Fix the bin width explicitly. The density curve below has to be multiplied
# by (number of samples x bin width) to share the histogram's count axis, and
# that is only possible when the bin width is known on the Python side.
hist_bin_count = 20
hist_bin_width = wer_max / hist_bin_count if wer_max > 0 else 1.0

# NOTE(review): marker.color is given one value per sample, while a histogram
# draws one bar per bin -- confirm the gradient renders as intended.
hist_data = go.Histogram(
    x=wer_series,
    xbins=dict(start=0, end=wer_max + hist_bin_width, size=hist_bin_width),
    name='WER Distribution',
    marker=dict(
        color=wer_series,
        colorscale=[[0, primary_color], [1, secondary_color]],
        opacity=0.8,
        line=dict(color='white', width=0.5)
    ),
    hovertemplate='<span style="font-family: \'Segoe UI\', sans-serif;">WER: %{x:.4f}<br>Count: %{y}</span><extra></extra>'
)
fig.add_trace(hist_data, row=1, col=1)

# Add KDE curve - calculate manually to avoid seaborn warning.
# gaussian_kde cannot be fitted to fewer than two distinct values, so the
# overlay is skipped for degenerate data instead of raising.
if wer_series.nunique() > 1:
    kde_x = np.linspace(0, wer_max, 100)
    kde_y = stats.gaussian_kde(wer_series)(kde_x)
    fig.add_trace(
        go.Scatter(
            x=kde_x,
            # density -> expected count per bin
            y=kde_y * wer_count * hist_bin_width,
            mode='lines',
            line=dict(
                color=secondary_color,
                width=3,
                shape='spline',
                smoothing=1.3
            ),
            name='Density',
            hoverinfo='skip'
        ),
        row=1, col=1
    )

# 2. Modern scatter plot for WER vs transcription length
fig.add_trace(
    go.Scatter(
        x=merged_df['Original_Length'],
        y=merged_df['WER'],
        mode='markers',
        marker=dict(
            size=10,
            color=merged_df['WER'],
            colorscale=[
                [0, success_color],
                [0.5, warning_color],
                [1, secondary_color]
            ],
            showscale=True,
            colorbar=dict(
                # The title text and its font are grouped under `title`; the
                # flat `titlefont` attribute is deprecated and rejected by
                # newer plotly releases.
                title=dict(
                    text="<span style='font-family: \"Segoe UI\", sans-serif; font-weight: 300;'>WER</span>",
                    font=dict(family="Segoe UI, sans-serif")
                ),
                thickness=15,
                len=0.5,
                y=0.8,
                yanchor="top",
                tickfont=dict(family="Segoe UI, sans-serif")
            ),
            opacity=0.8,
            symbol='circle'
        ),
        name='WER vs Length',
        text=merged_df['Voice_Number'],
        hovertemplate='<span style="font-family: \'Segoe UI\', sans-serif; font-weight: 300;"><b>%{text}</b><br>Length: %{x} words<br>WER: %{y:.4f}</span><extra></extra>'
    ),
    row=1, col=2
)

# Add trend line: degree-1 least-squares fit of WER against length,
# evaluated over the observed length range.
z = np.polyfit(merged_df['Original_Length'], merged_df['WER'], 1)
p = np.poly1d(z)
x_range = np.linspace(merged_df['Original_Length'].min(), merged_df['Original_Length'].max(), 100)
fig.add_trace(
    go.Scatter(
        x=x_range,
        y=p(x_range),
        mode='lines',
        line=dict(
            color=accent_color,
            width=3,
            dash='solid'
        ),
        name='Trend',
        hoverinfo='skip'
    ),
    row=1, col=2
)

# 3. Bar chart of the ten samples with the highest WER.
# Bars are coloured by their own WER on an amber -> pink -> red ramp.
worst_bar_marker = dict(
    color=top_wer['WER'],
    colorscale=[
        [0, warning_color],
        [0.5, secondary_color],
        [1, '#ff0000']  # Bright red for highest values
    ],
    line=dict(color='white', width=0.5),
    opacity=0.9
)
worst_bar_hover = '<span style="font-family: \'Segoe UI\', sans-serif;"><b>%{x}</b><br>WER: %{y:.4f}</span><extra></extra>'

worst_bars = go.Bar(
    x=top_wer['Voice_Number'],
    y=top_wer['WER'],
    marker=worst_bar_marker,
    name='Top 10 Highest WER',
    hovertemplate=worst_bar_hover
)
fig.add_trace(worst_bars, row=2, col=1)

# 4. Modern line chart for WER by sample with area fill.
# Order samples by the number embedded in their label. A plain string sort
# would give Voice_1, Voice_10, Voice_100, Voice_11, ... and scramble the
# x-axis. Labels without any digits sort last, alphabetically.
voice_order = pd.to_numeric(
    merged_df['Voice_Number'].astype(str).str.extract(r'(\d+)', expand=False),
    errors='coerce'
)
sorted_df = (
    merged_df.assign(_voice_order=voice_order)
    .sort_values(['_voice_order', 'Voice_Number'])
    .drop(columns='_voice_order')
)

# Area fill: primary colour at 20% alpha
fill_hex = primary_color.lstrip("#")
fill_r, fill_g, fill_b = (int(fill_hex[pos:pos + 2], 16) for pos in (0, 2, 4))

fig.add_trace(
    go.Scatter(
        x=sorted_df['Voice_Number'],
        y=sorted_df['WER'],
        mode='lines',
        line=dict(
            color=primary_color,
            width=3,
            shape='spline',
            smoothing=1.3
        ),
        fill='tozeroy',
        fillcolor=f'rgba({fill_r}, {fill_g}, {fill_b}, 0.2)',
        name='WER by Sample',
        hovertemplate='<span style="font-family: \'Segoe UI\', sans-serif;"><b>%{x}</b><br>WER: %{y:.4f}</span><extra></extra>'
    ),
    row=2, col=2
)

# Add markers on top of the line (hover is handled by the line trace)
fig.add_trace(
    go.Scatter(
        x=sorted_df['Voice_Number'],
        y=sorted_df['WER'],
        mode='markers',
        marker=dict(
            size=6,
            color=primary_color,
            line=dict(color='white', width=1)
        ),
        showlegend=False,
        hoverinfo='skip'
    ),
    row=2, col=2
)

# 5. Heatmap of WER category against transcription-length category.
# Rows are length categories and columns are WER categories; with
# normalize='index' each cell is the share of its row's samples.
heatmap_data = pd.crosstab(
    index=merged_df['Length_Category'],
    columns=merged_df['WER_Category'],
    normalize='index'
)
proportion_grid = heatmap_data.values

matrix_hover = '<span style="font-family: \'Segoe UI\', sans-serif;"><b>Length: %{y}</b><br>WER Category: %{x}<br>Proportion: %{z:.2f}</span><extra></extra>'
matrix_trace = go.Heatmap(
    z=proportion_grid,
    x=heatmap_data.columns,
    y=heatmap_data.index,
    colorscale=[
        [0, success_color],
        [0.5, warning_color],
        [1, secondary_color]
    ],
    # Cell labels: the same proportions rounded to two decimals
    text=np.around(proportion_grid, 2),
    texttemplate="%{text:.2f}",
    textfont=dict(family="Segoe UI, sans-serif", size=12, color='white'),
    hovertemplate=matrix_hover
)
fig.add_trace(matrix_trace, row=3, col=1)

# Mark the average WER on the four plots in the top two rows.
# - On the histogram (row 1, col 1) WER is on the x-axis, so the marker is a
#   vertical line; a horizontal line there would sit on the frequency axis.
# - On the other three plots WER is on the y-axis, so the marker is horizontal.
# Positions along the non-WER direction are fractions of the subplot
# ("<axis> domain" references), for the labels as well as for the lines.
avg_wer_value = avg_wer / 100  # avg_wer is a percentage; the axes show raw WER
avg_line_style = dict(color=secondary_color, width=2, dash="dot")
avg_label_style = dict(
    text=f"Avg WER: {avg_wer:.2f}%",
    showarrow=False,
    font=dict(family="Segoe UI, sans-serif", color=secondary_color, size=12),
    bgcolor="rgba(255, 255, 255, 0.9)",
    bordercolor=secondary_color,
    borderwidth=1,
    borderpad=4,
    opacity=0.9,
    align="center"
)

for i in range(1, 3):
    for j in range(1, 3):
        # Axis ids are "x"/"y" for the first subplot and "x2"/"y2", ... after
        axis_suffix = "" if i == 1 and j == 1 else str((i-1)*2+j)
        xref = f"x{axis_suffix}"
        yref = f"y{axis_suffix}"

        if i == 1 and j == 1:
            # Histogram: vertical marker at the average WER, full plot height
            fig.add_shape(
                type="line",
                x0=avg_wer_value,
                y0=0,
                x1=avg_wer_value,
                y1=1,
                xref=xref,
                yref=f"{yref} domain",
                line=avg_line_style,
            )
            fig.add_annotation(
                x=avg_wer_value,
                y=0.95,
                xref=xref,
                yref=f"{yref} domain",
                xanchor="left",
                **avg_label_style
            )
        else:
            # WER on the y-axis: horizontal marker across the full plot width
            fig.add_shape(
                type="line",
                x0=0,
                y0=avg_wer_value,
                x1=1,
                y1=avg_wer_value,
                xref=f"{xref} domain",
                yref=yref,
                line=avg_line_style,
            )
            # Label hugs the right edge in column 1 and the left edge in column 2
            fig.add_annotation(
                x=0.95 if j == 1 else 0.05,
                y=avg_wer_value,
                xref=f"{xref} domain",
                yref=yref,
                xanchor="right" if j == 1 else "left",
                **avg_label_style
            )

# Summary statistics banner (HTML markup rendered by plotly).
# The individual figures are collected first and then joined with " | ".
wer_column = merged_df['WER']
summary_items = [
    f"Sample Count: <b>{len(merged_df)}</b>",
    f"Average WER: <b style='color: {secondary_color}'>{avg_wer:.2f}%</b>",
    f"Median WER: <b>{np.median(wer_column)*100:.2f}%</b>",
    f"Min WER: <b style='color: {success_color}'>{wer_column.min()*100:.2f}%</b>",
    f"Max WER: <b style='color: #ff0000'>{wer_column.max()*100:.2f}%</b>",
    f"Std Dev: <b>{np.std(wer_column)*100:.2f}%</b>",
]
stats_text = (
    f"<span style='font-family: \"Segoe UI\", sans-serif; font-weight: 300; color: {text_color};'>"
    "<b>WER Analysis Summary</b><br>"
    + " | ".join(summary_items)
    + "</span>"
)

# Update layout with futuristic styling.
# The summary box is intentionally NOT passed as `annotations=[...]` here:
# handing an annotations list to update_layout() rewrites the annotations the
# figure already holds (make_subplots stores the subplot titles as
# annotations, and the "Avg WER" labels are annotations too) instead of
# appending to them.
fig.update_layout(
    title={
        'text': "<span style='font-family: \"Segoe UI\", Arial, sans-serif; font-weight: 200; letter-spacing: 1px;'>"
                "WHISPER TRANSCRIPTION ANALYSIS DASHBOARD</span>",
        'y':0.98,
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'top',
        'font': {'size': 28, 'color': text_color}
    },
    height=1000,
    width=1200,
    paper_bgcolor=bg_color,
    plot_bgcolor=bg_color,
    showlegend=False,
    margin=dict(t=120, b=50, l=50, r=50),
    font=dict(
        family="Segoe UI, Arial, sans-serif",
        size=12,
        color=text_color
    )
)

# Append the summary statistics box (paper coordinates, centred above the
# plots) while leaving every existing annotation untouched.
fig.add_annotation(
    text=stats_text,
    x=0.5,
    y=1.02,
    xref="paper",
    yref="paper",
    showarrow=False,
    font=dict(size=14),
    bgcolor="rgba(255, 255, 255, 0.9)",
    bordercolor=primary_color,
    borderwidth=1,
    borderpad=8,
    xanchor="center",
    align="center"
)

# Update axes with modern styling.
# The figure's grid holds five subplots and all of them get the same axis
# style, so one figure-wide call per direction replaces the per-cell loop
# (update_xaxes / update_yaxes without row/col target every axis).
axis_style = dict(
    showgrid=True,
    gridcolor='#f0f0f0',
    zeroline=False,
    showline=True,
    linecolor='#e0e0e0',
    linewidth=1,
    tickfont=dict(family="Segoe UI, sans-serif", size=10),
    title_font=dict(family="Segoe UI, sans-serif", size=12, color=text_color),
)
fig.update_xaxes(**axis_style)
fig.update_yaxes(**axis_style)

# Axis captions, one entry per subplot: (row, col, x-axis title, y-axis title).
axis_captions = [
    (1, 1, "WER", "Frequency"),
    (1, 2, "Transcription Length (words)", "WER"),
    (2, 1, "Voice Sample", "WER"),
    (2, 2, "Voice Sample", "WER"),
    (3, 1, "WER Category", "Transcription Length"),
]

# Every caption is wrapped in the same light-weight "Segoe UI" span.
caption_markup = "<span style='font-family: \"Segoe UI\", sans-serif; font-weight: 300;'>{}</span>"

for row_no, col_no, x_caption, y_caption in axis_captions:
    fig.update_xaxes(title_text=caption_markup.format(x_caption), row=row_no, col=col_no)
    fig.update_yaxes(title_text=caption_markup.format(y_caption), row=row_no, col=col_no)

# Range slider under the "WER by Voice Sample" plot (row 2, col 2)
slider_settings = dict(
    visible=True,
    thickness=0.05,
    bgcolor='#f8f9fa',
    bordercolor='#e0e0e0'
)
fig.update_xaxes(rangeslider=slider_settings, row=2, col=2)

# Decorative shapes; everything is positioned in paper coordinates so it does
# not depend on any subplot's data range.

# Faint tint behind the title strip: primary colour at 5% alpha.
title_hex = primary_color.lstrip('#')
tint_r, tint_g, tint_b = (int(title_hex[pos:pos + 2], 16) for pos in (0, 2, 4))
fig.add_shape(
    type="rect",
    x0=0,
    y0=0.95,
    x1=1,
    y1=1,
    xref="paper",
    yref="paper",
    fillcolor=f"rgba({tint_r}, {tint_g}, {tint_b}, 0.05)",
    line_width=0,
    layer="below"
)

# Small triangular accent in each corner of the figure. Each triangle runs
# 0.05 paper units inward along both edges from its corner.
inward_step = {0: 0.05, 1: -0.05}
for corner_y in (0, 1):
    for corner_x in (0, 1):
        step_x = inward_step[corner_x]
        step_y = inward_step[corner_y]
        fig.add_shape(
            type="path",
            path=f"M {corner_x} {corner_y} L {corner_x + step_x} {corner_y} L {corner_x} {corner_y + step_y} Z",
            xref="paper",
            yref="paper",
            fillcolor=primary_color,
            line_width=0,
            opacity=0.3,
            layer="below"
        )

# Save as HTML file with additional configuration.
# `config` is passed to fig.to_html(): it shows the mode bar, adds drawing
# tools and sets the defaults for the "download as PNG" button.
config = {
    'displayModeBar': True,
    'modeBarButtonsToAdd': ['drawline', 'drawopenpath', 'eraseshape'],
    'toImageButtonOptions': {
        'format': 'png',
        'filename': 'futuristic_wer_dashboard',
        'height': 1000,
        'width': 1200,
        'scale': 2
    },
    'responsive': True
}

# Add custom CSS for a more futuristic look.
# The hard-coded #3a86ff / #ff006e are the same values as primary_color and
# secondary_color in the palette defined earlier in this cell.
custom_css = """
<style>
    .js-plotly-plot {
        font-family: 'Segoe UI', Arial, sans-serif !important;
    }
    .js-plotly-plot .plotly .modebar {
        background-color: rgba(255, 255, 255, 0.8) !important;
        border-radius: 20px !important;
        box-shadow: 0 4px 20px rgba(0, 0, 0, 0.1) !important;
    }
    .js-plotly-plot .plotly .modebar-btn {
        color: #3a86ff !important;
    }
    .js-plotly-plot .plotly .modebar-btn:hover {
        color: #ff006e !important;
    }
    .js-plotly-plot .plotly .main-svg {
        border-radius: 10px !important;
        box-shadow: 0 10px 30px rgba(0, 0, 0, 0.05) !important;
    }
    .js-plotly-plot .plotly .subplot {
        transition: all 0.3s ease !important;
    }
    .js-plotly-plot .plotly .subplot:hover {
        filter: brightness(1.02) !important;
    }
    /* Add futuristic glow effects */
    .js-plotly-plot .plotly .main-svg:hover {
        box-shadow: 0 15px 35px rgba(58, 134, 255, 0.1) !important;
    }
    /* Add subtle animation */
    @keyframes pulse {
        0% { opacity: 0.9; }
        50% { opacity: 1; }
        100% { opacity: 0.9; }
    }
    .js-plotly-plot .plotly .annotation-text {
        animation: pulse 3s infinite ease-in-out;
    }
</style>
"""

# Render the complete page first so the stylesheet can be placed inside it.
dashboard_html = fig.to_html(
    config=config,
    include_plotlyjs='cdn',
    include_mathjax='cdn',
    full_html=True
)

# Writing <style> ahead of a full HTML document puts content before <html>,
# which is malformed markup. Insert it just before </head> instead; if the
# rendered page unexpectedly has no head, fall back to prepending it.
if "</head>" in dashboard_html:
    dashboard_html = dashboard_html.replace("</head>", custom_css + "</head>", 1)
else:
    dashboard_html = custom_css + dashboard_html

# Explicit utf-8 so the file does not depend on the platform default encoding.
with open('futuristic_wer_dashboard.html', 'w', encoding='utf-8') as f:
    f.write(dashboard_html)

print("Futuristic modern light-themed dashboard saved as futuristic_wer_dashboard.html")

# Display the figure
fig.show()

# Krithika's Dataset

## OpenAI Whisper Model

## ENGLISH TO TAMIL CONVERSION