# Import Required Libraries

In [46]:
import altair as alt
import pandas as pd
import numpy as np

# Language Coverage per Task (Static)

In [2]:


# Coverage table: the languages listed for each task.
# Dict insertion order (and the order inside each list) fixes the row order of `data`.
_task_languages = {
    "Speech-to-Text": ["Hindi", "Bengali", "English"],
    "Speaker Identification": ["Hindi", "Bengali", "English"],
    "Sentiment Analysis": ["Spanish", "English"],
    "Offensive Language": ["Hindi", "Bengali", "English"],
    "NER": ["Spanish", "English", "Hindi", "Modern Standard Arabic", "Egyptian Arabic"],
    "Language ID": [
        "Spanish", "English", "Hindi", "Nepali",
        "Modern Standard Arabic", "Egyptian Arabic",
    ],
}

# Flatten to (Task, Language, Covered) triples. Only covered pairs are listed,
# so every row carries Covered=True.
data = [
    (task, language, True)
    for task, languages in _task_languages.items()
    for language in languages
]

df = pd.DataFrame(data, columns=["Task", "Language", "Covered"])

# Heatmap layer: one rectangle per (Language, Task) row.
# The gray branch of the condition is never hit because every row is covered.
chart = (
    alt.Chart(df)
    .mark_rect()
    .encode(
        x=alt.X('Language:N', title='Language'),
        y=alt.Y('Task:N', title='NLP Task'),
        color=alt.condition(
            alt.datum.Covered,
            alt.value('mediumseagreen'),
            alt.value('lightgray'),
        ),
        tooltip=['Task', 'Language'],
    )
    .properties(
        width=500,
        height=300,
        title="Code-Mixed Language Coverage Across NLP Tasks",
    )
)

# Check-mark layer: a white "✓" centred on every covered cell.
text = (
    alt.Chart(df)
    .mark_text(align='center', baseline='middle', fontSize=15, color='white')
    .encode(x='Language:N', y='Task:N', text=alt.value('✓'))
    .transform_filter(alt.datum.Covered)
)

# Overlay the ticks on the heatmap; as the last expression this is what the cell displays.
chart + text


# Speech-to-Text and Speaker Identification

In [75]:
# Relative paths to the Hindi-English and Bengali-English Speech-to-Text test CSVs
# (`gt` presumably stands for ground truth — confirm). They are read with pd.read_csv below.
hin_eng_gt_file = "../data/Speech-to-Text/Hindi-English_test_test.csv"
ben_eng_gt_file = "../data/Speech-to-Text/Bengali-English_test_test.csv"

In [76]:
# Read both Speech-to-Text CSVs: Hindi-English first, then Bengali-English.
hin_eng_df, ben_eng_df = (
    pd.read_csv(csv_path) for csv_path in (hin_eng_gt_file, ben_eng_gt_file)
)

In [77]:
hin_eng_df.head()

Unnamed: 0,speaker_id,utt_id,file_id,start,end,transript,file_path,chunk_path,Whisper-Base,Whisper-Medium,Whisper-Large-v2,Wav2Vec2-Large
0,103085,103085_w5Jyq3XMbb3WwiKQ_0000,w5Jyq3XMbb3WwiKQ,0.0,8.0,लिबर ऑफिस impress में एक प्रस्तुति document बन...,../../data/Speech-to-Text/Hindi-English_test/t...,../../data/Speech-to-Text/Hindi-English_test/t...,"Libber Office Imprasme, 1 Prasthuti Document ...",لیبر آفس ایمپریس میں ایک پرستوٹی ڈاکیومنٹ بنا...,لبر آفس ایمپریس میں ایک پرستوطی ڈاکیومنٹ بنان...,LIBERA OFFICE IMPRESS ME EPUSTODY DOCUMENT BAN...
1,103085,103085_w5Jyq3XMbb3WwiKQ_0001,w5Jyq3XMbb3WwiKQ,8.0,21.0,इस tutorial में हम impress window के भागों के ...,../../data/Speech-to-Text/Hindi-English_test/t...,../../data/Speech-to-Text/Hindi-English_test/t...,आपान्ता भान्ता भान्ता भान्ता भान्ता भान्ता भा...,हम इंप्रैस विंडो के भागो के बारे में सीखेंगे ...,इस टीटूरल में हम इंप्रेस विंडो के भागो के बार...,OTTE ISTIDEL ME HUM IMPRESS VINDUGIB HAGOGIBAR...
2,103085,103085_w5Jyq3XMbb3WwiKQ_0002,w5Jyq3XMbb3WwiKQ,21.0,29.0,यहाँ हम अपने ऑपरेटिंग सिस्टम के रूप में gnu/li...,../../data/Speech-to-Text/Hindi-English_test/t...,../../data/Speech-to-Text/Hindi-English_test/t...,یہاں ہم اپنے اور پڑڑھڑھڑھڑھڑھڑھڑھڑھڑھڑھڑھڑھڑھ...,यहां हम अपने अपरेटिंग सिस्टम के रूप में जेनू ...,यहाँ हम अपने Operating System के रूप में JNU ...,YAHAM OBNE OR BREDDING SYSTEM GEROPE ME GENU L...
3,103085,103085_w5Jyq3XMbb3WwiKQ_0003,w5Jyq3XMbb3WwiKQ,29.0,35.0,चलिए अपनी प्रस्तुति प्रेजैटेशन sample impress ...,../../data/Speech-to-Text/Hindi-English_test/t...,../../data/Speech-to-Text/Hindi-English_test/t...,तो दो लो लो लो लो लो लो लो लो लो लो लो लो लो ...,चलिये अपनी प्रस्थुती सामपल इम्प्रस्थ अपन करते...,चल्ये अपनी प्रस्तुति सैंपल इम्प्लस ओपन करते ह...,JELLY ABNIPOSTUDI SAMPLE IMPLAS POPEN GARTINER...
4,103085,103085_w5Jyq3XMbb3WwiKQ_0004,w5Jyq3XMbb3WwiKQ,35.0,39.0,चलिए देखते हैं कि screen पर क्या क्या है,../../data/Speech-to-Text/Hindi-English_test/t...,../../data/Speech-to-Text/Hindi-English_test/t...,", we will see what is on the screen.",बनाया था।।।।।।।।।।।।।।।।।।।।।।।।।।।।।।।।।।।।।...,चलये देखते हैं कि स्क्रीन पर क्या-क्या है,EBONAIATA JELLY DE TANKEES GREENBERD KARKYAHER


In [88]:
def preprocess(df, label):
    """Return a copy of ``df`` with per-utterance helper columns appended.

    Added columns, in this order:
      * ``duration``   - ``end`` minus ``start``
      * ``word_count`` - number of whitespace-separated tokens in ``transript``
        (the column name is spelled this way in the data)
      * ``language``   - ``label`` repeated on every row

    The input frame is not modified.
    """
    enriched = df.assign(
        duration=df["end"] - df["start"],
        word_count=df["transript"].str.split().str.len(),
    )
    # Plain item assignment so `label` is stored as-is.
    enriched["language"] = label
    return enriched

# Add duration / word_count / language columns to both frames.
# Each name is rebound to the enriched copy that preprocess() returns.
hin_eng_df, ben_eng_df = (
    preprocess(frame, pair_label)
    for frame, pair_label in (
        (hin_eng_df, "Hindi-English"),
        (ben_eng_df, "Bengali-English"),
    )
)

# Per-speaker summary statistics, reshaped to long format for plotting
def speaker_summary_long(df):
    """Summarise utterances per speaker and return the result in long format.

    For each ``speaker_id`` this computes the number of ``utt_id`` values,
    the mean ``word_count`` and the mean ``duration``. The wide table is then
    melted so each output row is ``(speaker_id, Metric, Value)``, with
    ``Metric`` one of "Utterance Count", "Average Word Count" or
    "Average Duration".
    """
    metric_columns = ["Utterance Count", "Average Word Count", "Average Duration"]

    # Named aggregation yields the display column names directly.
    per_speaker = (
        df.groupby("speaker_id")
        .agg(**{
            "Utterance Count": ("utt_id", "count"),
            "Average Word Count": ("word_count", "mean"),
            "Average Duration": ("duration", "mean"),
        })
        .reset_index()
    )

    # Wide -> long: one row per (speaker, metric).
    return per_speaker.melt(
        id_vars="speaker_id",
        value_vars=metric_columns,
        var_name="Metric",
        value_name="Value",
    )

# Long-format per-speaker summaries for each language pair
hin_long, ben_long = (
    speaker_summary_long(frame) for frame in (hin_eng_df, ben_eng_df)
)

In [90]:
def plot_summary(data, title_text):
    """Build a bar chart of per-speaker metrics from a long-format frame.

    ``data`` needs the columns ``speaker_id``, ``Metric`` and ``Value``;
    ``title_text`` becomes the chart title. Returns a 600x300 Altair chart
    coloured by metric, with a horizontal legend below the plot.

    NOTE(review): each speaker has one row per metric, so the bars for a
    speaker should stack under Vega-Lite's default bar stacking. That makes
    bar height the sum of metrics with different units — confirm this is the
    intended reading.
    """
    metric_order = ["Utterance Count", "Average Word Count", "Average Duration"]

    # Legend sits under the chart, laid out horizontally with its title on top.
    bottom_legend = alt.Legend(
        orient="bottom",
        direction="horizontal",
        titleOrient="top",
        offset=10,
    )
    speaker_axis = alt.X(
        "speaker_id:N",
        title="Speaker ID",
        axis=alt.Axis(labelAngle=-45),
    )
    metric_colour = alt.Color(
        "Metric:N",
        title="Metric",
        scale=alt.Scale(domain=metric_order),
        legend=bottom_legend,
    )

    bars = alt.Chart(data).mark_bar().encode(
        x=speaker_axis,
        y=alt.Y("Value:Q"),
        color=metric_colour,
        tooltip=["speaker_id:N", "Metric:N", "Value:Q"],
    )
    return bars.properties(title=title_text, width=600, height=300)



# One chart per language pair, built with the shared helper
hin_plot = plot_summary(hin_long, "Hindi-English Speaker Stats")
ben_plot = plot_summary(ben_long, "Bengali-English Speaker Stats")

# Horizontal layout: Hindi-English on the left, Bengali-English on the right
final = alt.hconcat(hin_plot, ben_plot)

final

# Offensive Language Identification

In [83]:
# Load the offensive-language CSV and swap the short class codes
# in `labels` for readable names.
file_path = "../data/Offensive-Language-Identification/gt.csv"
offensive_df = pd.read_csv(file_path)
offensive_df['labels'] = offensive_df['labels'].replace(
    to_replace={'NOT': 'Not Offensive', 'OFF': 'Offensive'}
)

In [84]:
offensive_df.head()

Unnamed: 0,sentences,labels,processed_sentence,xlmr,mdeberta,labse,muril
0,विश्लेषण | डेविन Nunes explains ‘antifa’ এবং '...,Not Offensive,वशलषण डवन nunes explains antifa এব সমসত य दग ...,OFF,NOT,OFF,NOT
1,@USER Diversity बनाता है आप मजबूत,Not Offensive,diversity बनत ह आप मजबत,OFF,OFF,OFF,NOT
2,... आत्म-आक्रामक उदारवादियों का एक समूह in fan...,Offensive,आतमआकरमक उदरवदय क एक समह in fancy dresses mock...,OFF,NOT,OFF,NOT
3,@USER @USER को इससे उबरने की जरूरत है। काफ़ी ह...,Not Offensive,क इसस उबरन क जररत ह कफ हद तक 5 বছর ধর তন পখদর ...,OFF,OFF,OFF,NOT
4,@USER @USER GET OUT मेरा MENTIONS (i stand cor...,Not Offensive,get out मर mentions i stand corrected,OFF,NOT,OFF,NOT


In [85]:
# Rows per label, each label's share of the total (as a 0-1 fraction),
# and a ready-to-draw percentage string such as "12.3%".
label_counts = (
    offensive_df['labels'].value_counts()
    .rename_axis('label')
    .reset_index(name='count')
    .assign(percent=lambda t: t['count'] / t['count'].sum())
    .assign(percent_label=lambda t: (t['percent'] * 100).round(1).astype(str) + '%')
)

# Encodings shared by the arc layer and the text layer.
offensive_base = alt.Chart(label_counts).encode(
    theta=alt.Theta("count:Q", stack=True),
    color=alt.Color("label:N", title="Label", legend=alt.Legend(orient='bottom')),
    tooltip=[
        alt.Tooltip('label:N'),
        alt.Tooltip('count:Q'),
        alt.Tooltip('percent:Q', format='.1%'),
    ],
)

# Pie slices, with percentage labels placed outside them (radius 130 > outerRadius 100).
offensive_pie = offensive_base.mark_arc(outerRadius=100)
offensive_text = (
    offensive_base
    .mark_text(radius=130, size=14, color='black')
    .encode(text='percent_label:N')
)

offensive_chart = (offensive_pie + offensive_text).properties(
    title='Offensive Language Dataset Distribution',
    width=350,
    height=350,
)

# Sentiment Analysis

In [88]:
# Load the sentiment test CSV and capitalise the three class names in `labels`.
SA_file_path = "../data/Sentiment-Analysis/test.csv"
SA_df = pd.read_csv(SA_file_path)
SA_df['labels'] = SA_df['labels'].replace(
    to_replace={
        'positive': 'Positive',
        'negative': 'Negative',
        'neutral': 'Neutral',
    }
)

In [89]:
SA_df.head()

Unnamed: 0,sentences,labels,processed_sentence,xlmr,mdeberta,labse,muril
0,@brissamayen @sanluispotoyees estopp I blashhh...,Positive,estopp i blashhh lol jk but aww thanks haha x,negative,neutral,positive,positive
1,Qué mejor que pasar Valentine 's thirdwheelean...,Positive,qué mejor que pasar valentine s thirdwheeleand...,positive,neutral,neutral,positive
2,#FF y de super #fashionfabolous a mi colega @b...,Positive,y de super a mi colega el duo stylist de y mas,negative,neutral,neutral,positive
3,“ @AZUCENACIERCO : Look de hoy gracias a @Angi...,Positive,look de hoy gracias a me encanto flaca de ...,negative,positive,neutral,positive
4,RT @andyescalona : #brindando #celebrando #tod...,Positive,rt cc,negative,neutral,neutral,positive


In [90]:
# Calculate sentence length in words
# str() first so non-string cells (e.g. missing values) are still counted as text
SA_df['length'] = SA_df['processed_sentence'].map(
    lambda sentence: len(str(sentence).split())
)

In [91]:
# Per label: number of rows, mean `length`, share of the total (0-1 fraction)
# and a percentage string for the text layer.
label_stats = (
    SA_df.groupby('labels')
    .agg(
        count=('labels', 'count'),
        avg_len=('length', 'mean'),
    )
    .reset_index()
    .assign(percent=lambda t: t['count'] / t['count'].sum())
    .assign(percent_label=lambda t: (t['percent'] * 100).round(1).astype(str) + '%')
)

# Encodings shared by the arc layer and the text layer.
sentiment_base = alt.Chart(label_stats).encode(
    theta=alt.Theta(field="count", type="quantitative", stack=True),
    color=alt.Color(
        field="labels",
        type="nominal",
        title="Label",
        legend=alt.Legend(orient='bottom'),
    ),
    tooltip=[
        alt.Tooltip("labels:N"),
        alt.Tooltip("count:Q", title="Total"),
        alt.Tooltip("percent:Q", title="Percent", format=".1%"),
        alt.Tooltip("avg_len:Q", title="Avg Length", format=".1f"),
    ],
)

# Pie slices, with percentage labels placed outside them (radius 120 > outerRadius 100).
sentiment_pie = sentiment_base.mark_arc(outerRadius=100)
sentiment_text = (
    sentiment_base
    .mark_text(radius=120, size=14, color='black')
    .encode(text='percent_label:N')
)

sentiment_chart = (sentiment_pie + sentiment_text).properties(
    title='Sentiment Dataset Distribution',
    width=350,
    height=350,
)

In [92]:
# Offensive-language and sentiment pies side by side, with shared legend styling
combined_chart = alt.hconcat(offensive_chart, sentiment_chart).configure_legend(
    orient='bottom',
    direction='horizontal',
    titleFontSize=14,
    labelFontSize=12,
    labelLimit=300,
)

combined_chart.show()

# Language Identification

In [93]:
# Load the language-identification CSV; `labels` is normalised with str.capitalize()
# (first character upper-cased, the rest lower-cased).
Lid_file_path = "../data/Language-Identification/gt.csv"
Lid_df = pd.read_csv(Lid_file_path).assign(
    labels=lambda t: t['labels'].str.capitalize()
)

Lid_df

Unnamed: 0,words,labels,xlmr
0,@ZahirJ,Other,Egypt Arabic
1,@BinyavangaW,Other,Nepali
2,Loved,English,Nepali
3,the,English,Egypt Arabic
4,ending,English,Egypt Arabic
...,...,...,...
97083,OLEEE,Nepali,Egypt Arabic
97084,!!!,Other,Egypt Arabic
97085,ABOTABOTABOTABOOOOOO,Nepali,Egypt Arabic
97086,!!!!!!,Other,Egypt Arabic


In [96]:
# Rows per language label.
# rename_axis + reset_index(name=...) names both columns explicitly, so the
# result is ['labels', 'count'] regardless of how the installed pandas names
# the output of value_counts().
label_counts = (
    Lid_df['labels'].value_counts()
    .rename_axis('labels')
    .reset_index(name='count')
)
# Share of all rows as a percentage (0-100), rounded to one decimal place.
label_counts['percent'] = (label_counts['count'] / label_counts['count'].sum() * 100).round(1)

# Donut of language-label counts; the built-in legend is disabled because a
# hand-laid-out legend is attached underneath.
LID_donut = (
    alt.Chart(label_counts)
    .mark_arc(innerRadius=60)
    .encode(
        theta=alt.Theta('count:Q'),
        color=alt.Color('labels:N', legend=None),
        tooltip=[
            alt.Tooltip('labels:N', title='Label'),
            alt.Tooltip('count:Q', title='Count'),
            alt.Tooltip('percent:Q', title='Percentage'),
        ],
    )
    .properties(
        width=400,
        height=300,
        title='Language Identification Label Distribution',
    )
)

# Grid position of each label in the custom legend: three entries per row.
label_counts['lid_index'] = range(len(label_counts))
label_counts['column'] = label_counts['lid_index'] % 3
label_counts['row'] = label_counts['lid_index'] // 3

# Both legend layers share the same (column, row) grid with the axes hidden.
_lid_legend_grid = alt.Chart(label_counts).encode(
    x=alt.X('column:O', axis=None),
    y=alt.Y('row:O', axis=None),
)
LID_legend = (
    # coloured swatch ...
    _lid_legend_grid.mark_square(size=100).encode(
        color=alt.Color('labels:N', legend=None)
    )
    # ... followed by the label text, nudged right of the swatch
    + _lid_legend_grid.mark_text(align='left', dx=12).encode(text='labels:N')
).properties(title="Language Identification Label", width=500)

LID_full = alt.vconcat(LID_donut, LID_legend).resolve_scale(color='independent')

# Named Entity Recognition

In [94]:
# Load the NER CSV; `labels` is normalised with str.capitalize()
# (first character upper-cased, the rest lower-cased).
NER_file_path = "../data/Named-Entity-Recognition/gt.csv"
NER_df = pd.read_csv(NER_file_path).assign(
    labels=lambda t: t['labels'].str.capitalize()
)
NER_df

Unnamed: 0,words,labels
0,stupid,O
1,move,O
2,",",O
3,considering,O
4,their,O
...,...,...
150757,un,O
150758,trabajo,O
150759,de,O
150760,verdad,O


In [97]:
# Rows per NER label. Rows labelled "O" are left out of the plots.
ner_counts = NER_df['labels'].value_counts().reset_index()
ner_counts.columns = ['labels', 'count']
# .copy() because helper columns are added to `filtered` just below.
filtered = ner_counts[ner_counts['labels'] != 'O'].copy()

# Grid position of each label in the custom legend: three entries per row.
filtered['ner_index'] = range(len(filtered))
filtered['column'] = filtered['ner_index'] % 3
filtered['row'] = filtered['ner_index'] // 3

# Donut of label counts; the built-in legend is disabled in favour of the
# hand-laid-out legend below.
NER_donut = alt.Chart(filtered).mark_arc(innerRadius=60).encode(
    theta='count:Q',
    color=alt.Color('labels:N', legend=None),
    tooltip=[
        alt.Tooltip('labels:N', title='NER Label'),
        alt.Tooltip('count:Q', title='Token Count')
    ]
).properties(width=400, height=300, title='NER Distribution (Excl. "O")')

# Custom legend: a coloured square plus the label text at each grid position.
NER_legend = (
    alt.Chart(filtered).mark_square(size=100).encode(
        x=alt.X('column:O', axis=None),
        y=alt.Y('row:O', axis=None),
        color=alt.Color('labels:N', legend=None)
    ) +
    alt.Chart(filtered).mark_text(align='left', dx=12).encode(
        x=alt.X('column:O', axis=None),
        y=alt.Y('row:O', axis=None),
        text='labels:N'
    )
).properties(title="Named Entity Recognition Label", width=400)

NER_full = alt.vconcat(NER_donut, NER_legend).resolve_scale(color='independent')

# ==== Side-by-side display ====
# LID_full comes from the Language Identification section above.
final_chart = alt.hconcat(LID_full, NER_full, spacing=40).resolve_scale(color='independent')
final_chart = final_chart.configure_view(stroke=None)
final_chart

# Qualitative Data

### Hate Speech

In [100]:
# Load the qualitative hate-speech CSV and map its two classes onto the
# Offensive / Not Offensive names.
HS_filepath = "../data/Qualitative/Hate Speech.csv"
HS_df = pd.read_csv(HS_filepath)
HS_df['Label'] = HS_df['Label'].replace(
    to_replace={
        'Hate Speech': 'Offensive',
        'Non-Hate Speech': 'Not Offensive',
    }
)
HS_df.head()

Unnamed: 0,Text,Code-mix-language,Label,processed_sentence,xlmr,mdeberta,labse,muril
0,Ada paavingala kgf oda vachathukaga engaluku y...,English-Tamil,Offensive,ada paavingala kgf oda vachathukaga engaluku y...,Non-Hate Speech,Non-Hate Speech,Hate Speech,Non-Hate Speech
1,dei kelthu kuthi ajith ivanya yaru nadika sona,English-Tamil,Offensive,dei kelthu kuthi ajith ivanya yaru nadika sona,Hate Speech,Non-Hate Speech,Hate Speech,Non-Hate Speech
2,Trailer mokka da ajith kelatu payale,English-Tamil,Offensive,trailer mokka da ajith kelatu payale,Hate Speech,Non-Hate Speech,Hate Speech,Non-Hate Speech
3,Rajini poi thokula thoku itha Veda yaru Una ke...,English-Tamil,Offensive,rajini poi thokula thoku itha veda yaru una ke...,Non-Hate Speech,Non-Hate Speech,Hate Speech,Non-Hate Speech
4,கடைசில rajini ய வச்சு செஞ்சுடிங்களே டா,English-Tamil,Offensive,கடசல rajini ய வசச சஞசடஙகள ட,Hate Speech,Non-Hate Speech,Hate Speech,Non-Hate Speech


In [101]:
# Step 1: one row per whitespace-separated token of `processed_sentence`.
# Each token row keeps its sentence's label, renamed to `Token_Label`
# because it is now attached to a token.
df_token_level = (
    HS_df
    .assign(tokens=HS_df['processed_sentence'].str.split())
    .explode('tokens')
    .reset_index(drop=True)
    .rename(columns={'Label': 'Token_Label'})
)

# Step 2: number of token rows per (code-mix language, label) pair.
label_by_token_lang = (
    df_token_level
    .groupby(['Code-mix-language', 'Token_Label'])
    .size()
    .reset_index(name='count')
)


In [104]:
# Step 3: Stacked bar chart
stacked_bar_HS = alt.Chart(label_by_token_lang).mark_bar().encode(
    x=alt.X('Code-mix-language:N', title='Code-Mix Language', sort='-y'),
    y=alt.Y('count:Q', title='Token Count'),
    color=alt.Color('Token_Label:N', title='Label'),
    tooltip=['Code-mix-language:N', 'Token_Label:N', 'count:Q']
).properties(
    title='Offensive Language Dataset Distribution',
    width=400,
    height=350
)

### Sentiment Analysis

In [103]:
# Load the qualitative sentiment CSV and preview the first rows.
# NOTE(review): this rebinds `SA_df`, which earlier in the notebook held the frame read
# from ../data/Sentiment-Analysis/test.csv; cells re-run out of order will see whichever was loaded last.
SA_filepath = "../data/Qualitative/Sentiment-Analysis.csv"
SA_df = pd.read_csv(SA_filepath)
SA_df.head()

Unnamed: 0,Text,Code-mix-language,Label,xlmr,mdeberta,labse,muril
0,Ee Sala Cup Namde,English- Kannada,Positive,English- Kannada,English- Kannada,English- Kannada,unk
1,ಏನ್ ಗುರು ಇದು..️ Get ready for History.. ನೆನಪಿರ...,English- Kannada,Positive,English- Tamil,English- Kannada,English- Hindi,unk
2,Film end ಟೈಟಲ್ ಕಾರ್ಡ್ music,English- Kannada,Neutral,English- Hindi,English- Kannada,English- Tamil,unk
3,ಹೆಮ್ಮೆ ಪಡುವ ಸಂಗತಿ ntr ಮತ್ತು ramcharan voice,English- Kannada,Positive,English- Hindi,English- Kannada,English- Kannada,unk
4,Jr ntr and ramcharan vico ತುಂಬಾಚೆನ್ನಾಗಿದೆ,English- Kannada,Positive,English- Hindi,English- Kannada,English-Malayalam,unk


In [31]:
# One row per whitespace-separated token of `Text`. Each token row keeps its
# sentence's label under the name `Token_Label`.
df_token = (
    SA_df
    .assign(tokens=SA_df['Text'].str.split())
    .explode('tokens')
    .reset_index(drop=True)
    .rename(columns={'Label': 'Token_Label'})
)

In [105]:
# Number of token rows per (code-mix language, label) pair.
token_counts = df_token.groupby(['Code-mix-language', 'Token_Label']).size().reset_index(name='count')

# Token counts per code-mix language, stacked by label.
# sort='-y' orders the languages by descending bar height.
stacked_bar_SA = alt.Chart(token_counts).mark_bar().encode(
    x=alt.X('Code-mix-language:N', title='Code-Mix Language', sort='-y'),
    y=alt.Y('count:Q', title='Token Count'),
    color=alt.Color('Token_Label:N', title='Label'),
    tooltip=['Code-mix-language:N', 'Token_Label:N', 'count:Q']
).properties(
    title='Sentiment Dataset Distribution',
    width=400,
    height=350
)

In [106]:
# Hate-speech and sentiment stacked bars side by side. The colour scales are
# resolved independently, so each chart keeps its own label legend.
final_chart = (
    alt.hconcat(stacked_bar_HS, stacked_bar_SA, spacing=40)
    .resolve_scale(color='independent')
    .configure_legend(orient='bottom', titleFontSize=14, labelFontSize=12)
    .configure_view(stroke=None)
)

final_chart.show()

### Named Entity Recognition

In [108]:
# Load the qualitative NER CSV; the `NER` column is normalised with str.capitalize().
# Note that `NER_df` is rebound here: from this point on it refers to the qualitative file.
NER_filepath = "../data/Qualitative/NER.csv"
NER_df = pd.read_csv(NER_filepath).assign(
    NER=lambda t: t['NER'].str.capitalize()
)
NER_df.head()

Unnamed: 0,Text,NER,LID,xlmr,mdeberta,labse,muril
0,shirt,Product,English,Malayalam,Telugu,Malayalam,English
1,wesi,O,Hindi,Malayalam,English,Malayalam,English
2,hii,O,Hindi,Malayalam,English,Hindi,Hindi
3,thi,O,Hindi,Malayalam,English,Tamil,Telugu
4,jese,O,Hindi,Malayalam,English,Malayalam,English


In [109]:
# Strip a leading "B-" / "I-" from each tag (presumably BIO-style prefixes — confirm),
# re-capitalise what is left, keep only the three columns used below and drop
# rows with a missing value in any of them.
NER_df = (
    NER_df
    .assign(
        NER=lambda t: t['NER'].str.replace(r'^[BI]-', '', regex=True).str.capitalize()
    )
    .loc[:, ['Text', 'NER', 'LID']]
    .dropna()
)

In [110]:
# Spot-check a single cleaned row, selected by integer position 477.
NER_df.iloc[477]

Text    paranjappazha
NER            Person
LID           Kannada
Name: 477, dtype: object

In [111]:
# NER  data
ner_counts = NER_df[NER_df['NER'] != 'O']['NER'].value_counts().reset_index()
ner_counts.columns = ['label', 'count']
ner_counts['percent'] = ner_counts['count'] / ner_counts['count'].sum()
ner_counts['percent_label'] = (ner_counts['percent'] * 100).round(1).astype(str) + '%'

In [124]:
# --- NER donut chart ---
base_ner = alt.Chart(ner_counts).encode(
    theta=alt.Theta('count:Q', stack=True),
    color=alt.Color('label:N', title='NER Label', legend=alt.Legend(orient='bottom')),
    tooltip=[
        alt.Tooltip('label:N'),
        alt.Tooltip('count:Q'),
        alt.Tooltip('percent:Q', format='.1%')
    ]
)

ner_pie = base_ner.mark_arc(innerRadius=50, outerRadius=120)
# ner_text = base_ner.mark_text(radius=200, size=14, color='black').encode(
#     text='percent_label:N'
# )

ner_chart = ner_pie.properties(
    title='NER Distribution (Excl. "O")',
    width=450,
    height=350
)

In [125]:
# LID data
lid_counts = NER_df['LID'].value_counts().reset_index()
lid_counts.columns = ['label', 'count']
lid_counts['percent'] = lid_counts['count'] / lid_counts['count'].sum()
lid_counts['percent_label'] = (lid_counts['percent'] * 100).round(1).astype(str) + '%'

In [126]:
# --- LID donut chart ---
base_lid = alt.Chart(lid_counts).encode(
    theta=alt.Theta('count:Q', stack=True),
    color=alt.Color('label:N', title='Language ID', legend=alt.Legend(orient='bottom')),
    tooltip=[
        alt.Tooltip('label:N'),
        alt.Tooltip('count:Q'),
        alt.Tooltip('percent:Q', format='.1%')
    ]
)

lid_pie = base_lid.mark_arc(innerRadius=50, outerRadius=120)
# lid_text = base_lid.mark_text(radius=150, size=14, color='black').encode(
#     text='percent_label:N'
# )

lid_chart = lid_pie.properties(
    title='Language Identification Label Distribution',
    width=450,
    height=350
)

In [127]:
# NER and LID donuts side by side. The colour scales are resolved
# independently, so each donut keeps its own legend.
combined = (
    alt.hconcat(ner_chart, lid_chart)
    .resolve_scale(color='independent')
    .configure_legend(
        orient='bottom',
        columns=4,
        titleFontSize=13,
        labelFontSize=11,
    )
    .configure_view(stroke=None)
)

combined.show()