https://www.kaggle.com/datasets/waqi786/world-languages-dataset

In [20]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [21]:
df=pd.read_csv('languages_dataset.csv')

In [22]:
df.head()

Unnamed: 0,Language,Family,Region,Speakers,Writing System,ISO Code
0,Yoruba,Indo-European,India,76000000,Latin,om
1,Southern Min,Indo-European,India,47000000,Latin,ru
2,Yue (Cantonese),Dravidian,India,75000000,Devanagari,my
3,Spanish,Austronesian,Poland,76900000,Devanagari,it
4,Arabic,Indo-European,Worldwide,76000000,Hangul,ur


In [23]:
df.tail()

Unnamed: 0,Language,Family,Region,Speakers,Writing System,ISO Code
495,Punjabi,Indo-Iranian,India,23000000,Latin,ha
496,Spanish,Dravidian,Worldwide,76900000,Tamil,bn
497,English,Indo-European,"Germany, Austria",28000000,Latin,hak
498,Xiang (Hunanese),Indo-European,Worldwide,460000000,Kannada,pa
499,Punjabi,Indo-European,Indonesia,83000000,Perso-Arabic,de


In [24]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Language        500 non-null    object
 1   Family          500 non-null    object
 2   Region          500 non-null    object
 3   Speakers        500 non-null    int64 
 4   Writing System  500 non-null    object
 5   ISO Code        489 non-null    object
dtypes: int64(1), object(5)
memory usage: 23.6+ KB


In [25]:
df.columns

Index(['Language', 'Family', 'Region', 'Speakers', 'Writing System',
       'ISO Code'],
      dtype='object')

In [26]:
df.describe()

Unnamed: 0,Speakers
count,500.0
mean,98402000.0
std,141489300.0
min,20000000.0
25%,28000000.0
50%,53500000.0
75%,76900000.0
max,918000000.0


In [27]:
family_power = (
    df.groupby("Family")["Speakers"]
    .sum()
    .sort_values(ascending=False)
    .reset_index()
)

family_power

Unnamed: 0,Family,Speakers
0,Indo-European,20953500000
1,Sino-Tibetan,9129900000
2,Turkic,3376000000
3,Afro-Asiatic,2718000000
4,Dravidian,2629900000
5,Niger-Congo,2413000000
6,Austroasiatic,2065000000
7,Indo-Iranian,1995900000
8,Austronesian,1628800000
9,Japonic,860000000


In [28]:
df["Speakers"] = pd.to_numeric(df["Speakers"], errors="coerce")
df = df.dropna(subset=["Speakers"])
df = df.sort_values("Speakers", ascending=False)
df["Global Rank"] = range(1, len(df)+1)
df.head(10)

Unnamed: 0,Language,Family,Region,Speakers,Writing System,ISO Code,Global Rank
379,Maithili,Sino-Tibetan,India,918000000,Tamil,de,1
362,Hausa,Sino-Tibetan,"Germany, Austria",918000000,Ethiopic,jv,2
153,Yoruba,Indo-Iranian,Worldwide,918000000,Latin,uk,3
494,Kannada,Indo-European,Nigeria,918000000,Latin,pt,4
46,Xiang (Hunanese),Sino-Tibetan,China (Hong Kong),918000000,Latin,mr,5
440,Hausa,Indo-European,China,918000000,Telugu,ko,6
161,Xiang (Hunanese),Austroasiatic,India,918000000,Latin,uz,7
251,Korean,Indo-European,"Pakistan, India",918000000,Latin,am,8
138,Hindi,Austronesian,India,460000000,Latin,ff,9
190,Maithili,Japonic,India,460000000,Oriya,hsn,10


In [29]:
df_geo = df.copy()
df_geo["Region"] = df_geo["Region"].str.split(",")
df_geo = df_geo.explode("Region")
df_geo["Region"] = df_geo["Region"].str.strip()

In [30]:
spread = (
    df_geo.groupby("Language")["Region"]
    .nunique()
    .sort_values(ascending=False)
    .reset_index(name="Country Count")
)

spread.head(10)

Unnamed: 0,Language,Country Count
0,Marathi,14
1,Persian,14
2,Vietnamese,13
3,Odia (Oriya),11
4,Awadhi,11
5,Burmese,11
6,Hausa,11
7,Xiang (Hunanese),10
8,Bengali,10
9,Uzbek,10


In [31]:
merged = df.merge(spread, on="Language")

merged["Density Index"] = (
    merged["Speakers"] /
    merged["Country Count"]
)

merged.sort_values("Density Index", ascending=False).head(10)

Unnamed: 0,Language,Family,Region,Speakers,Writing System,ISO Code,Global Rank,Country Count,Density Index
0,Maithili,Sino-Tibetan,India,918000000,Tamil,de,1,7,131142900.0
55,Korean,Indo-European,"Pakistan, India",918000000,Latin,am,8,7,131142900.0
22,Yoruba,Indo-Iranian,Worldwide,918000000,Latin,uk,3,8,114750000.0
33,Kannada,Indo-European,Nigeria,918000000,Latin,pt,4,9,102000000.0
103,Oromo,Turkic,"Pakistan, India",460000000,Malayalam,ps,15,5,92000000.0
42,Xiang (Hunanese),Sino-Tibetan,China (Hong Kong),918000000,Latin,mr,5,10,91800000.0
43,Xiang (Hunanese),Austroasiatic,India,918000000,Latin,uz,7,10,91800000.0
10,Hausa,Sino-Tibetan,"Germany, Austria",918000000,Ethiopic,jv,2,11,83454550.0
11,Hausa,Indo-European,China,918000000,Telugu,ko,6,11,83454550.0
82,Fula,Koreanic,Nigeria,460000000,Simplified Chinese,yue,13,6,76666670.0


In [32]:
writing_power = (
    df.groupby("Writing System")["Speakers"]
    .sum()
    .sort_values(ascending=False)
    .reset_index()
)

writing_power

Unnamed: 0,Writing System,Speakers
0,Latin,21429600000
1,Devanagari,2858800000
2,Tamil,2785900000
3,Ethiopic,2191000000
4,Chinese,2164000000
5,Telugu,1668000000
6,Traditional Chinese,1642000000
7,Perso-Arabic,1637800000
8,Arabic,1563000000
9,Simplified Chinese,1515000000


In [33]:
family_diversity = (
    df.groupby("Family")["Language"]
    .count()
    .sort_values(ascending=False)
    .reset_index(name="Language Count")
)

family_diversity

Unnamed: 0,Family,Language Count
0,Indo-European,228
1,Sino-Tibetan,76
2,Dravidian,36
3,Turkic,34
4,Afro-Asiatic,26
5,Austronesian,24
6,Niger-Congo,24
7,Indo-Iranian,16
8,Kra-Dai,12
9,Austroasiatic,10


In [34]:
family_analysis = family_diversity.merge(
    family_power[["Family", "Speakers"]],
    on="Family"
)

family_analysis["Avg Speakers per Language"] = (
    family_analysis["Speakers"] /
    family_analysis["Language Count"]
)

family_analysis.sort_values("Avg Speakers per Language", ascending=False)

Unnamed: 0,Family,Language Count,Speakers,Avg Speakers per Language
9,Austroasiatic,10,2065000000,206500000.0
11,Japonic,6,860000000,143333300.0
7,Indo-Iranian,16,1995900000,124743800.0
1,Sino-Tibetan,76,9129900000,120130300.0
4,Afro-Asiatic,26,2718000000,104538500.0
6,Niger-Congo,24,2413000000,100541700.0
3,Turkic,34,3376000000,99294120.0
0,Indo-European,228,20953500000,91901320.0
10,Koreanic,8,724000000,90500000.0
2,Dravidian,36,2629900000,73052780.0


In [35]:
script_efficiency = (
    df.groupby("Writing System")
    .agg({
        "Language": "count",
        "Speakers": "sum"
    })
    .reset_index()
)

script_efficiency["Speakers per Language"] = (
    script_efficiency["Speakers"] /
    script_efficiency["Language"]
)

script_efficiency.sort_values("Speakers per Language", ascending=False)

Unnamed: 0,Writing System,Language,Speakers,Speakers per Language
19,Telugu,8,1668000000,208500000.0
18,Tamil,16,2785900000,174118800.0
9,"Kanji, Hiragana, Katakana",10,1381000000,138100000.0
7,Gujarati,5,671000000,134200000.0
10,Kannada,6,741000000,123500000.0
13,Oriya,8,972900000,121612500.0
6,Ethiopic,19,2191000000,115315800.0
11,Latin,203,21429600000,105564500.0
3,Chinese,21,2164000000,103047600.0
5,Devanagari,28,2858800000,102100000.0


In [36]:
import plotly.express as px

In [37]:
fig = px.treemap(
    df,
    path=["Family", "Writing System", "Language", "Region"],
    values="Speakers",
    title="Global Linguistic Power Structure: Multi-Layer Treemap",
)

fig.update_layout(
    margin=dict(t=50, l=0, r=0, b=0)
)

fig.show()