Library imports
---------------

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
from mpl_toolkits.basemap import Basemap

# Any results you write to the current directory are saved as output.


Preprocessing of data
---------------------

Removing all irrelevant columns and label encoding the categorical values

In [None]:
# Load the raw table and keep only the columns used by the plots below.
# NOTE(review): columns are dropped by position, so this relies on the column
# order of the CSV staying the same -- confirm against the input file.
data = pd.read_csv('../input/data.csv')
data = data.drop(data.columns[[0, 1, 2, 3, 5, 6, 8, 9, 11, 14]], axis=1)
data = data.rename(columns={'Number of speakers': 'Speakers',
                            'Degree of endangerment': 'Endangerment'})

# Endangerment levels, ordered so that the list position is the integer code
# (0 = 'Vulnerable' ... 4 = 'Extinct').
labels = ['Vulnerable', 'Definitely endangered', 'Severely endangered',
          'Critically endangered', 'Extinct']
# Both lookup tables are derived from `labels` so the three can never disagree.
label_to_enc = {label: code for code, label in enumerate(labels)}
enc_to_label = {code: label for label, code in label_to_enc.items()}

# Assign the transformed columns back explicitly. Calling
# `replace(..., inplace=True)` on `data.Endangerment` / `data.Speakers` acts on
# a column object pulled off the frame, which is not guaranteed to update
# `data` itself (it does not under pandas Copy-on-Write).
data['Endangerment'] = data['Endangerment'].replace(label_to_enc)
# Missing speaker counts are treated as zero speakers.
data['Speakers'] = data['Speakers'].replace({np.nan: 0})

# data preview
data.tail()

Number of languages and mean speakers vs the degree of endangerment
-----------------------------------------------------------------------
Extinct languages should have no speakers, and the graph confirms this: the mean number of speakers for extinct languages is zero.

In [None]:
# Grouped bar chart: for every endangerment level, show the number of languages
# next to the mean number of speakers (scaled down by 100 so that both series
# fit on one y-axis).
index = np.arange(len(labels))
bar_width = 0.35  # also reused by the stacked country chart in the next cell

# Select the speaker counts of each encoded level once and reuse the selection
# for both statistics.
speakers_by_level = [data[data.Endangerment == code].Speakers
                     for code in range(len(labels))]
language = [len(level_speakers) for level_speakers in speakers_by_level]
speaker = [np.mean(level_speakers) / 100 for level_speakers in speakers_by_level]

plt.figure()
plt.bar(index, language, bar_width, align='center',
        label='Total languages', color='yellow')
plt.bar(index + bar_width, speaker, bar_width, align='center',
        label='Mean Speakers(in 100s)', color='red')
# The two bars of a group are centred at `index` and `index + bar_width`, so
# the middle of the group is half a bar width to the right of `index`.
plt.xticks(index + bar_width / 2, labels, rotation=45)
plt.title('Degree of Endangerment')
plt.legend(loc=1)
# Run last so the layout accounts for the rotated tick labels and the title.
plt.tight_layout()

Countries vs Number of Speakers for Endangered Languages
--------------------------------------------------------

In [None]:
# Stacked bar chart: total number of speakers per country, split by
# endangerment level.
countries_list = ['Nepal', 'China', 'Brazil', 'Russia',
                  'Indonesia', 'United States of America']
# One colour per endangerment code; the map cells further down reuse this list.
colors = ["lightgreen", "yellow", "orange", "red", "maroon"]

# The last level ('Extinct') is left out of this chart:
# number of extinct speakers is zero for most of the countries.
plotted_levels = len(labels) - 1

# danger_countries[level][country] -> summed speakers.
# Each row must be its own list: `[row] * n` would repeat a reference to ONE
# shared row, so every level would end up showing the totals of the last level.
danger_countries = [[0] * len(countries_list) for _ in range(plotted_levels)]

index = np.arange(len(countries_list))
for danger in range(plotted_levels):
    at_level = data.Endangerment == danger
    for country, country_name in enumerate(countries_list):
        # Plain substring match; rows with a missing Countries value count as
        # "no match" instead of propagating NaN into the boolean mask.
        in_country = data.Countries.str.contains(country_name, regex=False, na=False)
        danger_countries[danger][country] = sum(
            data[in_country & at_level].Speakers.astype(float))

# Plotting the graph
plt.figure()
# Running height of each country's stack; the next level is drawn on top of it.
bottom = [0] * len(countries_list)
for level, heights in enumerate(danger_countries):
    plt.bar(index, heights, bar_width, label=enc_to_label[level],
            align='center', color=colors[level], bottom=bottom)
    bottom = [base + height for base, height in zip(bottom, heights)]

plt.xticks(index, countries_list, rotation=20)
plt.xlabel('Countries')
plt.ylabel('Number of speakers')
plt.legend(bbox_to_anchor=(1.1, 1.1))
plt.tight_layout()

Degree of Endangerment of Languages over the world
--------------------------------------------------

In [None]:
# World map with one marker per language, coloured by endangerment level.
plt.figure(figsize=(15,8))
# NOTE(review): resolution='h' selects Basemap's high-resolution boundary data,
# which is slow to draw for a whole-world view -- consider 'l' or 'c'.
m = Basemap(projection='mill', llcrnrlat = -80, urcrnrlat = 80,
            llcrnrlon = -180, urcrnrlon = 180, resolution = 'h')
m.drawcoastlines()
m.drawcountries()
m.drawmapboundary()

# Legend handles, one per endangerment code. The name `l` is kept because the
# following cell indexes into this list.
l = []
for code in range(len(labels)):
    at_level = data[data.Endangerment == code]
    # Project longitude/latitude into map coordinates.
    x, y = m(list(at_level.Longitude.astype(float)),
             list(at_level.Latitude.astype(float)))
    # Marker-only format string: the colour comes solely from `color=`, which
    # avoids the clash between the 'g' in 'go' and the keyword argument.
    handle, = m.plot(x, y, 'o', markersize=8, alpha=0.6, color=colors[code],
                     label=enc_to_label[code])
    l.append(handle)
plt.legend(handles=l, loc='best')
plt.title('Degree of Endangerment of Languages over the world')

Degree of Endangerment of Languages over India
----------------------------------------------

In [None]:
# Map restricted to latitudes 5..40 and longitudes 65..100, showing only the
# languages whose Countries value contains 'India'.
plt.figure(figsize=(15,8))
m = Basemap(projection='mill', llcrnrlat = 5, urcrnrlat = 40,
            llcrnrlon = 65, urcrnrlon = 100, resolution = 'h')
m.drawcoastlines()
m.drawcountries()
m.drawmapboundary()
m.drawstates()

# The country filter does not depend on the endangerment level: compute it once.
# Rows with a missing Countries value are treated as "not India".
in_india = data.Countries.str.contains('India', regex=False, na=False)

# This cell builds its own legend handles instead of writing into a list that
# an earlier cell happened to leave behind.
india_handles = []
for code in range(len(labels)):
    selected = data[in_india & (data.Endangerment == code)]
    # Project longitude/latitude into map coordinates.
    x, y = m(list(selected.Longitude.astype(float)),
             list(selected.Latitude.astype(float)))
    # Marker-only format string: the colour comes solely from `color=`, which
    # avoids the clash between the 'g' in 'go' and the keyword argument.
    handle, = m.plot(x, y, 'o', markersize=8, alpha=0.6, color=colors[code],
                     label=enc_to_label[code])
    india_handles.append(handle)
plt.legend(handles=india_handles, loc='best')
plt.title('Degree of Endangerment of Languages over India')
