# ***Indian Population Data Analysis***

In [None]:
## Import required libraries
import numpy as np
import pandas as pd
from functools import reduce

import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

from bokeh.io import output_notebook, show
from bokeh.plotting import figure

pd.set_option('display.max_columns',25)

In [None]:
output_notebook()  ## Using this all calls to the function show() will be displayed inline in the notebook

#### **``Loading CSV files of Total Population, CBR, DR, FR, IMR, LER, Rural & Urban Pop``**

#### **``1.Total Population Data``**

In [None]:
files_location = "../input/indian-population-stats-for-data-analysis/"

In [None]:
total_pop_df = pd.read_csv(files_location+"india-population-2020-06-22.csv",skiprows=15)
total_pop_df.head()

#### **``2. Crude Birth Rate Data``**

In [None]:
cbr_df = pd.read_csv(files_location+"india-population-cbr.csv")
cbr_df.head()

#### **``3. Death Rate Data``**

In [None]:
dr_df = pd.read_csv(files_location+"india-population-death_rate.csv",skiprows=15)
dr_df.head()

#### **``4. Fertility Rate Data``**

In [None]:
fert_df = pd.read_csv(files_location+"india-population-fertitltyrate.csv",skiprows=15)
fert_df.head()

#### **``5. Infant Mortality Rate Data``**

In [None]:
infant_mort_df = pd.read_csv(files_location+"india-population-infantmr.csv",skiprows=15)
infant_mort_df.head()

#### **``6. Life Expectancy Rate Data``**

In [None]:
life_expect_df = pd.read_csv(files_location+"india-population-lifeexp.csv",skiprows=15)
life_expect_df.head()

#### **``7. Rural Population Data``**

In [None]:
rural_pop_df = pd.read_csv(files_location+"india-rural-population.csv",skiprows=16)
rural_pop_df.head()

#### **``8. Urban Population Data``**

In [None]:
urban_pop_df = pd.read_csv(files_location+"india-urban-population.csv",skiprows=16)
urban_pop_df.head()

#### **``9. Suicide Rate Data``**

In [None]:
suicide_rate_df = pd.read_csv(files_location+"india-suicide-rate.csv",skiprows=16)
suicide_rate_df

### **Dropping Duplicates from Suicide Rate Data**

In [None]:
suicide_rate_df.drop_duplicates(subset='date',keep='first',inplace=True)
suicide_rate_df

### **Data Pre-processing**

In [None]:
total_pop_df.info()

In [None]:
cbr_df.info()

In [None]:
dr_df.info()

In [None]:
fert_df.info()

In [None]:
infant_mort_df.info()

In [None]:
life_expect_df.info()

In [None]:
rural_pop_df.info()

In [None]:
urban_pop_df.info()

In [None]:
suicide_rate_df.info()

#### **``A few things to jot down here:``**
##### **1. In every dataframe the 'date' column is of string datatype and needs to be converted to a datetime column.**
##### **2. The other two columns in every dataframe contains an unwanted space in their names which needs to be removed.**
##### **3. Capitalize the column names to provide the uniformity.**
##### **4. The Rural and Urban Population dataframes have fewer records than the others (to be investigated later; they might not contain the post-2019 prediction records).**
##### **5. For Suicide Rate only 6 years data is available and for 2016 there are multiple records which will be investigated later.**  

In [None]:
total_pop_df.head()

In [None]:
def data_cleaning(d_frame, max_year=2019):
    """
    Description: This function cleans one raw dataframe IN PLACE by performing below operations:
            1. Columns uniformity (surrounding whitespace removed, names capitalized)
            2. Converting the first column to datetime
            3. Dropping the records where year > max_year (or where the date is missing)
            4. Dropping exact duplicate records (the first occurrence is kept)
            5. Assigning the Date column of every dataframe as Index

    Input Parameters: It accepts:
            1. d_frame: DataFrame object whose first column holds the dates; it is modified in place
            2. max_year: Last calendar year to keep; by default = 2019

    Returns : None

    Note: the function is not idempotent - after one call the date column has
    become the index, so a second call would treat the next column as dates.
    """
    # The incoming index is thrown away at the end (replaced by the dates), so
    # normalising it up front is harmless and makes the label-based drop below exact.
    d_frame.reset_index(drop=True, inplace=True)

    d_frame.columns = [str(col).strip().capitalize() for col in d_frame.columns]
    date_col = d_frame.columns[0]

    # Assign by column label: an `.iloc[:, 0] = ...` assignment may write into the
    # existing object column instead of replacing it, leaving `.dt` unusable.
    d_frame[date_col] = pd.to_datetime(d_frame[date_col])

    # Really remove the unwanted rows. (`where` only blanks them to NaN.)
    # Rows with a missing date compare False and are dropped as well.
    within_range = d_frame[date_col].dt.year <= max_year
    d_frame.drop(index=d_frame.index[(~within_range).to_numpy()], inplace=True)

    # keep='first' removes repeated rows without discarding the data they carry.
    d_frame.drop_duplicates(keep='first', inplace=True)

    d_frame.set_index(date_col, inplace=True)

In [None]:
# Collect every loaded frame in a fixed order; later cells rely on this order.
all_data_frames = [total_pop_df,cbr_df,dr_df,fert_df,infant_mort_df,life_expect_df,rural_pop_df,urban_pop_df,suicide_rate_df]
# data_cleaning() mutates each frame in place and returns nothing, so the list
# keeps pointing at the cleaned objects. Re-running this cell without reloading
# the CSVs would clean already-cleaned frames.
for df in all_data_frames:
    data_cleaning(d_frame=df)

##### **``Now, let's analyse every dataframe one at a time...``**

In [None]:
# Names are listed explicitly, in the same order as `all_data_frames`.
# Looking a frame up by identity in globals() is fragile inside IPython: the
# output cache (`_`, `__`, `_N`) can hold the very same object under another
# name, and the loop variable itself is also a global bound to the frame.
frame_names = ['total_pop_df', 'cbr_df', 'dr_df', 'fert_df', 'infant_mort_df',
               'life_expect_df', 'rural_pop_df', 'urban_pop_df', 'suicide_rate_df']

for name, frame in zip(frame_names, all_data_frames):
    print('DataFrame :: ',name,'\n')
    frame.info()
    print('Starting Date:',frame.index.min())
    print('Latest Date:',frame.index.max())
    print('\n')

##### **Here, we found out that in the Rural and Urban Population datasets records are not available for years 1951 to 1959 and 2019. And, the Suicide Rate dataset has very few records.**
##### **Now, lets merge all the dataframes**

In [None]:
# Place the cleaned frames side by side, aligned on their shared 'Date' index
# (outer join: a date missing from one source becomes NaN in its columns).
# A chain of pd.merge calls with default suffixes generates repeated
# '<name>_x' / '<name>_y' columns once more than two frames share a column name
# (presumably the annual-change column here), which pandas 2.x rejects with
# MergeError. concat keeps the same left-to-right column order, which the
# positional rename further down relies on.
# NOTE: concat requires each frame's Date index to be unique.
ind_pop_df = (
    pd.concat(all_data_frames, axis=1, join='outer')
      .sort_index()
      .rename_axis(index='Date')
)

In [None]:
ind_pop_df.shape

In [None]:
ind_pop_df.head()

In [None]:
# Express the population in crores (1 crore = 10,000,000).
# Vectorised true division always yields floats; the former per-value
# np.divide(..., dtype=np.float) fails on NumPy >= 1.24, where np.float is gone.
# NOTE: re-running this cell divides again - rerun the merge cell first.
ind_pop_df['Population'] = ind_pop_df['Population'] / 10000000

In [None]:
# Positional rename: the 21 names below must follow the column order produced by
# combining the frames (population, birth rate, death rate, fertility, infant
# mortality, life expectancy, rural, urban, suicide rate). A different number
# of columns makes this assignment raise.
ind_pop_df.columns = ['Population','Pop_annual_change','Birth_Rate','BR_annual_change','Death_Rate','DR_annual_change','Fert_Rate','FR_annual_change',
                     'Inf_Mort_Rate','Inf_Mort_annual_change','Life_Exp_Rate','Life_Exp_annual_change',
                      'Rural_Pop','Rural_Percent_of_Total_Pop','Rural_Pop_change','Urban_Pop','Urban_Percent_of_Total_Pop','Urban_Pop_change',
                      'SR_Total','SR_Male','SR_Female']

# Move the index back into a regular column (the next line reads it as 'Date')
# and derive the calendar year used on the x-axis of the plots below.
ind_pop_df.reset_index(inplace=True)
ind_pop_df['Year'] = ind_pop_df['Date'].dt.year

In [None]:
ind_pop_df.head(10)

## ***``Exploratory Data Analysis``***

In [None]:
def _apply_axis_ticks(tick_setter, ticks, labels, rotation, size):
    """Style one axis' ticks through plt.xticks / plt.yticks, optionally repositioning them."""
    tick_kwargs = {'rotation': rotation, 'size': size, 'style': 'oblique'}

    # None and False both mean "not supplied". Identity checks (instead of
    # `!= False`) keep NumPy arrays usable as tick locations/labels.
    ticks_given = ticks is not None and ticks is not False
    labels_given = labels is not None and labels is not False

    if ticks_given:
        if labels_given:
            tick_kwargs['labels'] = labels
        tick_setter(ticks=ticks, **tick_kwargs)
    else:
        # Labels without locations are ignored, as before.
        tick_setter(**tick_kwargs)


def graph_customization(label_x, label_y, grph_title, x_ticks_rot=0, ticks_size=11, y_ticks_rot=10, x_ticks=False, x_ticks_labels=False, y_ticks=False, y_ticks_labels=False):
    """
    Description: This function is created for customizing the labels, title and axis ticks
                 of the current matplotlib axes.

    Input Parameters: It accepts below input parameters:
            1. label_x : X-axis label
            2. label_y : Y-axis label
            3. grph_title : Title of the graph
            4. x_ticks_rot : X-ticks rotation value; by default = 0
            5. ticks_size : X-axis and Y-axis ticks size; by default = 11
            6. y_ticks_rot : Y-axis ticks rotation; by default = 10
            7. x_ticks : Locations of X-axis ticks; by default = False (keep current locations)
            8. x_ticks_labels : Labels of the X-axis ticks; by default = False (keep current labels)
            9. y_ticks : Locations of Y-axis ticks; by default = False (keep current locations)
            10. y_ticks_labels : Labels of the Y-axis ticks; by default = False (keep current labels)

    Returns : None
    """
    label_font_style = {'size':18,'family':'calibri','color':'coral','style':'italic'}
    title_font_style = {'size':20,'family':'calibri','color':'blue','style':'italic'}

    plt.xlabel(label_x,fontdict=label_font_style)
    plt.ylabel(label_y,fontdict=label_font_style)
    plt.title(grph_title,fontdict=title_font_style)

    # The tick-label arguments are forwarded under their real parameter names;
    # the misspelled `x_tick_labels` / `y_tick_labels` raised NameError.
    _apply_axis_ticks(plt.xticks, x_ticks, x_ticks_labels, x_ticks_rot, ticks_size)
    _apply_axis_ticks(plt.yticks, y_ticks, y_ticks_labels, y_ticks_rot, ticks_size)

### **``Plot-1``**
#### **``Let's try to visualize the trend in population growth. Is it very steep in any decade between 1950 and 2019?``**

In [None]:
sns.set_context('poster')
plt.figure(figsize=(20,8))
sns.pointplot(x='Year',y='Population',data=ind_pop_df,palette=sns.color_palette('gist_rainbow',2),scale=0.65,markers='*')
graph_customization('Years','Population in Crores','Growth in population from year 1950 to 2019',x_ticks_rot=75)
plt.show()

##### **So, it is quite evident that India's population has followed the gradual increase trend from 1950 to 2019.**

### **``Plot-2``**
#### **``Let's see whether Annual Population Change(%) will show us any different trend in population growth.``**

In [None]:
# Matplotlib 3.6 renamed the bundled 'seaborn' style sheet to 'seaborn-v0_8'
# (the old name was later removed); use whichever this installation ships.
seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
with plt.style.context(seaborn_style):
    plt.figure(figsize=(18,10))
    sns.barplot(x='Year',y='Pop_annual_change',data=ind_pop_df,palette=sns.color_palette('magma'))
    graph_customization('Years','Population change (%)','Percentage change in population from year 1950 to 2019',ticks_size=10,x_ticks_rot=60)
    plt.ylim(bottom=0, top=2.5)
plt.show()

##### **So, here things are not very straightforward: till year 1967 the increase was gradual; however, between 1968 and 1985 there are a few ups & downs. And, the annual pop change reached its highest value twice.**

### **``Plot-3``**
#### **``Let's see the trend of Birth Rate b/w 1950 & 2019.``**

In [None]:
sns.set_context('poster')
plt.figure(figsize=(20,8))
sns.barplot(x='Year',y='Birth_Rate',data=ind_pop_df,palette=sns.color_palette('PuBu'))
graph_customization('Years','Birth Rate','Trend of Birth Rate from 1950 to 2019',ticks_size=11,x_ticks_rot=75)
plt.show()

##### **So, it is quite evident that the number of births per thousand people has decreased significantly over the 70-year span.**

### **``Plot-4``**
#### **``Let's see the trend of Annual percentage change in Birth Rate b/w 1950 & 2019.``**

In [None]:
sns.set_context('poster')
plt.figure(figsize=(10,8))
sns.lineplot(x='Year',y='BR_annual_change',data=ind_pop_df,palette=sns.color_palette('twilight'))
graph_customization('Years','Birth Rate Annual change (%)','Birth Rate (%) change b/w 1950 & 2019',x_ticks_rot=40,ticks_size=11)
plt.ylim(bottom=-4,top=2)
plt.show()

##### **Here, it is clear that Birth Rate Annual Change has fluctuated significantly from 1950 to 2019.**

### **``Plot-5``**
#### **``Let's try to see the Annual Pop Change and Birth Rate Annual Change relationship b/w 1950 & 2019.``**

In [None]:
# Matplotlib 3.6 renamed the bundled 'seaborn' style sheet to 'seaborn-v0_8'
# (the old name was later removed); use whichever this installation ships.
seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
with plt.style.context(seaborn_style):
    g = sns.jointplot(x='Pop_annual_change',y='BR_annual_change',data=ind_pop_df,kind='kde',color='k',height=8.5)
    # Overlay the individual observations on top of the density contours.
    g.plot_joint(plt.scatter, c="r", s=20, linewidth=1, marker="^")
    g.set_axis_labels('Population Annual (%) change','Birth Rate Annual (%) change',style='oblique',size=15,color='coral')

##### **So, here we found out that majority of the data points for annual (%) change. By seeing the data points in triangular shape its pretty hard to find the linear relationship b/w the variables.** 

In [None]:
# Squared correlation coefficient (r^2) between the two series.
# `** 2` replaces DataFrame.applymap, which is deprecated since pandas 2.1.
ind_pop_df[['Pop_annual_change','BR_annual_change']].corr() ** 2

##### **As suggested by the plot only 31% of the variations in these variables are explainable among them.**

### **``Plot-6``**
#### **``Let's see the trend of Death Rate b/w 1950 & 2019.``**

In [None]:
sns.set_context('poster')
plt.figure(figsize=(18,8))
sns.barplot(x='Year',y='Death_Rate',data=ind_pop_df,palette=sns.color_palette('viridis'))
# NOTE(review): this runs after the bars are drawn, so it presumably affects the
# defaults used by later figures rather than this one - confirm the intent.
sns.set(rc={"xtick.bottom" : True, "ytick.left" : True})
graph_customization(label_x='Years',label_y='Death Rate',grph_title='Trend of Death Rate from 1950 to 2019',x_ticks_rot=75)
plt.show()

##### **There is a definite dip in the death rate till year 2011; after that it appears to flatten out.**

### **``Plot-7``**
#### **``Let's see the trend of Annual percentage change in Death Rate b/w 1950 & 2019.``**

In [None]:
sns.set_context('poster')
plt.figure(figsize=(10,8))
sns.lineplot(x='Year',y='DR_annual_change',data=ind_pop_df,palette=sns.color_palette('twilight'))
graph_customization('Years','Death Rate Annual change (%)','Death Rate (%) change b/w 1950 & 2019',x_ticks_rot=40,ticks_size=11)
plt.ylim(bottom=-4,top=2)
plt.show()

##### **Annual (%) change in Death rate has fluctuated throughout the time period. And, we can say that post 2010 we have witnessed the continuous progression.**

### **``Plot-8``**
#### **``Let's try to visualize the relationship b/w Birth Rate and Death Rate b/w 1950 & 2019.``**

In [None]:
# Matplotlib 3.6 renamed the bundled 'seaborn' style sheet to 'seaborn-v0_8'
# (the old name was later removed); use whichever this installation ships.
seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'

with plt.style.context(seaborn_style):
    sns.violinplot(x='Birth_Rate',data=ind_pop_df,palette=sns.color_palette('plasma'))
    graph_customization(label_x='Birth Rate',label_y=None, grph_title='Distribution of Birth Rate')
plt.show()

with plt.style.context(seaborn_style):
    sns.violinplot(x='Death_Rate',data=ind_pop_df,palette=sns.color_palette('cubehelix'))
    graph_customization(label_x='Death Rate',label_y=None, grph_title='Distribution of Death Rate')
plt.show()

In [None]:
# Matplotlib 3.6 renamed the bundled 'seaborn' style sheet to 'seaborn-v0_8'
# (the old name was later removed); use whichever this installation ships.
seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
with plt.style.context(seaborn_style):
    plt.figure(figsize=(18,8))
    sns.regplot(x='Birth_Rate',y='Death_Rate',data=ind_pop_df,marker='v',color='k')
    graph_customization(label_x='Birth Rate',label_y='Death Rate',grph_title='Relationship b/w Birth & Death Rate',ticks_size=11)
plt.show()

##### **By seeing the data points in triangular shape we can say that a strong linear relationship exists among Birth and Death Rates.** 

In [None]:
# Squared correlation coefficient (r^2) between the two series.
# `** 2` replaces DataFrame.applymap, which is deprecated since pandas 2.1.
ind_pop_df[['Birth_Rate','Death_Rate']].corr() ** 2

##### **As suggested by the plot, 83% of the variations in these features are explainable among them.** 

### **``Plot-9``**
#### **``Let's try to visualize the relationship b/w Birth Rate and Death Rate (Annual % change) from 1950 to 2019.``**

In [None]:
# Matplotlib 3.6 renamed the bundled 'seaborn' style sheet to 'seaborn-v0_8'
# (the old name was later removed); use whichever this installation ships.
seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'

with plt.style.context(seaborn_style):
    sns.violinplot(x='BR_annual_change',data=ind_pop_df,palette=sns.color_palette('gist_rainbow'))
    graph_customization(label_x='Birth Rate (%)',label_y=None, grph_title='Distribution of Birth Rate Annual (%) change')
plt.show()

with plt.style.context(seaborn_style):
    sns.violinplot(x='DR_annual_change',data=ind_pop_df,palette=sns.color_palette('viridis'))
    graph_customization(label_x='Death Rate (%)',label_y=None, grph_title='Distribution of Death Rate Annual (%) change')
plt.show()

In [None]:
# Matplotlib 3.6 renamed the bundled 'seaborn' style sheet to 'seaborn-v0_8'
# (the old name was later removed); use whichever this installation ships.
seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
with plt.style.context(seaborn_style):
    img_c2 = sns.jointplot(x='BR_annual_change',y='DR_annual_change',data=ind_pop_df,kind='kde',height=9,color='g')
    # Overlay the individual observations on top of the density contours.
    img_c2.plot_joint(plt.scatter, c="r", s=25, linewidth=1.5, marker="*")
    img_c2.set_axis_labels('Birth Rate Annual (%) change','Death Rate Annual (%) change',color='coral',size=16,style='oblique')

##### **By seeing the data points in highlighted(star) shape, it feels difficult to find the linear relationship exists among Birth and Death Rates Annual (%) change.** 

In [None]:
# Squared correlation coefficient (r^2) between the two series.
# `** 2` replaces DataFrame.applymap, which is deprecated since pandas 2.1.
ind_pop_df[['BR_annual_change','DR_annual_change']].corr() ** 2

##### **As suggested by the plot, only 19% of the variations in these features are explainable among them.** 

### **``Plot-10``**
#### **``Let's see the trend of Fertility Rate from 1950 to 2019.``**

In [None]:
sns.set_context('poster')
plt.figure(figsize=(20,8))
sns.pointplot(x='Year',y='Fert_Rate',data=ind_pop_df,palette=sns.color_palette('magma'),markers='^')
graph_customization(label_x='Years',label_y='Fertility Rate',grph_title='Fertility Rate trend from 1950 to 2019',x_ticks_rot=75,ticks_size=11)
plt.show()

##### **Here, we have witnessed that the Fertility Rate remained constant for the first 14 years (some investigation can be done to validate this). And, it is quite clear that the Fertility Rate has dropped significantly over the span of 70 years.**

##### **Let's see the trend of both Birth and Fertility Rates together:**

In [None]:
sns.set_context('poster')
plt.figure(figsize=(17,8))
sns.lineplot(x='Year',y='Fert_Rate',data=ind_pop_df,palette=sns.color_palette('magma'),markers='o')
sns.lineplot(x='Year',y='Birth_Rate', data=ind_pop_df,palette=sns.color_palette('gist_rainbow'),markers='*')
graph_customization(label_x='Years',label_y='Trend of Birth and Fertility Rate',grph_title='How Birth and Fertility Rates have behaved?',x_ticks_rot=5,ticks_size=11)
plt.legend(('Fertility Rate','Birth Rate'))
plt.show()

##### **So, the Birth Rate has dropped by more than 20 points in 70 years; a similar downward movement (with smaller values, because it applies only to females of reproductive age) also exists in the Fertility Rate.**

### **``Plot-11``**
#### **``Let's visualize the relationship of Birth Rate and Death Rate with Fertility Rate?``**

In [None]:
# Matplotlib 3.6 renamed the bundled 'seaborn' style sheet to 'seaborn-v0_8'
# (the old name was later removed); use whichever this installation ships.
seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
with plt.style.context(seaborn_style):
    plt.figure(figsize=(16,8))
    sns.regplot(x='Birth_Rate',y='Fert_Rate',data=ind_pop_df,color='k',marker='>')
    graph_customization(label_x='Birth Rate', label_y='Fertility Rate',grph_title='Relationship b/w Birth Rate and Fertility Rate',ticks_size=10)

##### **Clearly, they are highly correlated with each other.**

In [None]:
# Squared correlation coefficient (r^2) between the two series.
# `** 2` replaces DataFrame.applymap, which is deprecated since pandas 2.1.
ind_pop_df[['Birth_Rate','Fert_Rate']].corr() ** 2

##### **As evident by the Linear Regression Plot, 99% of variations in Birth and Fertility Rate are explainable among them.**

In [None]:
# Matplotlib 3.6 renamed the bundled 'seaborn' style sheet to 'seaborn-v0_8'
# (the old name was later removed); use whichever this installation ships.
seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
with plt.style.context(seaborn_style):
    plt.figure(figsize=(16,8))
    sns.regplot(x='Death_Rate',y='Fert_Rate',data=ind_pop_df,color='k',marker='>')
    graph_customization(label_x='Death Rate', label_y='Fertility Rate',grph_title='Relationship b/w Death Rate and Fertility Rate',ticks_size=10)

##### **These two also seem to be correlated with each other.**

In [None]:
# Squared correlation coefficient (r^2) between the two series.
# `** 2` replaces DataFrame.applymap, which is deprecated since pandas 2.1.
ind_pop_df[['Death_Rate','Fert_Rate']].corr() ** 2

##### **So, approx 83% of variations in Death and Fertility Rate are explainable among them.**

### **``Plot-12``**
#### **``Let's visualize the relationship of Annual Birth Rate Change and Annual Fertility Rate Change?``**

In [None]:
# Matplotlib 3.6 renamed the bundled 'seaborn' style sheet to 'seaborn-v0_8'
# (the old name was later removed); use whichever this installation ships.
seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
with plt.style.context(seaborn_style):
    plt.figure(figsize=(20,8))
    sns.pointplot(x='BR_annual_change',y='FR_annual_change',data=ind_pop_df,color='g',ci=0)
    graph_customization(label_x='BR Annual Change', label_y='FR Annual Change',grph_title='Relationship b/w BR & FR Annual Change',ticks_size=11,x_ticks_rot=75)

In [None]:
# Squared correlation coefficient (r^2) between the two series.
# `** 2` replaces DataFrame.applymap, which is deprecated since pandas 2.1.
ind_pop_df[['BR_annual_change','FR_annual_change']].corr() ** 2

##### **These two are also correlated with each other. And, 81% of variations in BR and FR Annual Change are explainable among them.**

### **``Plot-13``**
#### **``Let's visualize the relationship of Annual Death Rate Change and Annual Fertility Rate Change?``**

In [None]:
# Matplotlib 3.6 renamed the bundled 'seaborn' style sheet to 'seaborn-v0_8'
# (the old name was later removed); use whichever this installation ships.
seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
with plt.style.context(seaborn_style):
    plt.figure(figsize=(20,8))
    sns.pointplot(x='DR_annual_change',y='FR_annual_change',data=ind_pop_df,color='r',ci=0)
    graph_customization(label_x='DR Annual Change', label_y='FR Annual Change',grph_title='Relationship b/w DR & FR Annual Change',ticks_size=11,x_ticks_rot=75)
plt.show()

In [None]:
# Squared correlation coefficient (r^2) between the two series.
# `** 2` replaces DataFrame.applymap, which is deprecated since pandas 2.1.
ind_pop_df[['DR_annual_change','FR_annual_change']].corr() ** 2

##### **These two features don't show any relationship among them.**

### **``Plot-14``**
#### **``Let's see the trend of Infant Mortality Rate from 1950 to 2019.``**

In [None]:
# Matplotlib 3.6 renamed the bundled 'seaborn' style sheet to 'seaborn-v0_8'
# (the old name was later removed); use whichever this installation ships.
seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
with plt.style.context(seaborn_style):
    plt.figure(figsize=(20,8))
    sns.pointplot(x='Year',y='Inf_Mort_Rate',data=ind_pop_df,palette=sns.color_palette('twilight'))
    graph_customization(label_x='Year', label_y='Infant Mortality Rate',grph_title='Trend of Infant Mortality Rate from 1950 to 2019',ticks_size=11,x_ticks_rot=75)
plt.show()

In [None]:
# Matplotlib 3.6 renamed the bundled 'seaborn' style sheet to 'seaborn-v0_8'
# (the old name was later removed); use whichever this installation ships.
seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
with plt.style.context(seaborn_style):
    plt.figure(figsize=(20,8))
    sns.regplot(x='Birth_Rate',y='Inf_Mort_Rate',data=ind_pop_df,color='k',ci=0)
    graph_customization(label_x='Birth Rate', label_y='Infant Mortality Rate',grph_title='Relationship b/w Birth Rate and Infant Mortality Rate',ticks_size=11,x_ticks_rot=0)

In [None]:
# Squared correlation coefficient (r^2) between the two series.
# `** 2` replaces DataFrame.applymap, which is deprecated since pandas 2.1.
ind_pop_df[['Birth_Rate','Inf_Mort_Rate']].corr() ** 2

In [None]:
# Matplotlib 3.6 renamed the bundled 'seaborn' style sheet to 'seaborn-v0_8'
# (the old name was later removed); use whichever this installation ships.
seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
with plt.style.context(seaborn_style):
    plt.figure(figsize=(20,8))
    sns.regplot(x='Death_Rate',y='Inf_Mort_Rate',data=ind_pop_df,color='k',ci=0)
    graph_customization(label_x='Death Rate', label_y='Infant Mortality Rate',grph_title='Relationship b/w Death Rate and Infant Mortality Rate',ticks_size=11,x_ticks_rot=0)

In [None]:
# Squared correlation coefficient (r^2) between the two series.
# `** 2` replaces DataFrame.applymap, which is deprecated since pandas 2.1.
ind_pop_df[['Death_Rate','Inf_Mort_Rate']].corr() ** 2

In [None]:
# Matplotlib 3.6 renamed the bundled 'seaborn' style sheet to 'seaborn-v0_8'
# (the old name was later removed); use whichever this installation ships.
seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
with plt.style.context(seaborn_style):
    plt.figure(figsize=(20,8))
    sns.regplot(x='Fert_Rate',y='Inf_Mort_Rate',data=ind_pop_df,color='k',ci=0)
    graph_customization(label_x='Fertility Rate', label_y='Infant Mortality Rate',grph_title='Relationship b/w Fertility Rate and Infant Mortality Rate',ticks_size=11,x_ticks_rot=0)

In [None]:
# Squared correlation coefficient (r^2) between the two series.
# `** 2` replaces DataFrame.applymap, which is deprecated since pandas 2.1.
ind_pop_df[['Fert_Rate','Inf_Mort_Rate']].corr() ** 2

##### **Overall, the Infant Mortality Rate has been reduced drastically from 1950 to 2019. And, it is highly correlated with the Birth, Death and Fertility Rates.**

### **``Plot-15``**
#### **``Let's see the trend of Infant Mortality Annual Change from 1950 to 2019.``**

In [None]:
# Matplotlib 3.6 renamed the bundled 'seaborn' style sheet to 'seaborn-v0_8'
# (the old name was later removed); use whichever this installation ships.
seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
with plt.style.context(seaborn_style):
    plt.figure(figsize=(20,8))
    sns.lineplot(x='Year',y='Inf_Mort_annual_change',data=ind_pop_df,palette=sns.color_palette('twilight'))
    graph_customization(label_x='Year', label_y='Infant Mortality Annual Change',grph_title='Trend of Infant Mortality Annual Change from 1950 to 2019',ticks_size=11,x_ticks_rot=75)
plt.show()

In [None]:
# Matplotlib 3.6 renamed the bundled 'seaborn' style sheet to 'seaborn-v0_8'
# (the old name was later removed); use whichever this installation ships.
seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
with plt.style.context(seaborn_style):
    plt.figure(figsize=(20,8))
    sns.lineplot(x='FR_annual_change',y='Inf_Mort_annual_change',data=ind_pop_df,palette=sns.color_palette('twilight'))
    # The x variable is the fertility-rate annual change, not the year, so the
    # axis label and title describe that relationship.
    graph_customization(label_x='Fertility Rate Annual Change', label_y='Infant Mortality Annual Change',grph_title='Relationship b/w Fertility Rate & Infant Mortality Annual Change',ticks_size=11,x_ticks_rot=75)
plt.show()

## ***Don't forget to upvote this notebook if you like the work..***

### ***Also, feel free to share any improvement ;)***