In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subdirs, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# Any results you write to the current directory are saved as output.

In [None]:
import numpy as np
import pandas as pd
# Load the demography CSV into the working frame `df`.
df = pd.read_csv(
    filepath_or_buffer='/kaggle/input/russian-demography/russian_demography.csv'
)

In [None]:
# Preview the first five rows to see the columns and some sample values.
df.head()

In [None]:
# Print the frame summary: column names, non-null counts, dtypes and memory usage.
df.info()

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Show the dtype pandas inferred for each column.
df.dtypes

In [None]:
# How many rows each region contributes to the data set.
region_counts = df['region'].value_counts()
region_counts

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Show the first five rows again as a reminder of the available columns.
df.head()

In [None]:
# (number of rows, number of columns) of the loaded frame.
df.shape

In [None]:
##Let's check average birth and death rates over the years

In [None]:
# Bar chart of the mean death rate for each year (averaged over that year's rows).
mean_death_rate_by_year = df.groupby(['year'])[['death_rate']].mean()
mean_death_rate_by_year.plot(kind='bar', figsize=(20, 10))
plt.show()

In [None]:
# Bar chart of the mean birth rate for each year (averaged over that year's rows).
mean_birth_rate_by_year = df.groupby(['year'])[['birth_rate']].mean()
mean_birth_rate_by_year.plot(kind='bar', figsize=(20, 10))
plt.show()

In [None]:
# New column: birth rate minus death rate for every row.
df['rate_diff'] = df['birth_rate'].sub(df['death_rate'])

In [None]:
##Plotting the difference in the birth and death rate, which will show us an actual increase/decrease in the population
#over the years

In [None]:
# Bar chart of the mean birth-minus-death rate difference for each year.
mean_rate_diff_by_year = df.groupby(['year'])[['rate_diff']].mean()
mean_rate_diff_by_year.plot(kind='bar', figsize=(20, 10))
plt.show()

In [None]:
#After 1991, Russia's population has been on a huge decline.
##From 2012 to 2016 it started increasing a bit, but 2017 showed a decrease yet again

In [None]:
# Preview the first five rows; the frame now also carries the rate_diff column.
df.head()

In [None]:
#Now let's check the natural population growth per 1000 people

In [None]:
# Bar chart of the mean natural population growth (npg) for each year.
mean_npg_by_year = df.groupby(['year'])[['npg']].mean()
mean_npg_by_year.plot(kind='bar', figsize=(20, 10))
plt.show()

In [None]:
##So, npg is same as rate_diff

In [None]:
#Let's check the same stats filtered by region

In [None]:
# Bar chart of the mean natural population growth (npg) for each region.
mean_npg_by_region = df.groupby(['region'])[['npg']].mean()
mean_npg_by_region.plot(kind='bar', figsize=(20, 10))
plt.show()

In [None]:
# Same per-region mean npg, ordered from the highest value to the lowest.
mean_npg_by_region = df.groupby(['region'])[['npg']].mean()
mean_npg_by_region.sort_values('npg', ascending=False).plot(kind='bar', figsize=(20, 10))

In [None]:
##We can see where the population has increased in Russia, and where it has decreased extremely
#In the Chechen Republic, the population has increased extensively, whereas in Pskov Oblast, it has decreased extremely
#A few places like Kamchatka Krai have seen almost no change in population

In [None]:
# Preview the first five rows before looking at migratory_growth.
df.head()

In [None]:
# Mean migratory growth per region, ordered from the highest value to the lowest.
mean_migration_by_region = df.groupby(['region'])[['migratory_growth']].mean()
mean_migration_by_region.sort_values('migratory_growth', ascending=False).plot(kind='bar', figsize=(20, 10))

In [None]:
# Fraction of rows whose migratory_growth value is missing.
missing_migratory_rows = df['migratory_growth'].isna().sum()
missing_migratory_rows / len(df)

In [None]:
##Since migratory has very few null values, let's check for all variables

In [None]:
# Fraction of missing values in every column.
missing_per_column = df.isna().sum()
missing_per_column / len(df)

In [None]:
#Only migratory growth has so many nulls
#let's check how migratory growth is related to npg, then we can take a decision, to impute or remove nulls

In [None]:
# Annotated heatmap of the pairwise correlations between the numeric columns.
# `df` also carries the non-numeric `region` column, and DataFrame.corr()
# raises on non-numeric data in pandas >= 2.0 (older versions silently dropped
# such columns), so restrict the frame to numeric dtypes before correlating.
plt.figure(figsize=(20,10))
numeric_corr = df.select_dtypes(include='number').corr()
sns.heatmap(numeric_corr,annot=True)

In [None]:
# Remove the helper column rate_diff from the working frame.
df = df.drop(labels='rate_diff', axis='columns')

In [None]:
##Death rate and NPG have high negative correlation, and that is perfectly valid

In [None]:
# Scatter plot of migratory_growth against npg on a 20x10 inch figure.
fig, ax = plt.subplots(figsize=(20, 10))
sns.scatterplot(x='npg', y='migratory_growth', data=df, ax=ax)

In [None]:
##As per the above, we can actually impute migratory growth with some value

In [None]:
# Median of the non-missing migratory_growth values.
migratory_growth_median = df['migratory_growth'].median()
migratory_growth_median

In [None]:
#Median imputation seems fine for now

In [None]:
# Median-impute the missing migratory_growth values.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on df['migratory_growth']: that chained in-place call
# operates on an intermediate Series, which pandas 2.x warns about and which
# no longer updates `df` when Copy-on-Write is enabled.
migratory_growth_median = df['migratory_growth'].median()
df['migratory_growth'] = df['migratory_growth'].fillna(migratory_growth_median)

In [None]:
#Checking plots again

In [None]:
# Mean migratory growth per region after imputation, highest value first.
mean_migration_by_region = df.groupby(['region'])[['migratory_growth']].mean()
mean_migration_by_region.sort_values('migratory_growth', ascending=False).plot(kind='bar', figsize=(20, 10))

In [None]:
##We see that the plot has changed, and it does show some obvious truths, because we understand that it is quite normal
#for people to shift to Moscow, it being the capital

In [None]:
# Scatter plot of migratory_growth against npg, drawn again after imputation.
fig, ax = plt.subplots(figsize=(20, 10))
sns.scatterplot(x='npg', y='migratory_growth', data=df, ax=ax)

In [None]:
##Scatter plot still looks almost the same, which means the pattern was preserved

In [None]:
#Let's check nulls again

In [None]:
# Missing values per column after the median imputation.
df.isna().sum()

In [None]:
##Also, we could do one more thing, since the data is based on time

In [None]:
#We can impute using methods such as forward fill, so we will be doing that now

In [None]:
# Re-read the CSV so `df` holds the data as stored on disk again.
df = pd.read_csv(
    filepath_or_buffer='/kaggle/input/russian-demography/russian_demography.csv'
)

In [None]:
# Missing values per column in the freshly loaded frame.
df.isna().sum()

In [None]:
# Forward-fill every column: each missing value takes the last non-null value
# above it in the same column; nulls at the very top of a column stay null.
# DataFrame.ffill() replaces fillna(method='ffill'), whose `method` argument is
# deprecated in recent pandas releases.
# NOTE(review): the fill follows row order only and is not grouped by region,
# so a value may be carried over from a row belonging to a different region.
df = df.ffill()

In [None]:
# Missing values per column after the forward fill.
df.isna().sum()

In [None]:
##We will have to backfill for migratory_growth, because it starts with null itself

In [None]:
# Re-read the CSV once more to start the fill from the data as stored on disk.
df = pd.read_csv(
    filepath_or_buffer='/kaggle/input/russian-demography/russian_demography.csv'
)

In [None]:
# Missing values per column in the freshly loaded frame.
df.isna().sum()

In [None]:
# Back-fill migratory_growth: each missing value takes the next non-null value
# below it in the column.
# The result is assigned back instead of calling
# fillna(method='bfill', inplace=True) on df['migratory_growth']: the chained
# in-place call works on an intermediate Series (it does not update `df` under
# Copy-on-Write), and the `method` argument of fillna is deprecated in recent
# pandas releases.
# NOTE(review): the fill follows row order only and is not grouped by region.
df['migratory_growth'] = df['migratory_growth'].bfill()

In [None]:
# Missing values per column after back-filling migratory_growth.
df.isna().sum()

In [None]:
# Forward-fill the remaining gaps in every column: each missing value takes the
# last non-null value above it in the same column.
# DataFrame.ffill() replaces fillna(method='ffill'), whose `method` argument is
# deprecated in recent pandas releases.
# NOTE(review): the fill follows row order only and is not grouped by region.
df = df.ffill()

In [None]:
##Now, our data should be good enough for initial analysis

In [None]:
# Preview the first five rows of the filled frame.
df.head()

In [None]:
##We can do one more thing, that is fill null values as per the regions. We will try that a bit later

In [None]:
# Mean npg per region on the filled data, ordered from highest to lowest.
mean_npg_by_region = df.groupby(['region'])[['npg']].mean()
mean_npg_by_region.sort_values('npg', ascending=False).plot(kind='bar', figsize=(20, 10))

In [None]:
##Now,this clearly shows that a few regions have been extensively occupied by people, and people have probably migrated from
#other regions, and isn't that very obvious in today's World?

In [None]:
##Let's check what happened over the years

In [None]:
# Bar chart of the mean npg for each year on the filled data.
mean_npg_by_year = df.groupby(['year'])[['npg']].mean()
mean_npg_by_year.plot(kind='bar', figsize=(20, 10))
plt.show()

In [None]:
##So, it is true that Russia's population has been on a decline lately

In [None]:
##Let's use a pairplot, to try to find out what affects what!

In [None]:
# Grid of pairwise plots across the columns of `df`.
sns.pairplot(df)

In [None]:
##We can see that migratory growth is basically a constant for most part of it, and quite obviously,
#npg is positively correlated to birth_rate and negatively to the death_rate