# **Famous Indian Women**

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import math
import datetime
from collections import Counter
from collections import OrderedDict

# Silence every Python warning for the rest of the session so that library
# notices do not clutter the notebook output.
warnings.filterwarnings('ignore')
# Make 'Set2' the default seaborn colour palette and apply the 'darkgrid'
# style to the figures drawn after this point.
sns.set_palette('Set2')
sns.set_style('darkgrid')

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and echo the full path of every file found,
# so the available datasets are visible in the cell output.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(folder, fname))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## **Load the Dataset**

In [None]:
# Read the raw CSV into the notebook-wide frame `df`, report its
# (rows, columns) shape, then preview the first five records.
df = pd.read_csv("/kaggle/input/7k-indian-famous-women/data.csv")
print(df.shape)
df.head(n=5)

## **NA values in each column**

In [None]:
# Summarise missing data per column: the absolute NA count and the share of
# all rows it represents.
na_count = df.isna().sum()
na_percent = na_count / df.shape[0] * 100

# Build the table column-wise (one row per original column) so 'NA Count'
# keeps its integer dtype; assembling it row-wise and transposing would mix
# ints and strings in every column and degrade both to object dtype.
df_NA = pd.DataFrame({
    'NA Count': na_count,
    'NA Percent': na_percent.map('{:.2f}%'.format),
})

# Shade the count column so the sparsest columns stand out.
df_NA.style.background_gradient(cmap="summer", subset=['NA Count'])

## **Drop the irrelevant columns with high NA %**

In [None]:
# Remove, in place, the columns judged irrelevant because of their high NA %.
sparse_columns = ['Education Place', 'Native Language', 'Father', 'Mother', 'Spouse']
df.drop(columns=sparse_columns, inplace=True)

## **Create a new feature - AGE** 

In [None]:
# Parse both date columns; anything that cannot be parsed becomes NaT.
df['Birth Date'] = pd.to_datetime(df['Birth Date'], errors='coerce')
df['Death Date'] = pd.to_datetime(df['Death Date'], errors='coerce')

# Work on calendar dates only: drop any timezone and the time-of-day part so
# the subtraction below is always between comparable, midnight-aligned values.
birth_day = df['Birth Date'].dt.tz_localize(None).dt.normalize()
death_day = df['Death Date'].dt.tz_localize(None).dt.normalize()

# Women without a death date are aged up to today; the rest up to their death.
today = pd.Timestamp.now().normalize()
end_day = death_day.fillna(today)

# Convert the lifespan to whole years using the mean Gregorian year length.
# Dividing by np.timedelta64(1, 'Y') is avoided because pandas rejects the
# year unit as an ambiguous duration. A missing birth date yields NaN.
days_per_year = 365.2425
df['Age'] = ((end_day - birth_day).dt.days / days_per_year).round()

# **EDA**

## **Women in different Job Roles**

In [None]:
# Horizontal bar chart of the 20 most frequent jobs, most common first.
plt.rcParams['font.size'] = 12
top_jobs = df['Job'].value_counts().index[:20]

plt.figure(figsize=(14, 8))
sns.countplot(data=df, y='Job', order=top_jobs)

# Both axis titles share the same emphasis.
axis_title_style = {'weight': 'bold', 'fontsize': 16}
plt.xlabel('Number of Women', **axis_title_style)
plt.ylabel('Job', **axis_title_style)
plt.show()

## **Women with different Job Description**

In [None]:
# Pie chart of the five most frequent descriptions. value_counts() is already
# sorted by descending frequency, so its first five rows are the top five.
top_descriptions = df['Description'].value_counts().head(5)

plt.figure(figsize=(8, 8))
plt.pie(
    x=top_descriptions.tolist(),
    labels=top_descriptions.index.tolist(),
    autopct='%1.1f%%',
    shadow=True,
    startangle=90,
)
plt.show()

## **Women born in different Countries**

In [None]:
# Horizontal bar chart of the 20 most frequent countries. The count axis is
# logarithmic so that less common countries remain visible.
plt.rcParams['font.size'] = 12
top_countries = df['Country'].value_counts().index[:20]

plt.figure(figsize=(14, 8))
sns.countplot(data=df, y='Country', order=top_countries)

axis_title_style = {'weight': 'bold', 'fontsize': 16}
plt.xlabel('Number of Women', **axis_title_style)
plt.ylabel('Country', **axis_title_style)
plt.xscale('log')
plt.show()

## **Women born across different timelines**

In [None]:
# Bar chart of how many women were born in each year, in chronological order.
births_per_year = df['Birth Date'].dt.year.value_counts().sort_index()

plt.figure(figsize=(20, 8))
plt.bar(births_per_year.index, births_per_year.values)

axis_title_style = {'weight': 'bold', 'fontsize': 16}
plt.xlabel('Year', **axis_title_style)
plt.ylabel('Number of Women', **axis_title_style)
plt.show()

In [None]:
# Pie chart of the ten most common birth years.
# dt.year is float whenever the column contains NaT, which would label the
# slices "1990.0"; drop the missing values and cast to int for clean labels.
birth_years = df['Birth Date'].dt.year.dropna().astype(int)

# value_counts() is sorted by descending frequency, so head(10) is the top ten.
top_birth_years = birth_years.value_counts().head(10)

plt.figure(figsize=(8, 8))
plt.pie(
    x=top_birth_years.tolist(),
    labels=top_birth_years.index.tolist(),
    autopct='%1.1f%%',
    shadow=True,
    startangle=90,
)
plt.show()

## **Age of Women across different Job Roles**

In [None]:
# Box plot of the age distribution for each of the 15 most frequent jobs.
top_jobs = df['Job'].value_counts().index[:15]

plt.figure(figsize=(14, 6))
sns.boxplot(data=df, x='Job', y='Age', order=top_jobs, palette="Set2")

# Job names are long, so print them vertically.
plt.xticks(rotation=90)

axis_title_style = {'weight': 'bold', 'fontsize': 16}
plt.xlabel("Job", **axis_title_style)
plt.ylabel("Age", **axis_title_style)
plt.show()

## **Deceased Women and their Cause of Death**

In [None]:
# Keep only the women with a known death date, then chart the four most
# frequent death methods among them.
has_death_date = df['Death Date'].notna()
df_temp = df[has_death_date].reset_index(drop=True)
top_methods = df_temp['Death Method'].value_counts().index[:4]

plt.rcParams['font.size'] = 12
plt.figure(figsize=(14, 6))
sns.countplot(data=df_temp, x='Death Method', order=top_methods)

axis_title_style = {'weight': 'bold', 'fontsize': 16}
plt.ylabel('Number of Women', **axis_title_style)
plt.xlabel('Death Method', **axis_title_style)
plt.show()

## **Age of Deceased Women with their Cause of Death**

In [None]:
# Box plot of age at death for the four most frequent death methods.
# Restrict the data to women with a known death date: for anyone without one,
# 'Age' was measured up to today rather than up to their death, so including
# those rows would distort the age-at-death distributions.
deceased = df[df['Death Date'].notna()]
top_methods = deceased['Death Method'].value_counts().index[:4]

plt.figure(figsize=(14, 6))
sns.boxplot(data=deceased, x='Death Method', y='Age', order=top_methods, palette="Set2")

axis_title_style = {'weight': 'bold', 'fontsize': 16}
plt.ylabel('Age', **axis_title_style)
plt.xlabel('Death Method', **axis_title_style)
plt.show()

# **Feel free to <span style="color:red"> Upvote </span> and give <span style="color:blue"> Feedback</span>**