In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

This EDA is divided into following sections:

1. [**Overview**](#Section_1)
1. [**Data Clean up**](#Section_2)
1. [**Feature Engineering** ](#Section_4)
1. [**Data Exploration**](#Section_3)


<a id='Section_1'></a>
# Overview

In [None]:
# Load the dataset from the Kaggle input directory and preview its first 10 rows.
df = pd.read_csv('../input/7k-indian-famous-women/data.csv')
df.head(10)

In [None]:
# Dimensions of the loaded data as (rows, columns).
df.shape

In [None]:
# Summary statistics for the columns of the loaded data.
df.describe()

Using above summary we can observe:
1. We have number of famous personalities with same name. This can be a coincidence or redundant data, we will explore it in Data Cleanup process.
1. We have huge number of Job and Descriptions related to job, to efficiently visualize those we will have to futher process and cleanup data.
1. We do not have birth dates for all the famous personalities.


In [None]:
# Data type pandas inferred for each column.
df.dtypes

As we can see, Birth Date and Death Date are both stored as object type rather than as dates; we will clean them up and convert them to a proper date format in the Data Cleanup section.

<a id='Section_2'></a>
# Data Cleanup

In [None]:
# Share of missing values per column, expressed as a percentage of all rows.
null_fraction = df.isna().sum() / len(df)
print('Percentage of Blank across each column:\n', 100 * null_fraction)

The above summary shows the percentage of null values in each column of our data. The following columns are mostly null:
1. Education Place
1. Native Language
1. Father
1. Mother
1. Spouse
1. Death Place
1. Death Method
1. Death Reason

Since the majority of values in all these columns are null, it makes little sense to use them for analysis or to impute the missing data.

To move further, I will drop all rows where Birth Date is null.

In [None]:
# Keep only the rows that have a Birth Date value.
has_birth_date = df['Birth Date'].notna()
df = df.loc[has_birth_date]

Based on the summary table above, we can clearly see repeated names. We need to investigate whether the data contains duplicate entries or whether some famous personalities genuinely share a name.

In [None]:
# Number of rows per Full Name, most frequent first, to spot repeated names.
df['Full Name'].value_counts()

In [None]:
# Report distinct values per column, the total row count, and how many rows
# are exact repeats of an earlier row.
unique_per_column = df.nunique()
row_count = len(df)
duplicate_count = df.duplicated().sum()  # keep='first' is the default

print("Unique Values per column:\n", unique_per_column)
print("\n Number of Rows in Data: %d \n" % row_count)
print("Number of Duplicated rows in Data %d" % duplicate_count)

As we can see, the data has a total of 7685 rows, of which 11 are redundant (exact duplicates). We will drop those rows in the next step and then investigate whether there are any other redundancies.

In [None]:
# Keep a snapshot of the frame as it was before de-duplication.
df1 = df.copy()

# Remove rows that are exact repeats of an earlier row, then show how many remain.
df = df.drop_duplicates(keep='first')
print(len(df))

There may be cases where the same name and the same profession appear more than once but some other fields have missing values; such cases would be missed by `drop_duplicates`.

In [None]:
# Show every row that shares its (Full Name, Job) pair with another row,
# ordered by name so the repeated entries sit next to each other.
shares_name_and_job = df.duplicated(subset=['Full Name', 'Job'], keep=False)
df[shares_name_and_job].sort_values('Full Name')

In [None]:
def clean_date(data, date):
    """Return the date strings in ``data[date]`` whose month and day are valid.

    Each value is split on '-' and the second and third pieces are read as
    the month and the day. A value is kept only when the month is in 1-12
    AND the day is in 1-31 (both bounds inclusive). Values that are missing,
    have fewer than three pieces, or have non-numeric month/day pieces are
    rejected instead of raising. A value with a leading '-' has its pieces
    shifted by one position and is therefore rejected as well.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the column to check.
    date : str
        Name of the column holding 'YYYY-MM-DD' style strings.

    Returns
    -------
    pandas.Series
        The original strings that passed the check, named 'date' and
        carrying the index labels of their rows in ``data``.
    """
    pieces = data[date].str.split('-', expand=True)
    # Make sure the month/day positions exist even when no value had three pieces.
    pieces = pieces.reindex(columns=range(3))

    # errors='coerce' turns missing or non-numeric pieces into NaN, which
    # fails the range checks below rather than crashing the conversion.
    month = pd.to_numeric(pieces[1], errors='coerce')
    day = pd.to_numeric(pieces[2], errors='coerce')

    # Both parts must be plausible; between() includes both end points, so
    # December and the 31st are accepted.
    is_valid = month.between(1, 12) & day.between(1, 31)
    return data.loc[is_valid, date].rename('date')

In [None]:
# Birth Date: keep only rows whose value passes clean_date, then parse it.
# The result of clean_date is reused instead of being computed a second time,
# and .copy() makes the filtered frame independent so the column assignment
# below does not write into a slice of the previous frame.
x = clean_date(df, 'Birth Date')
df = df.loc[df['Birth Date'].isin(list(x)),].copy()
df['Birth Date'] = pd.to_datetime(df['Birth Date'], format='%Y-%m-%d', errors='coerce')

# Death Date: only the non-null values are checked; rows with no Death Date
# are kept as they are.
x = clean_date(df.loc[df['Death Date'].notna(),], 'Death Date')
df = df.loc[(df['Death Date'].isin(list(x)) | df['Death Date'].isna()),].copy()
df['Death Date'] = pd.to_datetime(df['Death Date'], format='%Y-%m-%d', errors='coerce')

<a id='Section_4'></a>
# Feature Engineering 

Now that we have cleaned the Birth and Death dates, we can calculate Age.
When the Death Date is blank, I use the current date to calculate the age.

In [None]:
# Age in years, measured up to the Death Date or, when that is missing, up to
# today's date (midnight, so only whole days are counted).
age_end = df['Death Date'].fillna(pd.Timestamp.now().normalize())

# Subtract the datetime columns directly and convert whole days to years.
# A year is not a fixed number of days, so current NumPy/pandas refuse to
# convert np.timedelta64(1, 'Y') to days; the average Gregorian year length
# (365.2425 days) is used explicitly instead.
df['Age'] = (age_end - df['Birth Date']).dt.days / 365.2425

We will also create a feature based on Death Date, if the person is Alive or Dead.

In [None]:
# Flag each row 'Yes' when no Death Date is present and 'No' otherwise,
# then show how many rows fall into each group.
no_death_date = df['Death Date'].isna()
df['Alive'] = np.where(no_death_date, 'Yes', 'No')
df['Alive'].value_counts()

In [None]:
# Number of distinct values (a missing value counts as one) in the two free-text columns.
for column in ('Description', 'Job'):
    distinct_values = df[column].unique()
    print(len(distinct_values))

We can bucket different Jobs into slightly generic categories which will help us visualise and generalize data in a better way.

In [None]:
# Maps each broad industry label to the Job titles that belong to it.
# Every title is its own comma-separated string: adjacent string literals
# without a comma are silently concatenated by Python (e.g. 'Nun''Aryika'
# becomes 'NunAryika'), which makes both titles unmatchable.
industry_map = {
    'Art': [
        'Poet', 'Model', 'Writer', 'Actor', 'Comedian', 'Art Historian',
        'Stage Actor', 'Film Director', 'Printmaker', 'Film Actor', 'Author',
        'Musician', 'Singer', 'Television Actor', 'Choreographer', 'Voice Actor',
        'Film Producer', 'Dancer', 'Sculptor', 'Composer', 'Screenwriter',
        'Playback Singer', 'Documentary Filmmaker', 'Fashion Designer',
        'Songwriter', 'Comedy Writer', "Children'S Writer", 'Filmmaker',
        'Character Actor', 'Artist', 'Television Producer', 'Cinematographer',
        'Director', 'Photographer', 'Lyricist', 'Film Critic', 'Theater Director',
        'Painter', 'Wardrobe Stylist', 'Film Editor', 'Designer', 'Musicologist',
        'Biographer', 'Singer-Songwriter', 'Flautist', 'Violinist',
        'Costume Designer', 'Drummer', 'Textile Artist', 'Magician',
        'Interior Designer', 'Youtuber', 'Internet Celebrity', 'Speaker',
        'Puppeteer', 'Fashion Model', 'Pornographic Actor', 'Celebrity',
        'Stand-Up Comedian', 'Entertainer', 'Short Story Writer',
        'Cartoonist', 'Folklorist',
        'String Musician', 'Vj', 'Music Director', 'Art Critic', 'Percussionist',
        'Socialite', 'Motivational Speaker',
        'Make-Up Artist', 'Personal Stylist', 'Playwright', 'Impressionist',
        'Autobiographer', 'Critic',
    ],
    'Sport': [
        'Cricket Umpire', 'Figure Skater', 'Tennis Player', 'Boxer',
        'Badminton Player', 'Alpine Skier', 'Chess Player', 'Archer', 'Cricketer',
        'Triathlete', 'Sport Shooter', 'Martial Artist', 'Judoka',
        'Field Hockey Player', 'Squash Player', 'Swimmer', 'Table Tennis Player',
        'Weightlifter', 'Netballer', 'Sprinter', 'Amateur Wrestler',
        'Artistic Gymnast', 'Association Football Player', 'Coach',
        'Taekwondo Athlete', 'Kabaddi Player', 'Athlete', 'Golfer',
        'Motorcycle Racer', 'Racewalker', 'Basketball Player', 'Basketball Coach',
        'Middle-Distance Runner', 'Rower', 'Sport Cyclist', 'Player', 'Kickboxer',
        'Bodybuilder', 'Discus Thrower', 'Volleyball Player',
        'Racing Automobile Driver', 'Karateka', 'Handball Player',
        'Rhythmic Gymnast', 'Triple Jumper', 'Long Jumper', 'Skydiver', 'Hurdler',
        'Long-Distance Runner', 'Athletics Competitor', 'Mountaineer',
        'Snooker Player', 'Racing Driver', 'Heptathlete', 'Fencer',
    ],
    'Politics': ['Politician', 'Political Activist'],
    'Media': [
        'Editor', 'Journalist', 'News Presenter', 'Columnist',
        'Television Presenter', 'Presenter', 'Impresario',
    ],
    'Activist': [
        'Conservationist', 'Cultural Activist', 'Lgbtiq+ Rights Activist',
        'Revolutionary', 'Human Rights Activist', 'Activist', 'Public Figure',
        'Disability Rights Activist', 'Climate Activist', 'Environmentalist',
    ],
    'Science': [
        'Particle Physicist', 'Lichenologist', 'Science Writer',
        'Aerospace Engineer', 'Astrophysicist', 'Marine Biologist', 'Researcher',
        'Landscape Architect', 'Entomologist', 'Zoologist', 'Anthropologist',
        'Engineer', 'Architect', 'Scientist', 'Physicist', 'Chemist', 'Botanist',
    ],
    'Academic': [
        'Essayist', 'Literary Scholar', 'Philosopher', 'Academic', 'Translator',
        'Novelist', 'Linguist',
    ],
    'Medical': [
        'Oncologist', 'Immunologist', 'Biophysicist', 'Biophysics', 'Nurse',
        'Cardiologist', 'Pediatrician', 'Biotechnologist', 'Medical Researcher',
        'Hematologist', 'Pathologist', 'Ophthalmologist', 'Epidemiologist',
        'Gastroenterologist', 'Microbiologist', 'Surgeon', 'Cell Biologist',
        'Dietitian', 'Psychologist', 'Gynaecologist', 'Virologist', 'Physician',
        'Biochemist', 'Biologist', 'Physiologist', 'Clinical Psychologist',
    ],
    # NOTE(review): 'Neuroscientist' is listed under Food in the original
    # mapping; left unchanged here - confirm whether that is intended.
    'Food': [
        'Food Critic', 'Farmer', 'Brewmaster', 'Neuroscientist', 'Cook', 'Chef',
        'Horticulturist',
    ],
    'Business': [
        'Social Entrepreneur', 'Executive', 'Chief Executive Officer',
        'Trade Unionist', 'Enterprise Resource Planning', 'Banker',
        'Industrialist', 'Businessperson', 'Organizational Founder',
        'Entrepreneur', 'Business Executive',
    ],
    'Transport': [
        'Train Driver', 'Travel Writer', 'Explorer', 'Aircraft Pilot',
        'Flight Attendant', 'Sailor',
    ],
    'Beauty': ['Supermodel', 'Hairdresser', 'Beauty Pageant Contestant'],
    'Social Worker': [
        'Philanthropist', 'Sociologist', 'Humanitarian', 'Social Worker',
        'Social Service', 'Freedom Fighter', 'Social Activist', 'Literary Critic',
    ],
    # The key spelling 'Relegion' is kept as in the original so that existing
    # Industry values and chart labels do not change.
    'Relegion': [
        'Mystic', 'Priest', 'Bhikkhuni', 'Imam', 'Nun', 'Aryika',
        'Religious Sister',
    ],
    'Education': [
        'Adjunct Professor', 'Pedagogue', 'Head Teacher', 'Educator',
        'Assistant Professor', 'Lecturer', 'Professor', 'Yoga Instructor',
        'Sanskrit Scholar', 'Guru', 'Teacher', 'Theologian', 'Ecologist',
        'Music Pedagogue', 'Yogi', 'University Teacher', 'Mathematician',
        'Specialized Educator', 'Full Professor', 'Historian', 'Economist',
    ],
    'Military': [
        'Military Officer', 'Military Leader', 'Soldier', 'Military Personnel',
        'Peshmerga',
    ],
    'Government': [
        'Attorney At Law', 'Jurist', 'Officer', 'Spokesperson', 'Monarch',
        'Civil Servant', 'Judge', 'Diplomat', 'Police Officer', 'Lawyer',
        'Magistrate', 'Advocate',
    ],
    'Other': [
        'Ama De Casa', 'Feminist', 'Princess', 'Textile Designer', 'Student',
        'Pretender', 'Mnemonist', 'Potter',
    ],
    'IT': [
        'Software Engineer', 'Graphic Designer', 'Animator', 'Programmer',
        'Computer Scientist',
    ],
}

In [None]:
# Invert industry_map into a Job -> Industry lookup. Building it in map order
# means that if a title were listed under several industries, the last one
# listed wins.
job_to_industry = {
    job: industry
    for industry, jobs in industry_map.items()
    for job in jobs
}

# One vectorised lookup replaces the per-row loop. Jobs that are missing or
# not present in the mapping become 'Unknown', and the Industry column is
# always created, even when no job matches at all.
df['Industry'] = df['Job'].map(job_to_industry).fillna('Unknown')

<a id='Section_3'></a>
# Data Exploration 

In [None]:
# Series.min()/max() skip missing ages; the builtin min()/max() compare
# element by element and can return NaN when the column contains NaN, which
# then matches no row and cannot be formatted with %d.
youngest_age = df['Age'].min()
oldest_age = df['Age'].max()

# Names are joined with newlines in case several people share the extreme age.
youngest_names = df.loc[df['Age'] == youngest_age, 'Full Name'].str.cat(sep='\n')
oldest_names = df.loc[df['Age'] == oldest_age, 'Full Name'].str.cat(sep='\n')

print("Youngest Famous Women in our dataset: %s with current age %d\n" % (youngest_names, youngest_age))

print("Oldest Famous Women in our dataset: %s with current age %d\n" % (oldest_names, oldest_age))

Distribution By Country

In [None]:

# Bar chart of the number of rows per Country, followed by the five most
# frequent countries as text.
sns.set(rc={'figure.figsize': (19, 8)})
sns.countplot(data=df, x='Country')
plt.xticks(rotation=45)
plt.title("Famous Women By Countries")
plt.show()

country_counts = df['Country'].value_counts()
print("Top 5 Countries")
print(country_counts[:5])

As the dataset suggests, the majority of famous personalities are from India, with a few from the USA and the UK.
Some entries have "British India" as their Country; I will replace it with India, since women born before Independence have British India recorded as their country.

In [None]:
# Fold 'British India' into 'India' so both are counted as one country.
is_british_india = df['Country'] == 'British India'
df.loc[is_british_india, 'Country'] = 'India'

Age Distributions By Industry

In [None]:
# Two side-by-side panels (box plot and bar plot) of Age per Industry,
# restricted to rows whose Alive flag is not 'No'.
sns.set(rc={'figure.figsize': (19, 8)})
fig, (ax1, ax2) = plt.subplots(ncols=2)

alive_rows = df.loc[df['Alive'] != 'No',]
sns.boxplot(data=alive_rows, x='Industry', y='Age', ax=ax1)
sns.barplot(data=alive_rows, x='Industry', y='Age', ax=ax2)

# Slant the industry labels so they do not overlap.
for panel in (ax1, ax2):
    panel.set_xticklabels(panel.get_xticklabels(), rotation=40, ha="right")

plt.tight_layout()
plt.suptitle("Age Distribution By Industry (Alive)", size=20)
# Leave room at the top of the figure for the overall title.
plt.subplots_adjust(top=0.88)
plt.show()

In [None]:
# Two side-by-side panels (box plot and bar plot) of Age per Industry,
# restricted to rows whose Alive flag is 'No'.
sns.set(rc={'figure.figsize': (19, 8)})
fig, (ax1, ax2) = plt.subplots(ncols=2)

deceased_rows = df.loc[df['Alive'] == 'No',]
sns.boxplot(data=deceased_rows, x='Industry', y='Age', ax=ax1)
sns.barplot(data=deceased_rows, x='Industry', y='Age', ax=ax2)

# Slant the industry labels so they do not overlap.
for panel in (ax1, ax2):
    panel.set_xticklabels(panel.get_xticklabels(), rotation=40, ha="right")

plt.tight_layout()
plt.suptitle("Age Distribution By Industry (Deceased)", size=20)
# Leave room at the top of the figure for the overall title.
plt.subplots_adjust(top=0.88)
plt.show()

From the chart above we can clearly observe that:
1. Women from the Military industry have the lowest median age, which is to be expected.
1. Surprisingly, women associated with Social Work, Academics and the Medical industry have some of the highest median ages.
1. The IT industry is missing from the chart because IT is a very recent, booming industry in India; the women from IT in our dataset are all still alive.

In [None]:
# Three panels for rows whose Alive flag is 'No': count per Death Method
# (left), Age box plot per Death Method (middle) and Age bar plot per
# Death Method (right), followed by text summaries.
sns.set(rc={'figure.figsize': (25, 8)})
fig, (ax3, ax1, ax2) = plt.subplots(ncols=3)

deceased_rows = df.loc[df['Alive'] == 'No',]

sns.boxplot(data=deceased_rows, x='Death Method', y='Age', ax=ax1)
sns.barplot(data=deceased_rows, x='Death Method', y='Age', ax=ax2)
sns.countplot(data=deceased_rows, x='Death Method', ax=ax3)

# Slant the category labels and title each panel.
panel_titles = (
    (ax1, 'Age Distribution By Death Method'),
    (ax2, 'Age Distribution By Death Method'),
    (ax3, 'Count By Death Method'),
)
for panel, panel_title in panel_titles:
    panel.set_xticklabels(panel.get_xticklabels(), rotation=40, ha="right")
    panel.set_title(panel_title)

plt.tight_layout()
plt.suptitle("Age Distribution By Death Method (Deceased)", size=20)
# Leave room at the top of the figure for the overall title.
plt.subplots_adjust(top=0.88)
plt.show()

# Share of each Death Method among the rows that have one recorded.
death_methods = df.loc[df['Alive'] == 'No', 'Death Method']
print("Percentage of Deaths By Death Methods (Excludes Blank Entries)")
print(100 * (death_methods.value_counts() / death_methods.count()))

print("\n Median Age:\n")
print(deceased_rows.groupby(['Death Method'])['Age'].median())
1. The majority of deaths are due to natural causes. The median age at death seems to be around **63.4** years.
1. Suicide is the second most common death method and has the lowest median age, **30** years.


**To Be continued**