# Module Imports

In [2]:
import numpy as np
import pandas as pd

# Read data

In [3]:
df = pd.read_csv("superhero-set/heroes_information.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,name,Gender,Eye color,Race,Hair color,Height,Publisher,Skin color,Alignment,Weight
0,0,A-Bomb,Male,yellow,Human,No Hair,203.0,Marvel Comics,-,good,441.0
1,1,Abe Sapien,Male,blue,Icthyo Sapien,No Hair,191.0,Dark Horse Comics,blue,good,65.0
2,2,Abin Sur,Male,blue,Ungaran,No Hair,185.0,DC Comics,red,good,90.0
3,3,Abomination,Male,green,Human / Radiation,No Hair,203.0,Marvel Comics,-,bad,441.0
4,4,Abraxas,Male,blue,Cosmic Entity,Black,-99.0,Marvel Comics,-,bad,-99.0


# Dropping use less Unnamed column

In [4]:
df.drop('Unnamed: 0', axis=1, inplace=True)

# Data cleaning

In [5]:
# Check data type of height and weight
df[['Height', 'Weight']].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 734 entries, 0 to 733
Data columns (total 2 columns):
Height    734 non-null float64
Weight    732 non-null float64
dtypes: float64(2)
memory usage: 11.5 KB


In [6]:
# In Height there are no missing values, as it has 734 float values
# In weight there are two missing values as it has 732 non-null float values
df[df['Weight'].isna()]

Unnamed: 0,name,Gender,Eye color,Race,Hair color,Height,Publisher,Skin color,Alignment,Weight
286,Godzilla,-,-,Kaiju,-,108.0,,grey,bad,
389,King Kong,Male,yellow,Animal,Black,30.5,,-,good,


In [7]:
# replaced Nan with -99
df.loc[df['Weight'].isna(), 'Weight'] = -99

df.iloc[[286,389],:]

Unnamed: 0,name,Gender,Eye color,Race,Hair color,Height,Publisher,Skin color,Alignment,Weight
286,Godzilla,-,-,Kaiju,-,108.0,,grey,bad,-99.0
389,King Kong,Male,yellow,Animal,Black,30.5,,-,good,-99.0


In [8]:
# replace -99, other missing values with median value
df['Height'].replace(-99.0, np.median(df['Height']), inplace= True)

In [9]:
df['Weight'].replace(-99.0, np.median(df['Weight']), inplace=True)

# Calculating BMI

# $BMI = \frac{Weight_{kg}}{Height_{m}^2}$

In [10]:
# Domain knowledge guess 
# Height is in Centi-Meters
# Converting Height to Meters

df['BMI'] = np.divide(df['Weight'] , np.square(df['Height']/100))

In [11]:
fit_heroes = df[(df['BMI'] < 30) & (df['BMI']>20)]
fit_heroes.sort_values(by='BMI', ascending=False)[['name','BMI']].reset_index().drop('index',axis=1).head()

Unnamed: 0,name,BMI
0,Apocalypse,29.756001
1,Captain Britain,29.588817
2,Brother Voodoo,29.561946
3,Cottonmouth,29.561946
4,Maverick,29.530994


In [14]:
obese_heroes = df.loc[( df['BMI'] > 30) & (df['Alignment'] == 'good'), ['name','BMI']]

In [15]:
obese_heroes.head()

Unnamed: 0,name,BMI
0,A-Bomb,107.015458
36,Aqualad,33.455372
37,Aquaman,42.658875
42,Ares,78.889701
47,Atlas,30.159157


In [16]:
# top 5 obese heroes

obese_heroes.sort_values(by='BMI', ascending=False).head(5)

Unnamed: 0,name,BMI
389,King Kong,666.487503
422,Machine Man,114.365911
0,A-Bomb,107.015458
331,Hulk,105.818328
575,Sasquatch,96.748186


In [20]:
over_weight_heroes= df.loc[( df['BMI'] > 25) & ( df['BMI'] <= 30) & (df['Alignment'] == 'good'), ['name','BMI']]

In [18]:
over_weight_heroes.sort_values(by='BMI', ascending=False).head(5)

Unnamed: 0,name,BMI
150,Captain Britain,29.588817
137,Brother Voodoo,29.561946
133,Box III,29.530994
438,Maverick,29.530994
663,Thunderbird,29.510592


In [21]:
# Normal weight
Normal_weight_heroes = df.loc[( df['BMI'] > 18) & ( df['BMI'] <= 25) & (df['Alignment'] == 'good'), ['name','BMI']]

In [22]:
Normal_weight_heroes.sort_values(by='BMI', ascending=False).head(5)

Unnamed: 0,name,BMI
525,Power Girl,25.0
262,Flash,25.0
342,Ink,25.0
473,Morph,24.93372
589,Shang-Chi,24.93372


In [23]:
# under weight heroes
underweight_heroes = df.loc[( df['BMI'] <= 18) & (df['Alignment'] == 'good'), ['name','BMI']]

In [24]:
underweight_heroes.sort_values(by='BMI', ascending=False).head()

Unnamed: 0,name,BMI
370,Jolt,17.998163
524,Polaris,17.99308
1,Abe Sapien,17.817494
190,Crystal,17.71542
588,Shadowcat,17.71542
