In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.graph_objects as go

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the Kaggle input directory and print the full path of each file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        full_path = os.path.join(dirname, filename)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the dataset CSV, using its 'name' column as the row index.
DATA_PATH = '../input/superhero-api-dataset/data.csv'
df = pd.read_csv(DATA_PATH, index_col='name')

In [None]:
# Summary statistics of the loaded frame, shown as this cell's output.
df.describe()

## Check for missing values

In [None]:
def plot_missing_value_cols(df: pd.DataFrame, width: float = 12, height: float = 6) -> None:
    """Draw a bar chart of the number of missing values in each column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are checked for missing entries (via ``isna``).
    width : float, default 12
        Figure width in inches.
    height : float, default 6
        Figure height in inches.

    Returns
    -------
    None
        The chart is drawn on a newly created matplotlib figure.
    """
    # Name both output columns explicitly. A bare reset_index() labels the
    # first column after the column axis name (falling back to "index"), so a
    # frame with a named column axis would break the x= lookup below.
    n_missing_df = (
        df.isna()
        .sum()
        .rename_axis("column")
        .reset_index(name="n_rows_missing")
    )

    # Explicit Axes instead of the implicit pyplot "current figure".
    _, ax = plt.subplots(figsize=(width, height))
    sns.barplot(data=n_missing_df, x="column", y="n_rows_missing", ax=ax)

    # Restyle the tick labels that are already in place rather than re-setting
    # their text just to change rotation and font.
    plt.setp(
        ax.get_xticklabels(),
        rotation=45,
        horizontalalignment='right',
        fontweight='light',
        fontsize='large',
    )

In [None]:
# Chart the per-column missing-value counts of the loaded frame (default figure size).
plot_missing_value_cols(df)

In [None]:
# List the column names that contain the 'biography__aliases' marker.
alias_marker = 'biography__aliases'
[column_name for column_name in df.columns if alias_marker in column_name]

## Looks like most of the missing values come from the "biography__aliases__#" columns
### Removing columns with > 200 missing values

In [None]:
# Drop every column that has more than 200 missing entries.
missing_per_column = df.isna().sum()
sparse_columns = missing_per_column[missing_per_column > 200].index
df = df.drop(columns=sparse_columns)

## Check for duplicated heroes

In [None]:
# Report the index names that occur more than once, together with their counts.
# Every row is considered (not only rows free of missing values), so duplicated
# heroes that have a gap in some column are reported as well.
print('duplicated heroes: ')
name_counts = df.index.value_counts()
name_counts[name_counts > 1]

In [None]:
# Collapse repeated hero names so that each name appears once in the index.
# NOTE(review): GroupBy.first() takes the first non-null value per column, so the
# row kept for a duplicated hero may combine values from several records rather
# than being the first record verbatim — confirm this merging is intended.
df = df.groupby(df.index).first()

### Check distribution of powerstats

In [None]:
# Restrict the frame to its float64 columns and keep their names for later plotting.
numeric_df = df.select_dtypes(include='float64')
numeric_columns = numeric_df.columns
print(numeric_columns)

In [None]:
# Overlay the kernel density estimate of every selected column on a single Axes.
fig, ax = plt.subplots(figsize=(15, 8))
for col in numeric_columns:
    # fill= replaces the deprecated shade= keyword of sns.kdeplot.
    sns.kdeplot(numeric_df[col], label=col, fill=True, ax=ax)

# Title, axis label and legend describe the whole figure, so they are set once
# after the loop; setting the title per column left only the last name showing.
ax.set_title('Distribution of powerstats')
ax.set_xlabel('Points')
ax.legend()
plt.show()

# Radar chart for superhero comparison

In [None]:
def plot_radar_chart(numeric_df, numeric_columns, hero_name1, hero_name2, max_value=110):
    """Show a radar (polar) chart comparing two rows of ``numeric_df``.

    Parameters
    ----------
    numeric_df : pandas.DataFrame
        Frame indexed by hero name that holds the values to plot.
    numeric_columns : iterable of column labels
        Columns of ``numeric_df`` used as the radar axes; a pandas Index or a
        plain list both work.
    hero_name1, hero_name2 : index labels
        Rows of ``numeric_df`` to compare. Each is drawn as one filled trace
        and labelled with its name in the legend.
    max_value : float, default 110
        Upper bound of the radial axis.

    Returns
    -------
    None
        The figure is displayed with ``fig.show()``.

    Raises
    ------
    KeyError
        If a hero name or a requested column is not present in ``numeric_df``.
    """
    categories = list(numeric_columns)

    fig = go.Figure()

    for hero_name in (hero_name1, hero_name2):
        # Select exactly the plotted columns, in the plotted order, so every
        # radial value lines up with its theta label even when numeric_df
        # carries extra or differently ordered columns.
        hero_stats = numeric_df.loc[hero_name, categories]
        fig.add_trace(go.Scatterpolar(
            r=hero_stats.values,
            theta=categories,
            fill='toself',
            name=hero_name,
        ))

    fig.update_layout(
        polar=dict(
            radialaxis=dict(
                visible=True,
                range=[0, max_value],
            ),
        ),
        showlegend=True,
    )

    fig.show()
    

### Hulk is Superior to Spider-Man in every way (almost)!

![Hulk is Superior to Spider-Man in every way (almost)](https://i.imgur.com/WPV31Sg.jpg)

In [None]:
# Compare the 'Spider-Man' and 'Hulk' rows across the columns in numeric_columns.
plot_radar_chart(numeric_df,numeric_columns,'Spider-Man','Hulk')