# Feature related to Illness in the US

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 

import plotly.express as px
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)

In [None]:
# Read in dataset
df = pd.read_csv('/kaggle/input/toy-dataset/toy_dataset.csv')
df.head()

In [None]:
df.info() # There is no NaN value

In [None]:
df.columns

# City

In [None]:
df['City'].value_counts()

In [None]:
city = df['City'].value_counts().index.to_list()

# Two-letter codes keyed by city name, as expected by plotly's
# locationmode='USA-states'. 'Texas' is not a city; it is kept only so that
# any caller relying on the previous mapping keeps working.
_CITY_TO_STATE = {
    'New York City': 'NY',
    'Los Angeles': 'CA',
    'Mountain View': 'CA',
    'San Diego': 'CA',
    'Dallas': 'TX',
    'Austin': 'TX',
    'Texas': 'TX',
    'Boston': 'MA',
    'Washington D.C.': 'DC',
}


def assign_code(city):
    """Return the two-letter state code for a city name.

    Parameters
    ----------
    city : str
        City name as it appears in the 'City' column.

    Returns
    -------
    str
        The state abbreviation ('NY', 'CA', 'TX', 'MA' or 'DC').

    Notes
    -----
    Previously Austin fell through to the final branch and was labelled as
    Washington D.C., and that branch returned the full name
    'Washington D.C.' rather than an abbreviation.
    Unknown cities still fall back to the D.C. code, mirroring the original
    catch-all branch.
    NOTE(review): consider raising or returning None for unknown cities.
    """
    return _CITY_TO_STATE.get(city, 'DC')

# Derive a state code for every row from its city, then preview the result.
df['Location'] = df['City'].apply(assign_code)
df.head()

In [None]:
# Rows per state code, as a two-column frame ('Location', 'Count').
# rename_axis + reset_index(name=...) gives the same column names on every
# pandas version. The previous rename of {'Location': 'Count', 'index': 'Location'}
# relied on the pandas 1.x layout; on pandas 2.x value_counts() already names
# the index 'Location' and the values 'count', so that rename left the frame
# without a 'Location' column.
byState_count = (
    df['Location']
    .value_counts()
    .rename_axis('Location')
    .reset_index(name='Count')
)

# Choropleth of the USA coloured by the number of rows per state.
fig = px.choropleth(data_frame = byState_count,
                     locations = 'Location',
                     color = 'Count',
                     locationmode = 'USA-states',
                     scope = 'usa',
                     title = 'Where is the data from?',
                     color_continuous_scale="tropic"
                    )

# Remove the outer margins so the map fills the figure.
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})

fig.show()

# Gender

In [None]:
# Rows per gender, as a two-column frame ('Gender', 'Count').
# rename_axis + reset_index(name=...) is stable across pandas versions; the
# previous rename of {'Gender': 'Count', 'index': 'Gender'} only produced the
# intended columns with the pandas 1.x value_counts() layout.
byGender_count = (
    df['Gender']
    .value_counts()
    .rename_axis('Gender')
    .reset_index(name='Count')
)

# Pie chart of the gender split.
fig = px.pie(data_frame = byGender_count,
             values = 'Count',
             names = 'Gender',
             color_discrete_sequence = px.colors.qualitative.Safe,
             title = 'Distribution of Gender in the Dataset')

# Label each slice outside the pie; `pull` supplies one offset per slice and
# is sized here for two slices.
fig.update_traces(textposition ='outside',
                  textinfo = 'label+percent',
                  pull = [0.05] * 2)

fig.update_layout(paper_bgcolor = 'cornsilk',
                  legend_title = 'Gender',
                  uniformtext_minsize=18,
                  uniformtext_mode='hide',
                  font = dict(
                      family = "Courier New, monospace",
                      size = 18,
                      color = 'black'
                  ))

fig.show()

## Gender and City

In [None]:
# What Cities are there?
df['City'].value_counts()

In [None]:
# One row per (Gender, City) pair. count() reports the non-null count of each
# remaining column; the 'Number' column is used below as the group size.
gender_city = df.groupby(['Gender', 'City']).count().reset_index()

# Grouped bars: one cluster per city, one bar per gender.
fig = (
    px.bar(gender_city, x='City', y='Number', color='Gender',
           barmode='group', text='Number', height=550)
    .update_layout(
        title='Number of Male and Female in different Cities',
        yaxis_title='Count',
        legend_title='Gender',
        font={'family': "Courier New, monospace", 'size': 14, 'color': 'black'},
    )
    .update_traces(texttemplate='%{text:.2s}', textposition='outside')
    .update_xaxes(tickangle=45)
)

fig.show()

## Gender and Illness

In [None]:
# Gender and Illness
# One row per (Gender, Illness) pair; 'Number' holds the per-group count.
gender_illness = df.groupby(['Gender', 'Illness']).count().reset_index()

# Grouped bars: one cluster per gender, one bar per illness value.
fig = (
    px.bar(gender_illness, x='Gender', y='Number', color='Illness',
           barmode='group', text='Number',
           color_discrete_sequence=px.colors.qualitative.Safe)
    .update_layout(
        title='Number of Male and Female reported Illness',
        yaxis_title='Count',
        legend_title='Illness?',
        font={'family': "Courier New, monospace", 'size': 14, 'color': 'black'},
    )
    .update_traces(texttemplate='%{text:.2s}', textposition='outside')
)

fig.show()

## City and Illness

In [None]:
# City Illness
# One row per (Illness, City) pair; 'Number' holds the per-group count.
illness_city = df.groupby(['Illness', 'City']).count().reset_index()

# Grouped bars: one cluster per city, one bar per illness value.
fig = (
    px.bar(illness_city, x='City', y='Number', color='Illness',
           barmode='group', text='Number', height=550,
           color_discrete_sequence=px.colors.qualitative.Set2)
    .update_layout(
        title='Illness in different Cities',
        yaxis_title='Count',
        legend_title='Illness?',
        font={'family': "Courier New, monospace", 'size': 14, 'color': 'black'},
    )
    .update_traces(texttemplate='%{text:.2s}', textposition='outside')
    .update_xaxes(tickangle=45)
)

fig.show()

# Age

In [None]:
# Count plot of the ten most frequent ages, split by city.
fig, ax = plt.subplots(dpi=120)

# Draw onto the axes created above and keep `fig` bound to the Figure;
# previously `fig` was overwritten with the Axes returned by countplot and
# `ax` was never used.
sns.countplot(data = df, x = 'Age', hue = 'City',
              order = df['Age'].value_counts().iloc[:10].index,
              palette = 'Accent', ax = ax)
ax.set_title("Top Age Group in the Dataset\nseperated by City")
# Place the legend to the right of the axes, outside the plotting area.
ax.legend(loc = 'center right', bbox_to_anchor = (1.40,0.5), borderaxespad = 0)
plt.show()

In [None]:
# Age and Illness
# One row per (Age, Illness) pair; each remaining column holds its non-null count.
age_illness = (
    df.groupby(['Age', 'Illness'])
    .count()
    .reset_index()
)
age_illness.head()

In [None]:
# Age Group and Illness
# Bars per age, coloured by illness value (default bar mode).
fig = (
    px.bar(age_illness, x='Age', y='Number', color='Illness')
    .update_layout(
        title='Age Group and Illness',
        yaxis_title='Count',
        legend_title='Illness?',
        font={'family': "Courier New, monospace", 'size': 14, 'color': 'black'},
    )
)

fig.show()

In [None]:
# Before Turning 30
# Same chart restricted to ages up to and including 30.
fig = (
    px.bar(age_illness.loc[age_illness['Age'] <= 30],
           x='Age', y='Number', color='Illness',
           barmode='group', text='Number', height=550)
    .update_layout(
        title='Age Group (before 30) and Illness',
        yaxis_title='Count',
        legend_title='Illness?',
        font={'family': "Courier New, monospace", 'size': 14, 'color': 'black'},
    )
    .update_traces(texttemplate='%{text:.2s}', textposition='outside')
)

fig.show()

In [None]:
# Between 45 and 60
# Same chart restricted to ages 45 through 60, both ends included.
fig = (
    px.bar(age_illness.loc[age_illness['Age'].between(45, 60)],
           x='Age', y='Number', color='Illness',
           barmode='group', text='Number', height=550)
    .update_layout(
        title='Age Group (45-60) and Illness',
        yaxis_title='Count',
        legend_title='Illness?',
        font={'family': "Courier New, monospace", 'size': 14, 'color': 'black'},
    )
    .update_traces(texttemplate='%{text:.2s}', textposition='outside')
)

fig.show()

# Average Income

In [None]:
# Mean Income
df['Income'].mean()

In [None]:
# Mean income computed separately for the 'Female' and 'Male' rows

print("Female average income:", df.loc[df['Gender'] == 'Female', 'Income'].mean())
print("Male average income:", df.loc[df['Gender'] == 'Male', 'Income'].mean())

In [None]:
# Create a new DataFrame holding the per-city mean of every numeric column.
# numeric_only=True is required on pandas 2.x, where averaging the string
# columns would raise a TypeError instead of silently dropping them.
avr_income_cities = df.groupby('City').mean(numeric_only=True).reset_index()

def define_status(income, incomes=None):
    """Return the colour name for an income relative to a set of incomes.

    Parameters
    ----------
    income : float
        The value to classify.
    incomes : pandas.Series, optional
        Values whose quartiles define the colour bands. Defaults to the
        module-level ``avr_income_cities['Income']`` column.

    Returns
    -------
    str
        'green' above the 0.75 quantile, 'lightgreen' from the 0.5 quantile
        up to and including the 0.75 quantile, 'orange' from the 0.25
        quantile up to (not including) the 0.5 quantile, 'red' otherwise.

    Notes
    -----
    The earlier chained comparisons left an income exactly equal to the 0.75
    quantile unmatched, so it fell through to 'red'.
    """
    if incomes is None:
        incomes = avr_income_cities['Income']

    # Compute the three cut points once per call instead of once per branch.
    q25, q50, q75 = incomes.quantile([0.25, 0.5, 0.75])

    if income > q75:
        return 'green'
    if income >= q50:
        return 'lightgreen'
    if income >= q25:
        return 'orange'
    return 'red'

# Attach a colour name to every city based on its mean income, then preview.
avr_income_cities['Color Coded'] = avr_income_cities['Income'].apply(define_status)
avr_income_cities.head()

## City and Average Income

In [None]:
# City with the average income
# One bar per city; each bar is coloured from the 'Color Coded' column.
# NOTE(review): legend_title is set to 'Company Name' although the chart has
# no `color` grouping - confirm whether it is intended.
fig = (
    px.bar(avr_income_cities, x='City', y='Income', text='Income', height=550)
    .update_layout(
        title='Average Income in different Cities',
        yaxis_title='Income (US Dollar)',
        legend_title='Company Name',
        font={'family': "Courier New, monospace", 'size': 14, 'color': 'black'},
    )
    .update_traces(
        texttemplate='%{text:.2s}',
        textposition='outside',
        marker_color=avr_income_cities['Color Coded'].to_list(),
    )
    .update_xaxes(tickangle=45)
)

fig.show()

For the bar chart above, I couldn't figure out how to add a legend that explains what each color means, so I describe the colors here:
* Green: above the 75th percentile of the cities' average incomes
* Lightgreen: between the 50th and 75th percentiles
* Orange: between the 25th and 50th percentiles
* Red: below the 25th percentile

If anyone knows how to improve this, please let me know in the comment section. Thank you in advance!

## Gender and Average Income

In [None]:
# Male and Female Earning in Different City
# Mean income per (Gender, City) pair. numeric_only=True keeps the call
# working on pandas 2.x, where averaging string columns raises a TypeError.
fig = px.bar(data_frame = df.groupby(['Gender','City']).mean(numeric_only=True).reset_index(),
             x = 'Gender', y = 'Income', color = 'City',
             barmode = 'group', text = 'Income', height = 550)

# The legend lists cities (color='City'), so it is titled accordingly; it was
# previously labelled 'Company Name'.
fig.update_layout(title = 'Male and Female Average Income in different Cities',
                  yaxis_title = 'Income (US Dollar)',
                  legend_title = 'City',
                  font = dict(family = "Courier New, monospace",
                              size = 14, color = 'black')
                  )

fig.update_traces(texttemplate='%{text:.2s}', textposition='outside')

fig.show()

# Income Percentile

In [None]:
# Work on a copy so that columns added to df2 do not appear in df.
df2 = df.copy()

def status(income, quartiles=(80867.75, 93655.0, 104519.0)):
    """Return the income-bracket label for a single income value.

    Parameters
    ----------
    income : float
        The income to classify.
    quartiles : tuple of three floats, optional
        Ascending cut points (q25, q50, q75). The defaults are the values
        that were previously hard-coded in the comparisons.
        NOTE(review): presumably taken from df['Income'].describe() - pass
        freshly computed quartiles if the data changes.

    Returns
    -------
    str
        'Below 25' if income <= q25, 'Between 25 and 50' if
        q25 < income <= q50, 'Between 50 and 75' if q50 < income <= q75,
        otherwise 'Above 75'.
    """
    q25, q50, q75 = quartiles

    # Each guard only runs when the previous one failed, so the lower bound
    # of every band is implied.
    if income <= q25:
        return 'Below 25'
    if income <= q50:
        return 'Between 25 and 50'
    if income <= q75:
        return 'Between 50 and 75'
    return 'Above 75'

# Label every row with its income bracket, then preview the result.
df2['Income Status'] = df2['Income'].apply(status)
df2.head()

## Gender and Income in Percentile

In [None]:
# Rows per (Gender, Income Status) pair, drawn as grouped bars in bracket order.
# The bar label now reads the same 'Number' count as the bar height (it used
# the 'Income' count before), so label and height cannot disagree if a column
# has missing values.
fig = px.bar(data_frame = df2.groupby(['Gender','Income Status']).count().reset_index(),
             x = 'Income Status', y = 'Number', color = 'Gender',
             barmode = 'group', text = 'Number', height = 550,
             color_discrete_sequence = px.colors.qualitative.Vivid,
             category_orders = {"Income Status": ['Below 25','Between 25 and 50','Between 50 and 75','Above 75']})

fig.update_layout(title = 'Male and Female Income in different Percentile',
                  yaxis_title = 'Count',
                  legend_title = 'Gender',
                  font = dict(family = "Courier New, monospace",
                              size = 14, color = 'black')
                  )

fig.update_traces(texttemplate='%{text:.2s}', textposition='outside')

fig.show()

## Cities and Income in Percentile

In [None]:
# Rows per (City, Income Status) pair, drawn as grouped bars in bracket order.
# The bar label now reads the same 'Number' count as the bar height (it used
# the 'Income' count before), so label and height cannot disagree if a column
# has missing values.
fig = px.bar(data_frame = df2.groupby(['City','Income Status']).count().reset_index(),
             x = 'Income Status', y = 'Number', color = 'City',
             barmode = 'group', text = 'Number', height = 550,
             color_discrete_sequence = px.colors.qualitative.Pastel,
             category_orders = {"Income Status": ['Below 25','Between 25 and 50','Between 50 and 75','Above 75']})

fig.update_layout(title = 'Different Cities Income in different Percentile',
                  yaxis_title = 'Count',
                  legend_title = 'City',
                  font = dict(family = "Courier New, monospace",
                              size = 14, color = 'black')
                  )

fig.update_traces(texttemplate='%{text:.2s}',textposition='outside')

fig.show()

## Illness and Income in different Percentile

In [None]:
# Rows per (Illness, Income Status) pair, drawn as grouped bars in bracket order.
# The bar label now reads the same 'Number' count as the bar height (it used
# the 'Income' count before), so label and height cannot disagree if a column
# has missing values.
fig = px.bar(data_frame = df2.groupby(['Illness','Income Status']).count().reset_index(),
             x = 'Income Status', y = 'Number', color = 'Illness',
             barmode = 'group', text = 'Number', height = 550,
             color_discrete_sequence = px.colors.qualitative.Pastel,
             category_orders = {"Income Status": ['Below 25','Between 25 and 50','Between 50 and 75','Above 75']})

fig.update_layout(title = 'Illness and Income in different Percentile',
                  yaxis_title = 'Count',
                  legend_title = 'Illness?',
                  font = dict(family = "Courier New, monospace",
                              size = 14, color = 'black')
                  )

fig.update_traces(texttemplate='%{text:.2s}',textposition='outside')

fig.show()

# Thank you for checking out my work!