In [None]:
# Following the tutorial at https://jovian.ai/aakashns/python-sklearn-linear-regression
# URL of the raw medical-charges CSV; a later cell downloads it with urlretrieve.
medical_charges_url = 'https://raw.githubusercontent.com/JovianML/opendatasets/master/data/medical-charges.csv'

In [None]:
from urllib.request import urlretrieve


In [None]:
# Download the dataset and store it as 'medical.csv' (path relative to the working directory).
urlretrieve(url=medical_charges_url, filename='medical.csv')


In [None]:
!pip install pandas --quiet

In [None]:
import pandas as pd

In [None]:
# Load the downloaded CSV file into a DataFrame.
df_medical = pd.read_csv(filepath_or_buffer='medical.csv')

In [None]:
# Show the loaded DataFrame as this cell's output.
df_medical


In [None]:
# Print a concise summary of the frame (columns, non-null counts, dtypes).
df_medical.info()

In [None]:
# Descriptive statistics (count, mean, std, quartiles, ...) for the frame's columns.
df_medical.describe()

In [None]:
# Install the plotting libraries imported in the next cell into the kernel's environment.
%pip install plotly matplotlib seaborn --quiet


In [None]:
import plotly.express as px
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Global plot styling: seaborn's 'darkgrid' theme plus matplotlib defaults.
sns.set_style('darkgrid')
matplotlib.rcParams.update({
    'font.size': 14,
    'figure.figsize': (10, 6),
    'figure.facecolor': '#00000000',  # 8-digit hex (RGBA); alpha 00 makes the background transparent
})

In [None]:
# Descriptive statistics for the 'age' column only.
df_medical['age'].describe()

In [None]:
# Histogram of customer ages with a box plot in the margin.
fig = px.histogram(
    df_medical,
    x='age',
    marginal='box',
    nbins=47,  # presumably one bin per distinct age in the data - TODO confirm
    title='Distribution of Age',
).update_layout(bargap=0.1)  # update_layout returns the same figure, so it can be chained
fig.show(renderer="notebook")
# Keep a standalone copy; a comment in a later cell refers to plot.html.
fig.write_html("plot.html")

In [None]:
# Descriptive statistics of charges for 18- and 19-year-olds.
df_medical.loc[df_medical['age'].isin([18, 19]), 'charges'].describe()


In [None]:
# Descriptive statistics of charges for the slightly older 20-26 age band, for
# comparison with the 18/19-year-olds. between() selects the whole range and is
# inclusive on both ends; isin([20, 26]) would match only the two ages 20 and 26.
df_medical[df_medical['age'].between(20, 26)].charges.describe()
# Referring to the histogram saved in plot.html:
# There are about twice as many 18- and 19-year-olds as people of any other age.
# Low charges do not explain this, because the mean charge is similar for slightly older people.
# Perhaps the reason is that insurance becomes available at 18, which causes an influx of very young adults.

In [None]:
# Histogram of BMI (red bars) with a box plot in the margin.
fig = px.histogram(
    data_frame=df_medical,
    x='bmi',
    marginal='box',
    color_discrete_sequence=['red'],
    title='Distribution of BMI (Body Mass Index)',
)
fig = fig.update_layout(bargap=0.1)  # update_layout returns the same figure object
fig.show()
# Also export the figure as a standalone HTML file.
fig.write_html("plot_bmi.html")

In [None]:
def plot_charges_by(data, group_col, colors):
    """Plot the distribution of annual charges, colour-split by one column.

    Draws a histogram of the ``charges`` column with a marginal box plot,
    shows it, and saves it as ``plot_charges_<group_col>.html``.

    Args:
        data: DataFrame containing a ``charges`` column and ``group_col``.
        group_col: Name of the column used to colour the bars.
        colors: Colour sequence passed to plotly as ``color_discrete_sequence``.

    Returns:
        The plotly figure that was shown and saved.
    """
    fig = px.histogram(
        data,
        x="charges",
        marginal='box',
        color=group_col,
        color_discrete_sequence=colors,
        title='Annual Medical Charges',
    )
    fig.update_layout(bargap=0.1)
    fig.show()
    fig.write_html(f"plot_charges_{group_col}.html")
    return fig


# The same chart for each grouping column. Assigning the result keeps `fig`
# bound to the most recently drawn figure.
fig = plot_charges_by(df_medical, 'smoker', ['green', 'gray'])
fig = plot_charges_by(df_medical, 'sex', ['green', 'gray'])
fig = plot_charges_by(df_medical, 'region', ['green', 'gray', 'blue', 'purple'])

In [None]:
# Number of rows per value of the 'smoker' column.
# Author's note: this matches the national average in 2010.
df_medical['smoker'].value_counts()

In [None]:
# Counts per 'smoker' value, with bars coloured by sex.
# Observation: men are more likely to be smokers (which also matches the general population).
fig = px.histogram(
    data_frame=df_medical,
    x='smoker',
    color='sex',
    color_discrete_sequence=['green', 'gray'],
    title='Smoker_sex',
)
fig.show()
# Also export the figure as a standalone HTML file.
fig.write_html("smoker_sex.html")

In [None]:
# One count histogram per column, each shown and saved as "<Column>_distribution.html".
for dist_column in ("region", "sex", "children"):
    dist_label = dist_column.capitalize()  # "region" -> "Region", used in title and file name
    fig = px.histogram(df_medical,
                       x=dist_column,
                       title=f"{dist_label} distribution")
    fig.show()
    fig.write_html(f"{dist_label}_distribution.html")

In [None]:
# Scatter of charges against age; points coloured by smoker status, sex shown on hover.
fig = px.scatter(
    df_medical,
    x='age',
    y='charges',
    color='smoker',
    opacity=0.8,
    hover_data=['sex'],
    title='Age vs. Charges',
).update_traces(marker_size=5)  # update_traces returns the same figure, so it can be chained
fig.show()
# Also export the figure as a standalone HTML file.
fig.write_html("scatter_age.html")

In [None]:
# BMI statistics for smokers whose charges exceed 30000.
# The cluster of smokers with the highest charges in the scatter plot is probably
# made up of people who both smoke and have a high BMI.
is_smoker = df_medical['smoker'] == 'yes'
has_high_charges = df_medical['charges'] > 30000
df_medical.loc[is_smoker & has_high_charges, 'bmi'].describe()

In [None]:
# Scatter of charges against BMI; points coloured by smoker status, sex shown on hover.
fig = px.scatter(
    data_frame=df_medical,
    x='bmi',
    y='charges',
    color='smoker',
    opacity=0.8,
    hover_data=['sex'],
    title='BMI vs. Charges',
)
fig.update_traces(marker_size=5)
fig.show()
# Also export the figure as a standalone HTML file.
fig.write_html("scatter_bmi.html")