In [None]:
import numpy as np  
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# The bare 'seaborn' style name was deprecated in matplotlib 3.6 and later removed;
# use the renamed 'seaborn-v0_8' sheet when this matplotlib ships it, otherwise
# fall back to the legacy name so older environments keep working.
_SEABORN_STYLE = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(_SEABORN_STYLE)
sns.set(font_scale=2.5)  # scale every seaborn text element by 2.5

import missingno as msno 

#ignore warnings
import warnings
warnings.filterwarnings('ignore')

%matplotlib inline 


In [None]:
# Both frames are read from the same CSV file, so df_test starts out as an
# independent copy of df_train rather than a separate hold-out set.
DATA_PATH = '../input/heart-failure-clinical-data/heart_failure_clinical_records_dataset.csv'
df_train, df_test = (pd.read_csv(DATA_PATH) for _ in range(2))

In [None]:
# Preview the first 50 rows of the training frame.
df_train.head(50)

In [None]:
# (number of rows, number of columns)
df_train.shape

**사망자수 파악**

In [None]:
# How many rows fall under each DEATH_EVENT value.
df_train['DEATH_EVENT'].value_counts()

**null data 확인**

In [None]:
# Print, for every column, the percentage of its rows that are NaN.
n_rows = df_train.shape[0]
for name in df_train.columns:
    n_missing = df_train[name].isnull().sum()
    print('column: {:>30}\t Percent of NaN value: {:.2f}%'.format(name, 100 * (n_missing / n_rows)))

In [None]:
# Summary statistics for the numeric columns of the training frame.
df_train.describe()

In [None]:
# Summary statistics for df_test. NOTE: df_test is read from the same CSV as
# df_train, so on a top-to-bottom run this matches the summary of df_train.
df_test.describe()

In [None]:
# Bar chart of non-null counts per column (every column of df_train is included).
msno.bar(
    df_train,
    figsize=(8, 5),
    color=(0.8, 0.5, 0.2),
)

**연령 범주화시키기**

In [None]:
# Youngest age in the data; used to choose where the age bands start.
df_train['age'].min()

In [None]:
# Oldest age in the data; used to choose where the age bands end.
df_train['age'].max()

In [None]:
# Create the age_band column with 0 as a placeholder; rows whose age matches none
# of the ranges assigned afterwards keep this 0.
df_train['age_band']=0

In [None]:
# Label every row with its decade-wide age band.
# Each band is [lower, upper): the lower bound is inclusive, the upper exclusive.
for lower, upper, label in (
    (40, 50, '40~49'),
    (50, 60, '50~59'),
    (60, 70, '60~69'),
    (70, 80, '70~79'),
    (80, 90, '80~89'),
    (90, 100, '90~100'),
):
    in_band = (df_train['age'] >= lower) & (df_train['age'] < upper)
    df_train.loc[in_band, 'age_band'] = label


In [None]:
# Check that the new age_band column was filled in.
df_train.head()

In [None]:
# Number of sampled patients in each age band, shaded by count.
age_band_counts = df_train['age_band'].value_counts().to_frame()
age_band_counts.style.background_gradient(cmap='summer_r')

In [None]:
# Cross-tabulate age band against high blood pressure.
# margins=True appends the 'All' row/column totals.
band_vs_hbp = pd.crosstab(
    df_train['age_band'],
    df_train['high_blood_pressure'],
    margins=True,
)
band_vs_hbp.style.background_gradient(cmap='winter')

In [None]:
# Left panel: mean of high_blood_pressure per age band.
# Right panel: row counts per age band, split by DEATH_EVENT.
fig, (ax_rate, ax_count) = plt.subplots(1, 2, figsize=(25, 5))

hbp_by_band = (
    df_train[['age_band', 'high_blood_pressure']]
    .groupby(['age_band'], as_index=True)
    .mean()
)
hbp_by_band.plot.bar(color=['#FFDF00'], ax=ax_rate)
ax_rate.set_title('high_blood_pressure vs age')

# Pass x by keyword: seaborn deprecated positional data arguments and newer
# releases reject them outright.
sns.countplot(x='age_band', hue='DEATH_EVENT', data=df_train, ax=ax_count)
ax_count.set_title('age: Survived vs Dead')

plt.subplots_adjust(wspace=0.2, hspace=0.8)
plt.show()


In [None]:
# Point plot of mean DEATH_EVENT by smoking status, one line per age band.
# factorplot was renamed to catplot (whose default kind is 'strip', so 'point' is
# requested explicitly to keep the original chart type) and `size` became `height`.
# hue splits the plot by category: hue='<grouping column>'.
sns.catplot(
    x='smoking',
    y='DEATH_EVENT',
    hue='age_band',
    data=df_train,
    kind='point',
    height=5,
    aspect=1.5,
)

In [None]:
# Two side-by-side panels (1 row x 2 columns):
# overall smoker counts on the left, the same counts split by DEATH_EVENT on the right.
_, (ax_total, ax_by_outcome) = plt.subplots(1, 2, figsize=[18, 6])
sns.countplot(x='smoking', data=df_train, ax=ax_total)
sns.countplot(x='smoking', hue='DEATH_EVENT', data=df_train, ax=ax_by_outcome)

In [None]:
# Histograms (with KDE) of three features, split by DEATH_EVENT, on a 2x2 grid.
fig, axes = plt.subplots(2, 2, figsize=(20, 15))

panels = [
    ('age', '(1) Distribution by age '),
    ('ejection_fraction', '(2) Distribution by ejection_fraction'),
    ('platelets', '(3) Distribution by platelets'),
]
panel_axes = axes.ravel()

for ax, (column, title) in zip(panel_axes, panels):
    sns.histplot(data=df_train, x=column, hue='DEATH_EVENT', kde=True, ax=ax)
    ax.set_title(title)

# Only three of the four grid slots are used; hide the rest so no empty frame is drawn.
for unused_ax in panel_axes[len(panels):]:
    unused_ax.set_visible(False)

plt.subplots_adjust(wspace=0.5, hspace=0.5)

plt.show()

In [None]:
# Split violins of ejection_fraction per DEATH_EVENT value, one half per smoking value;
# scale='count' sizes each violin by its number of observations.
_, ax_violin = plt.subplots(figsize=(10, 6))
sns.violinplot(
    x='DEATH_EVENT',
    y='ejection_fraction',
    hue='smoking',
    data=df_train,
    split=True,
    scale='count',
    ax=ax_violin,
)
ax_violin.legend(loc='upper right', bbox_to_anchor=(0.85, 0.8, 0.25, 0.25))


In [None]:
# Histogram of ejection_fraction below 50 (left) and at or above 50 (right).
_, (ax_low, ax_high) = plt.subplots(1, 2, figsize=(18, 8))

ejection = df_train['ejection_fraction']
sns.histplot(data=ejection[ejection < 50], bins=12, ax=ax_low)
sns.histplot(data=ejection[ejection >= 50], bins=12, ax=ax_high)

In [None]:
#f, ax = plt.subplots(1, 1, figsize=(7, 7))
#df_train[['diabetes', 'ejection_fraction']].groupby(['diabetes'], as_index=True).sort(by='ejection_fraction', ascending=True).plot.bar(ax=ax)

In [None]:
# Line plot of platelets against age, one line per DEATH_EVENT value.
sns.relplot(
    x="age",
    y="platelets",
    hue="DEATH_EVENT",
    kind="line",
    data=df_train,
    height=6,
    aspect=4,
)

In [None]:
# Scatter of smoking against serum_sodium, coloured by DEATH_EVENT.
sns.relplot(
    x="serum_sodium",
    y="smoking",
    hue="DEATH_EVENT",
    kind="scatter",
    data=df_train,
    height=6,
    aspect=4,
)

In [None]:
# Preview the first 50 rows again, now including the age_band column added above.
df_train.head(50)

In [None]:
# Distribution of platelets with its skewness shown in the legend.
# NOTE(review): sns.distplot is deprecated in seaborn; consider histplot/displot later.
platelets = df_train['platelets']
skew_label = 'Skewness : {:.2f}'.format(platelets.skew())

fig, ax = plt.subplots(1, 1, figsize=(8, 8))
g = sns.distplot(platelets, color='#388E3C', label=skew_label, ax=ax)
plt.legend(loc=2, bbox_to_anchor=(1, 1))

In [None]:
# Distribution of creatinine_phosphokinase with its skewness shown in the legend.
# NOTE(review): sns.distplot is deprecated in seaborn; consider histplot/displot later.
cpk = df_train['creatinine_phosphokinase']
skew_label = 'Skewness : {:.2f}'.format(cpk.skew())

fig, ax = plt.subplots(1, 1, figsize=(8, 8))
g = sns.distplot(cpk, color='#388E3C', label=skew_label, ax=ax)
plt.legend(loc=1, bbox_to_anchor=(1, 1))

# Observation: high skewness.

In [None]:
def _log_or_zero(value):
    """Return the natural log of a positive value; anything else maps to 0."""
    return np.log(value) if value > 0 else 0


# Replace creatinine_phosphokinase with its log to reduce the skew seen above.
# NOTE: the column is overwritten in place, so re-running this cell applies the log again.
df_train['creatinine_phosphokinase'] = df_train['creatinine_phosphokinase'].map(_log_or_zero)


In [None]:
# Re-plot creatinine_phosphokinase (now log-scaled) with its updated skewness.
# NOTE(review): sns.distplot is deprecated in seaborn; consider histplot/displot later.
cpk_logged = df_train['creatinine_phosphokinase']
skew_label = 'Skewness : {:.2f}'.format(cpk_logged.skew())

fig, ax = plt.subplots(1, 1, figsize=(8, 8))
g = sns.distplot(cpk_logged, color='#388E3C', label=skew_label, ax=ax)
# bbox_to_anchor takes (x, y) or (x, y, width, height).
plt.legend(loc=2, bbox_to_anchor=(1, 1))

In [None]:
# Box plots of four features against DEATH_EVENT on a 2x2 grid.
# The figure size is passed when the figure is created: the original set
# rcParams["figure.figsize"] after drawing, which did not resize this figure on a
# fresh run and silently changed the default for every later figure.
fig, axes = plt.subplots(2, 2, figsize=(20, 20))

features = ['ejection_fraction', 'creatinine_phosphokinase', 'platelets', 'serum_sodium']
for ax, feature in zip(axes.ravel(), features):
    sns.boxplot(x='DEATH_EVENT', y=feature, data=df_train, ax=ax)

fig.tight_layout()

plt.show()

# 7/24~7/30

In [None]:
pip install plotly_express

In [None]:
import plotly.express as px
import pandas as pd

In [None]:
# Reload the raw CSV into df_train. NOTE: this replaces the frame modified earlier in
# the notebook, so the age_band column and the log-scaled creatinine_phosphokinase
# values are gone from df_train after this cell.
df_train = pd.read_csv('../input/heart-failure-clinical-data/heart_failure_clinical_records_dataset.csv')
df_train.head(10)

In [None]:
# Boolean masks: rows with DEATH_EVENT == 1, and one mask per condition flag.
dead = df_train['DEATH_EVENT'] == 1
is_anaemia = df_train['anaemia'] == 1
is_diabetes = df_train['diabetes'] == 1
is_high_blood_pressure = df_train['high_blood_pressure'] == 1
is_smoking = df_train['smoking'] == 1

# Rows that have the condition AND DEATH_EVENT == 1.
subset_1 = df_train.loc[is_anaemia & dead]
subset_2 = df_train.loc[is_diabetes & dead]
subset_3 = df_train.loc[is_high_blood_pressure & dead]
subset_4 = df_train.loc[is_smoking & dead]

count_diabetes_dead = len(subset_2)


In [None]:
# Funnel chart of the number of DEATH_EVENT == 1 rows per condition.
subset_by_stage = {
    "is_anaemia": subset_1,
    "is_diabetes": subset_2,
    "is_high_blood_pressure": subset_3,
    "is_smoking": subset_4,
}
data = dict(
    number=[len(rows) for rows in subset_by_stage.values()],
    stage=list(subset_by_stage),
)
fig = px.funnel(data, x='number', y='stage')
fig.show()

In [None]:
# Sex masks as this notebook names them: sex == 1 -> is_man, sex == 0 -> is_woman.
is_man = df_train['sex'] == 1
is_woman = df_train['sex'] == 0

# Outcome and condition masks.
dead = df_train['DEATH_EVENT'] == 1
is_anaemia = df_train['anaemia'] == 1
is_diabetes = df_train['diabetes'] == 1
is_high_blood_pressure = df_train['high_blood_pressure'] == 1
is_smoking = df_train['smoking'] == 1

dead_men = dead & is_man
dead_women = dead & is_woman

# subset_<k>_1 holds the men and subset_<k>_2 the women for condition k
# (1 anaemia, 2 diabetes, 3 high blood pressure, 4 smoking).
subset_1_1 = df_train[is_anaemia & dead_men]
subset_1_2 = df_train[is_anaemia & dead_women]

subset_2_1 = df_train[is_diabetes & dead_men]
subset_2_2 = df_train[is_diabetes & dead_women]

subset_3_1 = df_train[is_high_blood_pressure & dead_men]
subset_3_2 = df_train[is_high_blood_pressure & dead_women]

subset_4_1 = df_train[is_smoking & dead_men]
subset_4_2 = df_train[is_smoking & dead_women]


In [None]:
# Funnel chart of DEATH_EVENT == 1 counts per condition, one coloured series per sex.
stages = ["is_anaemia", "is_diabetes", "is_high_blood_pressure", "is_smoking"]

df_men = pd.DataFrame(dict(
    number=[len(subset_1_1), len(subset_2_1), len(subset_3_1), len(subset_4_1)],
    stage=stages,
))
# The grouping column holds sex, so name it 'sex'; the original 'office' label
# appeared verbatim as the legend title.
df_men['sex'] = 'Men'

df_women = pd.DataFrame(dict(
    number=[len(subset_1_2), len(subset_2_2), len(subset_3_2), len(subset_4_2)],
    stage=stages,
))
df_women['sex'] = 'Women'

df = pd.concat([df_men, df_women], axis=0)
fig = px.funnel(df, x='number', y='stage', color='sex')
fig.show()

In [None]:
# x = creatine kinase (creatinine_phosphokinase), y = platelets; points coloured by time.
px.scatter(
    data_frame=df_train,
    x='creatinine_phosphokinase',
    y='platelets',
    color='time',
)

In [None]:
# ejection_fraction over time: marker size follows age, colour follows DEATH_EVENT,
# and serum_creatinine is added to the hover tooltip.
px.scatter(
    data_frame=df_train,
    x='time',
    y='ejection_fraction',
    color='DEATH_EVENT',
    size='age',
    hover_data=['serum_creatinine'],
)

plotly 연습 예제


In [None]:
import plotly.express as px

# Load plotly's bundled gapminder sample data for the practice plots below.
# NOTE: this rebinds `df`, which previously held the funnel-chart frame.
df = px.data.gapminder()
df.head()

In [None]:
# Line chart of life expectancy per year for the Canada rows only.
canada = df.loc[df['country'] == 'Canada']

fig = px.line(
    canada,
    x="year",
    y="lifeExp",
    title='Life expectancy in Canada',
)
fig.show()

In [None]:
# One line per country (line_group), coloured by continent. The whole frame is
# plotted, so the title must not claim the chart is about Canada only.
fig = px.line(
    df,
    x="year",
    y="lifeExp",
    title='Life expectancy by country',
    color='continent',
    line_group='country',
    hover_name='country',
)
fig.show()

In [None]:
import plotly.graph_objects as go

import numpy as np
# Practice example: three seeded random series drawn in the three Scatter modes.
np.random.seed(1)

N = 100
random_x = np.linspace(0, 1, N)
# Draw order matters for reproducibility under the fixed seed: y0, then y1, then y2.
random_y0 = np.random.randn(N) + 5
random_y1 = np.random.randn(N)
random_y2 = np.random.randn(N) - 5

fig = go.Figure(data=[
    go.Scatter(x=random_x, y=random_y0, mode='lines', name='lines'),
    go.Scatter(x=random_x, y=random_y1, mode='lines+markers', name='lines+markers'),
    go.Scatter(x=random_x, y=random_y2, mode='markers', name='markers'),
])
fig.show()

In [None]:
# Add the age_band label to both frames.
# Each band is [lower, upper): the lower bound is inclusive, the upper exclusive.
AGE_BANDS = (
    (40, 50, '40~49'),
    (50, 60, '50~59'),
    (60, 70, '60~69'),
    (70, 80, '70~79'),
    (80, 90, '80~89'),
    (90, 100, '90~100'),
)

for frame in (df_train, df_test):
    for lower, upper, label in AGE_BANDS:
        # Build the mask from the frame being labelled. The original tested
        # df_train['age'] for the lower bound while writing into df_test.
        in_band = (frame['age'] >= lower) & (frame['age'] < upper)
        frame.loc[in_band, 'age_band'] = label

In [None]:
# Check that age_band is present again on the reloaded frame.
df_train.head()

In [None]:
import plotly.offline as pyo
import plotly.graph_objs as go

# Draw a bar chart: every row contributes its own bar segment at its age band.
fig = go.Figure(
    data=[go.Bar(x=df_train['age_band'], y=df_train['ejection_fraction'])],
    layout=go.Layout(title='연령별 ejection_fraction'),
)
pyo.iplot(fig)

In [None]:
# To get a single solid bar per group, aggregate with pandas first:
# sum every column per (age_band, sex) pair.
summed_by_band_sex = df_train.groupby(by=['age_band', 'sex']).sum().reset_index()

data = []
for sex_code, sex_label in ((0, 'Female'), (1, 'Male')):
    rows = summed_by_band_sex[summed_by_band_sex['sex'] == sex_code]
    data.append(go.Bar(x=rows.age_band, y=rows.ejection_fraction, name=sex_label))

fig = go.Figure(data=data, layout=go.Layout(title='연령별 ejection_fraction'))
pyo.iplot(fig)

In [None]:
from plotly.subplots import make_subplots

# 2x2 grid mixing four subplot types. Each spec dict must use the exact key "type";
# the 3-D cell needs "scene" (the original had a padded key and a translated value,
# which make_subplots does not recognise).
fig = make_subplots(
    rows=2,
    cols=2,
    specs=[
        [{"type": "xy"}, {"type": "polar"}],
        [{"type": "domain"}, {"type": "scene"}],
    ],
)
fig.add_bar(row=1, col=1, y=[2, 3, 1])
fig.add_pie(row=2, col=1, values=[2, 3, 1])
fig.add_barpolar(row=1, col=2, theta=[0, 45, 90], r=[2, 3, 1])
fig.add_scatter3d(row=2, col=2, x=[2, 3], y=[0, 0], z=[0.5, 1])
fig.update_layout(height=700, showlegend=False)
fig.show()

7/31~8/6_다시 heartFailure

In [None]:
def category_age(x):
    """Map an age to its decade band label.

    Returns the label of the first band whose exclusive upper bound is greater
    than ``x``. Anything below 50 (including ages under 40) is labelled '40~49';
    anything that is not below 90 is labelled '90~100'.
    """
    upper_bounds = (
        (50, '40~49'),
        (60, '50~59'),
        (70, '60~69'),
        (80, '70~79'),
        (90, '80~89'),
    )
    for upper, label in upper_bounds:
        if x < upper:
            return label
    return '90~100'
# Second way of deriving the age band: label each age element-wise with category_age.
df_train['age_band_2'] = df_train['age'].map(category_age)

In [None]:
# True only when both age-band columns agree on every row.
methods_agree = (df_train['age_band'] == df_train['age_band_2']).all()
print('1번 방법, 2번 방법이 같은 결과를 내면 True -> ', methods_agree)

In [None]:
# Remove the temporary comparison column again (in place).
df_train.drop(columns=['age_band_2'], inplace=True)


In [None]:
# Check that age_band_2 is gone and age_band remains.
df_train.head()

In [None]:
# Annotated Pearson correlation heatmap over the selected features plus DEATH_EVENT.
corr_columns = [
    'anaemia', 'diabetes', 'ejection_fraction', 'high_blood_pressure',
    'serum_creatinine', 'serum_sodium', 'sex', 'smoking', 'DEATH_EVENT',
]
corr_matrix = df_train[corr_columns].astype(float).corr()

plt.figure(figsize=(14, 12))
plt.title('Pearson Correlation of Features', y=1.05, size=15)
sns.heatmap(
    corr_matrix,
    cmap=plt.cm.RdBu,
    vmax=1.0,
    square=True,
    linewidths=0.1,
    linecolor='white',
    annot=True,
    annot_kws={"size": 16},
)

feature들끼리 강한 상관관계가 거의 없음
DEATH_EVENT와 그나마 상관관계 있는 것은 sex와 smoking

In [None]:
import pandas as pd
# One-hot encode age_band into age_band_<label> columns; the original column is replaced.
# NOTE: only df_train is encoded here, and re-running this cell fails because
# age_band no longer exists afterwards.
df_train = pd.get_dummies(
    data=df_train,
    prefix='age_band',
    columns=['age_band'],
)

In [None]:
# Inspect the one-hot encoded age_band_* columns.
df_train.head()