In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively print the full path of every file found below the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import missingno as msno
import seaborn as sns
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold, cross_val_predict
from sklearn.preprocessing import LabelEncoder

# 1. Import Data

In [None]:
# Read the stroke dataset CSV from the Kaggle input directory.
data = pd.read_csv(
    '/kaggle/input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv'
)
# Wrap the loaded table under the working name the following cells operate on.
df_data = pd.DataFrame(data=data)
df_data

# 2. Preprocessing

In [None]:
# Draw the missingno matrix for df_data to see where values are missing.
msno.matrix(df_data, figsize=(20, 14), color=(0, 0.3, 0.3))

In [None]:
# Fill the missing 'bmi' values with the 'bmi' median.
# The fill value is keyed by column name so that only 'bmi' is imputed; passing
# the bare scalar to DataFrame.fillna would write the bmi median into every NaN
# of every column, including unrelated or string-valued ones.
df_data = df_data.fillna({'bmi': df_data['bmi'].median()})
df_data

In [None]:
# Encode every string-valued (object dtype) column as integer class codes.
#
# Each column gets its own LabelEncoder, fitted and applied in one step.
# The result is assigned by column label, which replaces the column outright so
# it takes the encoder's integer dtype. Writing through `.iloc[:, i] = ...`
# instead sets the values in place on recent pandas (2.x) and can leave the
# column as object dtype.
for column in df_data.select_dtypes(include='object').columns:
    df_data[column] = LabelEncoder().fit_transform(df_data[column])

# Show the resulting column dtypes and non-null counts.
df_data.info()

# 3. Check the correlation for each item

In [None]:
# Pairwise Pearson correlation between all columns of df_data.
df_data_corr = df_data.corr(method='pearson')
df_data_corr

In [None]:
# Render the correlation matrix as an annotated heatmap on a dedicated figure.
f, ax = plt.subplots(figsize=(12, 9))
sns.heatmap(
    data=df_data_corr,
    vmax=0.8,
    square=False,
    annot=True,
    fmt='1.3f',
    cmap='Blues',
    ax=ax,  # draw on the axes created above (the current axes)
)

In [None]:
# Remove the 'id' column before modeling (presumably a row identifier with no
# predictive meaning -- confirm against the dataset description).
# errors='ignore' keeps this cell re-runnable: without it, executing the cell a
# second time raises KeyError because 'id' has already been dropped.
df_data = df_data.drop(columns='id', errors='ignore')
df_data

In [None]:
#sns.pairplot(df_data)
#plt.show()

# 4. Modeling with scikit-learn

In [None]:
# Separate the features from the 'stroke' target and convert both to NumPy arrays.
x = np.array(df_data.drop('stroke', axis=1))
t = np.array(df_data['stroke'])

# Flatten the target to 1-D, then cast features to float32 and target to int32.
t = t.ravel()
x, t = x.astype('float32'), t.astype('int32')

# Inspect the shape and contents of each array.
for heading, array in (('x shape:', x), ('t shape:', t)):
    print(heading, array.shape)
    print(array)

In [None]:
# Random forest classifier fitted on the full feature matrix x and target t.
# max_features uses 'sqrt' rather than 'auto': for classifiers 'auto' meant
# 'sqrt', and scikit-learn deprecated the 'auto' spelling in 1.1 and removed it
# in 1.3, where passing it raises a parameter-validation error.
clf = RFC(n_estimators=192,
          criterion='gini',# 'gini' or 'entropy'
          max_depth=19,
          min_samples_split=2,
          max_features='sqrt',# 'sqrt' or 'log2'
          n_jobs=-1,  # use all available cores
          random_state=2525,  # fixed seed for reproducible trees
          verbose=1)# 0 or 1
clf.fit(x, t)

In [None]:
# Positive-class probabilities and their ROC-AUC.
#
# clf was fitted on this same x and t, so calling clf.predict_proba(x) would
# score the forest on rows it has already seen and report an optimistic AUC.
# Instead, collect out-of-fold probabilities: each row is predicted by a clone
# of clf trained on the other folds. cross_val_predict clones the estimator, so
# the already-fitted clf is left untouched.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=2525)
predict = cross_val_predict(clf, x, t, cv=cv, method='predict_proba')[:, 1]  # column 1 = positive class
score = roc_auc_score(t, predict)
print('score : ', '{:.5f}'.format(score))

In [None]:
# Overlay histograms of the predicted positive-class probability, one per true class.
plt.figure(figsize=(8, 4))
for class_value, legend_label in ((0, 'neg class'), (1, 'pos class')):
    plt.hist(predict[np.where(t == class_value)], bins=100, alpha=0.75, label=legend_label)
plt.legend()
plt.show()