In [None]:
# https://www.kaggle.com/brsdincer/heart-attack-prediction-detailed-explanation
import numpy as np
import pandas as  pd
import matplotlib.pyplot as plt
from warnings import filterwarnings
from mpl_toolkits.mplot3d import Axes3D
import statsmodels.api as sm
import missingno as msno
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
from sklearn.neighbors import LocalOutlierFactor
from scipy.stats import levene
from scipy.stats import shapiro
from scipy.stats.stats import pearsonr
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, ShuffleSplit, GridSearchCV 
from sklearn.metrics import mean_squared_error, r2_score
from sklearn import model_selection
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import BaggingRegressor
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPRegressor
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LinearRegression
from sklearn.cross_decomposition import PLSRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import Lasso
from sklearn.linear_model import LassoCV
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import ElasticNetCV
from sklearn import linear_model
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier
import xgboost as xgb
from xgboost import XGBRegressor, XGBClassifier
from lightgbm import LGBMRegressor, LGBMClassifier
from catboost import CatBoostRegressor, CatBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn import tree
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, roc_auc_score, roc_curve
import seaborn as sns

In [None]:
# Silence every warning category for the remainder of the notebook session.
filterwarnings(action='ignore')

In [None]:
# DATA SOURCE
# read_csv needs the CSV file itself; the previous path stopped at the dataset
# directory, which cannot be parsed.
# NOTE(review): 'heart.csv' is the file name assumed for this Kaggle dataset - confirm.
Heart = pd.read_csv('../input/heart-attack-analysis-prediction-dataset/heart.csv')
data = Heart.copy()   # working copy; Heart keeps the frame exactly as loaded

# dataV: same rows, but the discrete columns are stored as pandas categoricals.
# The category dtype is what the box plots below rely on to treat 'output' as
# discrete groups instead of a numeric axis.
dataV = data.assign(**{
    name: pd.Categorical(data[name])
    for name in ('sex', 'cp', 'fbs', 'restecg', 'exng', 'slp', 'caa', 'thall', 'output')
})

# Keep only the columns stored with one of the listed numeric dtypes.
df = data.select_dtypes(include=['float64', 'int64', 'int32'])

In [None]:
# pd.Categorical(): converts values to the pandas category dtype.
# The string below is a reference note showing an example call and its repr.
"""
例
pd.Categorical(['a', 'b', 'c', 'a', 'b', 'c'])
['a', 'b', 'c', 'a', 'b', 'c']
Categories (3, object): ['a', 'b', 'c']
"""

In [None]:
# INFORMATION

In [None]:
"""
About dataset
Age : Age of the patient

Sex : Sex of the patient

exng : exercise induced angina (1 = yes; 0 = no)

caa : number of major vessels (0-3)

cp : chest pain type

Value 1: typical angina
Value 2: atypical angina
Value 3: non-anginal pain
Value 4: asymptomatic
trtbps : resting blood pressure (in mm Hg)

chol : cholesterol in mg/dl fetched via BMI sensor

fbs : (fasting blood sugar > 120 mg/dl) (1 = true; 0 = false)

restecg : resting electrocardiographic results

Value 0: normal
Value 1: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV)
Value 2: showing probable or definite left ventricular hypertrophy by Estes' criteria
thalachh : maximum heart rate achieved

output : 0 = less chance of heart attack, 1 = more chance of heart attack
"""

In [None]:
# Text overview of the dataset: shape, schema, summary statistics, correlations,
# level counts of the discrete columns, group means and missing-value counts.
# Each section is followed by a separator line.
print(data.shape)
print('------'*20)
print(data.columns)
print('------'*20)
# info() writes its report itself and returns None, so wrapping it in print()
# only added a stray "None" line.
data.info()
print('------'*20)
print(data.describe())
print('------'*20)
print(data.corr())      # pairwise correlation coefficients (visualised as heatmaps further down)
print('------'*20)

# value_counts(): number of occurrences of each distinct value in the column.
for column in ['sex', 'cp', 'fbs', 'restecg', 'exng', 'slp', 'caa', 'thall', 'output']:
    print(data[column].value_counts())
    print('------'*20)

# Mean of one measurement per (sex, output) group.
# The column has to be selected before aggregating: the first positional argument
# of GroupBy.mean is `numeric_only`, so .mean(['trtbps']) did not select 'trtbps'
# and all four calls printed the same table of every column's mean.
#   trtbps   - resting blood pressure (in mm Hg)
#   chol     - cholesterol in mg/dl
#   thalachh - maximum heart rate achieved
#   oldpeak  - (not described in the dataset note above)
for column in ['trtbps', 'chol', 'thalachh', 'oldpeak']:
    print(data.groupby(['sex', 'output'])[column].mean())
    print('------'*20)

print(data.isnull().sum())     # number of NaN entries per column

In [None]:
# CORRELATION - NORMALITY - HOMOGENEITY
# Pairwise correlation matrices of all columns:
#   pearson  - linear association
#   spearman - rank-based, monotonic association
corrPearson, corrSpearman = (data.corr(method=kind) for kind in ('pearson', 'spearman'))

# Comparison of the two methods: https://support.minitab.com/ja-jp/minitab/18/help-and-how-to/statistics/basic-statistics/supporting-topics/correlation-and-covariance/a-comparison-of-the-pearson-and-spearman-correlation-methods/

In [None]:
# Test of no correlation  :   https://liginc.co.jp/305968
# The note below (in Japanese) says: deciding whether the correlation coefficient
# computed from a sample is meaningful is called a "test of no correlation".
"""
ある標本をとって、その相関係数を求めたときに、その相関係数に意味があるのかどうかを決めることを「無相関検定」
"""

In [None]:
# PEARSON CORRELATION
figure, ax = plt.subplots(figsize=(10, 8))  # (width, height)
# annot=True writes the coefficient in each cell; vmin/vmax pin the colour scale to [-1, +1].
sns.heatmap(corrPearson, vmin=-1, vmax=+1, cmap='RdYlGn', annot=True, ax=ax)
ax.set_title('PEARSON')
ax.set_xlabel('COLUMNS')
ax.set_ylabel('COLUMNS')
plt.show()

In [None]:
# SPEARMAN CORRELATION
figure, ax = plt.subplots(figsize=(10, 8))
# Annotated heatmap on a fixed [-1, +1] colour scale.
sns.heatmap(corrSpearman, vmin=-1, vmax=+1, cmap='RdYlGn', annot=True, ax=ax)
ax.set_title('SPEARMAN')
ax.set_xlabel('COLUMNS')
ax.set_ylabel('COLUMNS')
plt.show()

In [None]:
# NORMALITY
# Shapiro-Wilk test per column. Null hypothesis: the sample x1, ..., xn was drawn
# from a normally distributed population.
for column in data.columns:
    print('-----'*10)
    statistic, p_value = shapiro(data[column])
    print('%.3f - %.3f' % (statistic, p_value))

# Printed as: test statistic - p-value

In [None]:
# HOMOGENEITY: Levene's test of whether several groups share the same variance.
# Every column is passed as its own group; prints "statistic - p-value".
print('%.4f - %.4f' % levene(*[
    data[name]
    for name in ("age", "sex", "cp", "trtbps", "chol", "fbs", "restecg",
                 "thalachh", "exng", "oldpeak", "slp", "caa", "thall", "output")
]))

In [None]:
# Same Levene test, left as the cell's last expression so the notebook displays
# the result object itself.
levene(*[
    data[name]
    for name in ("age", "sex", "cp", "trtbps", "chol", "fbs", "restecg",
                 "thalachh", "exng", "oldpeak", "slp", "caa", "thall", "output")
])
# Fields: test statistic, p-value

In [None]:
# VISUALIZATION
# HIST
# One histogram panel per column of `data`, drawn on a single 20x10 figure.
data.hist(figsize=(20, 10))
plt.show()

In [None]:
# BOX
# Distribution of 'trtbps' for each level of the categorical 'output' column.
figure, ax = plt.subplots(figsize=(20, 8))
sns.boxplot(data=dataV, x='trtbps', y='output', ax=ax)
plt.show()

In [None]:
# Distribution of 'chol' for each level of the categorical 'output' column.
figure, ax = plt.subplots(figsize=(20, 8))
sns.boxplot(data=dataV, x='chol', y='output', ax=ax)
plt.show()

In [None]:
# Distribution of 'thalachh' for each level of the categorical 'output' column.
figure, ax = plt.subplots(figsize=(20, 8))
sns.boxplot(data=dataV, x='thalachh', y='output', ax=ax)
plt.show()

In [None]:
# Distribution of 'oldpeak' for each level of the categorical 'output' column.
figure, ax = plt.subplots(figsize=(20, 8))
sns.boxplot(data=dataV, x='oldpeak', y='output', ax=ax)
plt.show()

In [None]:
# Distribution of 'age' for each level of the categorical 'output' column.
figure, ax = plt.subplots(figsize=(20, 8))
sns.boxplot(data=dataV, x='age', y='output', ax=ax)
plt.show()

In [None]:
# Same box plot, but fed the numeric frame (data=data) instead of dataV.
figure, ax = plt.subplots(figsize=(20, 8))
sns.boxplot(data=data, x='age', y='output', ax=ax)
plt.show()

# Author's explanation for the different picture (not verified here): 'output' is
# seen as a number, so its numeric range is taken into account; it has to be a
# category dtype to be treated as discrete values.

In [None]:
# BAR
# sns.barplot(): shows the mean of y for each value of x.
figure, ax = plt.subplots(figsize=(20, 8))
# The numeric frame is used on purpose: per the author's note, data=dataV raises
# an error because the value is not numeric (presumably due to the category dtype).
sns.barplot(data=data, x='sex', y='output', ax=ax)

plt.show()

In [None]:
# Mean of 'output' for each value of 'cp'.
figure, ax = plt.subplots(figsize=(20, 8))
sns.barplot(data=data, x='cp', y='output', ax=ax)
plt.show()

In [None]:
# Mean of 'output' for each value of 'fbs'.
figure, ax = plt.subplots(figsize=(20, 8))
sns.barplot(data=data, x='fbs', y='output', ax=ax)
plt.show()

In [None]:
# Mean of 'output' for each value of 'restecg'.
figure, ax = plt.subplots(figsize=(20, 8))
sns.barplot(data=data, x='restecg', y='output', ax=ax)
plt.show()

In [None]:
# Mean of 'output' for each value of 'exng'.
figure, ax = plt.subplots(figsize=(20, 8))
sns.barplot(data=data, x='exng', y='output', ax=ax)
plt.show()

In [None]:
# Mean of 'output' for each value of 'slp'.
figure, ax = plt.subplots(figsize=(20, 8))
sns.barplot(data=data, x='slp', y='output', ax=ax)
plt.show()

In [None]:
# Mean of 'output' for each value of 'caa'.
figure, ax = plt.subplots(figsize=(20, 8))
sns.barplot(data=data, x='caa', y='output', ax=ax)
plt.show()

In [None]:
# Mean of 'output' for each value of 'thall'.
figure, ax = plt.subplots(figsize=(20, 8))
sns.barplot(data=data, x='thall', y='output', ax=ax)
plt.show()

In [None]:
# LINE
# One line plot of 'output' against every column except the last one in the frame
# (per the author's note the last column is 'output' itself).
for column in data.columns[:-1]:
    figure, ax = plt.subplots(figsize=(20, 8))
    sns.lineplot(data=data, x='output', y=column, ax=ax)
    plt.show()

In [None]:
# 3D
#%matplotlib inline
#%matplotlib nbagg
fig = plt.figure(figsize=(10, 10))
# Ask the figure for 3D axes. Constructing Axes3D(fig) directly no longer attaches
# the axes to the figure in current Matplotlib releases, which leaves the canvas empty.
ax = fig.add_subplot(111, projection='3d')
ax.set_xlabel('OUTPUT', fontsize=20)
ax.set_ylabel('TRTBPS', fontsize=20)
ax.set_zlabel('CHOL', fontsize=20)
# s sets the marker size, alpha the marker opacity.
ax.scatter(xs=dataV['output'], ys=dataV['trtbps'], zs=dataV['chol'], c='red', s=20, alpha=0.5)
plt.show()

In [None]:
fig = plt.figure(figsize=(10, 10))
# Ask the figure for 3D axes. Constructing Axes3D(fig) directly no longer attaches
# the axes to the figure in current Matplotlib releases, which leaves the canvas empty.
ax = fig.add_subplot(111, projection='3d')
ax.set_xlabel('OUTPUT', fontsize=20)
ax.set_ylabel('THALACHH', fontsize=20)
ax.set_zlabel('OLDPEAK', fontsize=20)
# 3D scatter of output / thalachh / oldpeak with small, half-transparent green markers.
ax.scatter3D(dataV['output'], dataV['thalachh'], dataV['oldpeak'], c='green', s=20, alpha=0.5)
plt.show()

In [None]:
# AGAINST VALUES
# Independent copy of the numeric frame for the outlier analysis below.
DataForA = data.copy(deep=True)

In [None]:
# Outlier detection with Local Outlier Factor (default settings).
# Details: https://hktech.hatenablog.com/entry/2018/09/04/002034
clf = LocalOutlierFactor()
clf.fit_predict(X=DataForA)

# In the displayed labels, 1 marks an inlier and -1 marks an outlier.

In [None]:
# Per-row LOF scores from the fitted detector, plus an ascending-sorted copy;
# the 30 smallest scores are printed.
score = clf.negative_outlier_factor_
scoresorted = np.sort(clf.negative_outlier_factor_)
print(scoresorted[0:30])

In [None]:
# Threshold score: the 13th smallest LOF score (index 12 of the sorted array).
point = scoresorted[12]
# Show the row(s) whose score equals the threshold exactly.
print(DataForA.loc[score == point])

In [None]:
# Rows whose LOF score is below the threshold.
# The mask must be built from the per-row scores: the previous expression compared
# every feature value in DataForA with the score threshold, which mixes unrelated
# quantities and does not select rows by their outlier score.
against = score < point
# notna() is False for NaN, so the sum is the per-column count of non-missing
# values among the selected rows.
print(DataForA[against].notna().sum())

In [None]:
# Rows whose LOF score is above the threshold.
# The mask must be built from the per-row scores: the previous expression compared
# every feature value in DataForA with the score threshold, which mixes unrelated
# quantities and does not select rows by their outlier score.
values = score > point
# Per-column count of non-missing values among the selected rows.
print(DataForA[values].notna().sum())

In [None]:
# MODELS
# X & Y FOR MODELS
# Target is the 'output' column; every other column is a feature.
y = data['output']
x = data.drop(columns=['output'])

# 80/20 split with a fixed seed so the partition is repeatable.
xTrain, xTest, yTrain, yTest = train_test_split(x, y, random_state=42, test_size=0.2)

In [None]:
# REGRESSION MODELS
# Every regressor is fitted on the training split. The links are the author's
# reading notes for each method.

lm = LinearRegression().fit(xTrain, yTrain)

# PLS: one option when the sample is small or multicollinearity is suspected.
# https://punhundon-lifeshift.com/pca_pls
pls = PLSRegression().fit(xTrain, yTrain)

# Ridge: one of the techniques for limiting over-fitting.
# https://qiita.com/K_Noguchi/items/3f5cf527d6f6d46767fb#:~:text=%E3%83%AA%E3%83%83%E3%82%B8%E5%9B%9E%E5%B8%B0%E3%81%A8%E3%81%AF%E9%81%8E,%E4%B8%80%E3%81%A4%E3%81%A8%E3%82%82%E8%A8%80%E3%81%88%E3%81%BE%E3%81%99%E3%80%82
ridge = Ridge().fit(xTrain, yTrain)

# Lasso: prunes features that are not needed.
lasso = Lasso().fit(xTrain, yTrain)

# ElasticNet: a hybrid of Ridge and Lasso regression.
# https://leck-tech.com/machine-learning/elastic-net#Lasso
elasticnet = ElasticNet().fit(xTrain, yTrain)

# k-nearest neighbours.
knnr = KNeighborsRegressor().fit(xTrain, yTrain)

# Decision tree (regression tree).
cartr = DecisionTreeRegressor(random_state=42).fit(xTrain, yTrain)

baggr = BaggingRegressor(
    bootstrap_features=True, random_state=42, verbose=False
).fit(xTrain, yTrain)

# https://qiita.com/yshi12/items/6d30010b353b084b3749
rfr = RandomForestRegressor(verbose=False, random_state=42).fit(xTrain, yTrain)

# Gradient tree boosting (GBRT): boosting generalised to arbitrary differentiable
# loss functions.  https://qiita.com/nazoking@github/items/51a46256ecda598b60dd
gbmr = GradientBoostingRegressor(verbose=False).fit(xTrain, yTrain)

# http://tekenuko.hatenablog.com/entry/2016/09/22/220814
xgbr = XGBRegressor().fit(xTrain, yTrain)

# https://www.codexa.net/lightgbm-beginner/
lgbmr = LGBMRegressor().fit(xTrain, yTrain)

# CatBoost: strong with categorical variables.
# https://ryucoding.com/programming/catboost-beginner
catbr = CatBoostRegressor(verbose=False).fit(xTrain, yTrain)

In [None]:
# COMPARISON
# Every regressor fitted above takes part in the comparison. `elasticnet` was
# fitted alongside the others but had been left out of this list.
models = [lm, pls, ridge, lasso, elasticnet, knnr, cartr, baggr, rfr, gbmr, xgbr, lgbmr, catbr]

In [None]:
# For each regressor: mean 10-fold cross-validated R^2 and the root of the mean
# cross-validated squared error.
# Cross-validation background: https://docs.pyq.jp/python/machine_learning/glossary/cross_validation.html
# NOTE(review): both scores are cross-validated on the test split; as far as I know
# cross_val_score refits a clone, so the fit on xTrain is not what gets scored - confirm
# this is the intended protocol.
for model in models:
    name = type(model).__name__
    R2CV = cross_val_score(model, xTest, yTest, scoring='r2', cv=10).mean()
    error = -cross_val_score(model, xTest, yTest, scoring='neg_mean_squared_error', cv=10).mean()
    print(
        f'{name}:',
        '-'*10,
        R2CV,              # closer to 1 means the model describes the data better
        np.sqrt(error),    # square root of the mean squared error
        '-'*30,
        sep='\n',
    )

In [None]:
# Collect one (model name, mean CV R^2 in percent) record per regressor and build
# the frame once. DataFrame.append was removed in pandas 2.0, and growing a frame
# row by row copies it on every iteration.
records = []
for model in models:
    name = model.__class__.__name__
    R2CV = cross_val_score(model, xTest, yTest, cv=10, scoring='r2').mean()
    records.append({'MODELS': name, 'R2CV': R2CV*100})
r = pd.DataFrame(records, columns=['MODELS', 'R2CV'])

# Horizontal bar chart: one bar per model, x axis limited to [-50, 100].
figure = plt.figure(figsize=(20, 8))
sns.barplot(x='R2CV', y='MODELS', data=r, color='k')
plt.xlabel('R2CV', fontsize=20)
plt.ylabel('MODELS', fontsize=20)
plt.xlim(-50, 100)
plt.title('MODEL ACCURACY COMPARISON')
plt.show()

In [None]:
# SPECIAL REGRESSION MODELS
# OLS
# statsmodels' OLS does not add an intercept on its own; without add_constant the
# model is forced through the origin and the summary reports an uncentered R^2.
ols = sm.OLS(yTrain, sm.add_constant(xTrain)).fit()
ols.summary()

In [None]:
# PCA
# Multivariate technique commonly used for dimensionality reduction.
# https://qiita.com/maskot1977/items/082557fcda78c4cdb41f
pca = PCA()
# Standardisation and principal axes are learned from the training split only and
# then re-applied to the test split. Re-fitting on xTest (as fit_transform(scale(xTest))
# did) projects the test rows onto different components than the training rows.
pca_scaler = StandardScaler().fit(xTrain)
xRTrain = pca.fit_transform(pca_scaler.transform(xTrain))
xRTest = pca.transform(pca_scaler.transform(xTest))

In [None]:
# Linear regression on the PCA-transformed features.
lmP = LinearRegression()
lmP.fit(xRTrain, yTrain)
# Mean 10-fold CV R^2 and mean CV squared error, both computed on the transformed test split.
R2CV = cross_val_score(lmP, xRTest, yTest, scoring='r2', cv=10).mean()
error = -cross_val_score(lmP, xRTest, yTest, scoring='neg_mean_squared_error', cv=10).mean()
print(R2CV, '-----'*10, np.sqrt(error), sep='\n')

In [None]:
# NOTE(review): this cell repeats the preceding linear-regression-on-PCA evaluation
# verbatim - confirm whether a different model was intended here.
lmP = LinearRegression()
lmP.fit(xRTrain, yTrain)
R2CV = cross_val_score(lmP, xRTest, yTest, scoring='r2', cv=10).mean()
error = -cross_val_score(lmP, xRTest, yTest, scoring='neg_mean_squared_error', cv=10).mean()
print(R2CV, '-----'*10, np.sqrt(error), sep='\n')

In [None]:
# Neural-network regressor on the PCA-transformed features.
# NOTE(review): no random_state is passed, so scores presumably vary between runs.
mlpr = MLPRegressor()
mlpr.fit(xRTrain, yTrain)

R2CV = cross_val_score(mlpr, xRTest, yTest, scoring='r2', cv=10).mean()
error = -cross_val_score(mlpr, xRTest, yTest, scoring='neg_mean_squared_error', cv=10).mean()

print(R2CV, '-----'*10, np.sqrt(error), sep='\n')

In [None]:
# CLASSIFICATION MODELS
# Every classifier is fitted on the untransformed training split.

# Linear / probabilistic / distance-based
lj = LogisticRegression(solver='liblinear').fit(xTrain, yTrain)
gnb = GaussianNB().fit(xTrain, yTrain)
knnc = KNeighborsClassifier().fit(xTrain, yTrain)

# Single tree and bagged trees (seeded)
cartc = DecisionTreeClassifier(random_state=42).fit(xTrain, yTrain)
rfc = RandomForestClassifier(verbose=False, random_state=42).fit(xTrain, yTrain)

# Boosted trees
gbmc = GradientBoostingClassifier(verbose=False).fit(xTrain, yTrain)
xgbc = XGBClassifier().fit(xTrain, yTrain)
lgbmc = LGBMClassifier().fit(xTrain, yTrain)
catbc = CatBoostClassifier(verbose=False).fit(xTrain, yTrain)

In [None]:
# COMPARISON
# Fitted classifiers, in the order they are reported below.
modelsc = [
    lj, gnb, knnc,
    cartc, rfc,
    gbmc, xgbc, lgbmc, catbc,
]

In [None]:
# For each classifier: hold-out accuracy of the fitted model, then the mean 10-fold
# CV score and the root of the mean CV squared error.
# NOTE(review): no `scoring` is given for R2CV, so it uses the estimator's default
# score (accuracy for these classifiers, as far as I know), not R^2. Both CV calls
# run on the test split.
for model in modelsc:
    name = type(model).__name__
    predict = model.predict(xTest)
    R2CV = cross_val_score(model, xTest, yTest, verbose=False, cv=10).mean()
    error = -cross_val_score(model, xTest, yTest, verbose=False, cv=10,
                             scoring='neg_mean_squared_error').mean()
    print(
        f'{name}:',
        '-'*10,
        accuracy_score(yTest, predict),
        R2CV,
        np.sqrt(error),
        '-'*30,
        sep='\n',
    )

In [None]:
# Collect one (model name, mean CV score in percent) record per classifier and
# build the frame once. DataFrame.append was removed in pandas 2.0, and growing a
# frame row by row copies it on every iteration.
records = []
for model in modelsc:
    name = model.__class__.__name__
    R2CV = cross_val_score(model, xTest, yTest, cv=10, verbose=False).mean()
    records.append({'MODELS': name, 'R2CV': R2CV*100})
r = pd.DataFrame(records, columns=['MODELS', 'R2CV'])

# Horizontal bar chart: one bar per model, x axis limited to [0, 100].
figure = plt.figure(figsize=(20, 8))
sns.barplot(x='R2CV', y='MODELS', data=r, color='k')
plt.xlabel('R2CV', fontsize=20)
plt.ylabel('MODELS', fontsize=20)
plt.xlim(0, 100)
plt.title('MODEL ACCURACY COMPARISON')
plt.show()

In [None]:
# LGBMClassifier is the best

In [None]:
# SPECIAL CLASSIFICATION MODELS
# ANN C MODELS & ERROR & TUNING & PREDICT
# Standardise the features: the scaler is fitted on the training split and the
# same transform is applied to both splits.
scaler = StandardScaler()
scaler.fit(xTrain, yTrain)
xRTrain, xRTest = scaler.transform(xTrain), scaler.transform(xTest)

In [None]:
# Neural-network classifier on the standardised features.
mlpc = MLPClassifier()
mlpc.fit(xRTrain, yTrain)
predict = mlpc.predict(xRTest)

# Mean 10-fold CV score on the standardised test split.
R2CV = cross_val_score(mlpc, xRTest, yTest, cv=10).mean()
print(R2CV)
# Root of the mean squared error of the hold-out predictions.
error = mean_squared_error(y_true=yTest, y_pred=predict)
print(np.sqrt(error))

In [None]:
# TUNING FOR BEST MODEL
# Search grid for the grid search: every combination of these values is tried.
params = dict(
    n_estimators=[100, 500, 1000, 2000],
    subsample=[0.6, 0.8, 1.0],
    max_depth=[3, 4, 5, 6],
    learning_rate=[0.1, 0.01, 0.02, 0.05],
    min_child_samples=[5, 10, 20],
)

In [None]:
# Exhaustive 10-fold grid search over `params` for the LightGBM classifier,
# using all available cores; reports the winning combination and its mean CV score.
cv = GridSearchCV(estimator=lgbmc, param_grid=params, n_jobs=-1, verbose=False, cv=10)
cv.fit(xTrain, yTrain)
print(cv.best_params_)
print(cv.best_score_)

In [None]:
# FINAL MODEL
# LightGBM classifier with fixed hyper-parameters, fitted on the training split.
# NOTE(review): the values are hard-coded, presumably copied from an earlier grid
# search run - confirm they still match cv.best_params_.
lgbmctuned = LGBMClassifier(
    n_estimators=100,
    learning_rate=0.01,
    max_depth=5,
    min_child_samples=10,
    subsample=0.6,
)
lgbmctuned.fit(xTrain, yTrain)

# Mean 10-fold CV score, then the root of the mean CV squared error (both on the test split).
R2CVtuned = cross_val_score(lgbmctuned, xTest, yTest, cv=10).mean()
print(R2CVtuned)
errortuned = -cross_val_score(lgbmctuned, xTest, yTest, scoring='neg_mean_squared_error', cv=10).mean()
print(np.sqrt(errortuned))