# INTRODUCTION

Objectives:
1. Выбрать данные - https://www.kaggle.com/ronitf/heart-disease-uci
2. Посчитать нижеперечисленное

Answer questions:
1. Среднее, медиана, дисперсия, мода. 
2. Skewness (коэффициент асимметрии), kurtosis (коэффициент эксцесса)
3. Chi-squared test, Kolmogorov–Smirnov test
4. Box-Cox transformation (power transform)
5. Корреляции - Пирсон, Спирман, Кендалл
6. Standard score, robust standard score


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import pandas_profiling

# Matplotlib forms the basis for visualization in Python
import matplotlib.pyplot as plt

# Seaborn is used for the statistical plots; sns.set() switches on its default styling
import seaborn as sns
sns.set()

# Graphics in retina format are sharper and more legible
%config InlineBackend.figure_format = 'retina'

# Increase the default figure size
from pylab import rcParams
rcParams['figure.figsize'] = 8, 5

import os
# Print the full path of every file found below the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the heart-disease table from the Kaggle input directory
df = pd.read_csv(filepath_or_buffer="/kaggle/input/heart-disease-uci/heart.csv")

# Preview the first 5 rows
df.head(n=5)

In [None]:
# Build an automated profiling report for the whole DataFrame
profile = pandas_profiling.ProfileReport(df)
# Bare expression on the last line of the cell so that the notebook renders the report
profile

Data contains 14 columns: <br>

* age - age in years <br>
* sex - (1 = male; 0 = female) <br>
* cp - chest pain type <br>
* trestbps - resting blood pressure (in mm Hg on admission to the hospital) <br>
* chol - serum cholesterol in mg/dl <br>
* fbs - (fasting blood sugar > 120 mg/dl) (1 = true; 0 = false) <br>
* restecg - resting electrocardiographic results <br>
* thalach - maximum heart rate achieved <br>
* exang - exercise induced angina (1 = yes; 0 = no) <br>
* oldpeak - ST depression induced by exercise relative to rest <br>
* slope - the slope of the peak exercise ST segment <br>
* ca - number of major vessels (0-3) colored by fluoroscopy <br>
* thal - 3 = normal; 6 = fixed defect; 7 = reversible defect <br>
* target - have disease or not (1=yes, 0=no)

# Initial exploration

In [None]:
# Number of rows for each value of the target column
df['target'].value_counts()

In [None]:
# Per-column mean, computed separately for each target value
df.groupby(by='target').mean()

In [None]:
# Dimensions of the table as (rows, columns)
df.shape

In [None]:
# Print a concise summary of the dataset (columns, non-null counts, dtypes)
df.info()

In [None]:
# This command partially covers item 1 (the statistics are also computed separately later on)
df.describe()

# Some visualizations

In [None]:
# Histograms of resting blood pressure, resting ECG result and maximum heart rate
features = ['trestbps', 'restecg', 'thalach']
df.loc[:, features].hist(figsize=(10, 4));

In [None]:
# Target value counts broken down by age, drawn as a bar chart
age_by_target = pd.crosstab(df['age'], df['target'])
age_by_target.plot.bar(figsize=(20, 6))

In [None]:
# Target value counts broken down by sex, drawn as a bar chart
sex_by_target = pd.crosstab(df['sex'], df['target'])
sex_by_target.plot.bar(figsize=(15, 6), color=['#1CA53B', '#AA1111'])

# Author's reading of the chart: heart disease is considerably more frequent among
# women than among men, and there are more sick than healthy people overall.
# NOTE(review): "more frequent" presumably refers to the share within each sex - confirm.

In [None]:
# Distribution plot of serum cholesterol
sns.distplot(df['chol'])

In [None]:
# `pairplot()` may become very slow with the SVG or retina format,
# so the inline backend is switched to plain PNG for this cell
%config InlineBackend.figure_format = 'png'
# Pairwise plots of four of the numeric columns
sns.pairplot(df[['thalach','chol', 'trestbps', 'oldpeak']]);

In [None]:
# Switch the inline backend back to the retina format used by the rest of the notebook
%config InlineBackend.figure_format = 'retina'

In [None]:
# chol    - serum cholesterol in mg/dl
# thalach - maximum heart rate achieved
# sex     - (1 = male; 0 = female)

# Scatter of cholesterol against maximum heart rate, colored by sex (no regression line).
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally,
# while the keyword form works on older releases as well.
sns.lmplot(x='thalach', y='chol', data=df, hue='sex', fit_reg=False);


# 1. Среднее, медиана, дисперсия, мода

In [None]:
# Mean of a single column
df.age.mean()

In [None]:
# Median of the same column, computed the same way
df.age.median()

In [None]:
# Variance of every column
df.var()

In [None]:
# Variance of every column, drawn as a line plot
column_variance = df.var()
column_variance.plot.line()

In [None]:
# Mode of every column
df.mode()

# Author's observation: for cholesterol three values tie for the largest number of rows - 197, 204, 234
# df[df['chol']==197].value_counts('chol')

# 2. Skewness (коэффициент асимметрии), kurtosis (коэффициент эксцесса)

In [None]:
# Skewness of two columns:
#   thalach - maximum heart rate achieved
#   chol    - serum cholesterol in mg/dl
df.loc[:, ['thalach', 'chol']].skew()

In [None]:
# Distribution plot of the maximum heart rate achieved
# NOTE(review): distplot is deprecated in newer seaborn releases - consider histplot/displot
sns.distplot(df['thalach'])

In [None]:
# Distribution plot of serum cholesterol
# NOTE(review): distplot is deprecated in newer seaborn releases - consider histplot/displot
sns.distplot(df['chol'])

In [None]:
# Kurtosis of every column (`kurt` is the same pandas method as `kurtosis`)
df.kurt()

In [None]:
# Histograms of the two columns whose skewness and kurtosis were inspected
features = ['thalach', 'chol']
df.loc[:, features].hist(figsize=(10, 4));

In [None]:
# Box plot of the maximum heart rate achieved (thalach)
sns.boxplot(x='thalach', data=df);

In [None]:
# Side-by-side box plots of three numeric columns on one shared axis
sns.boxplot(data=df[['thalach','chol', 'trestbps']]);

In [None]:
# Violin plot of serum cholesterol
sns.violinplot(data=df['chol']);

In [None]:
# Scatter plot: resting blood pressure (x) against maximum heart rate (y)
plt.scatter(x=df['trestbps'], y=df['thalach']);

In [None]:
# Joint scatter plot of maximum heart rate against cholesterol
sns.jointplot(data=df, x='thalach', y='chol', kind='scatter');

# 3. Chi-squared test, Kolmogorov–Smirnov test

**Chi-squared test**

In [None]:
# No suitable function was found in pandas, so chi2_contingency from SciPy is used:

import scipy.stats as stats

In [None]:
# chi2_contingency expects a contingency table of observed COUNTS, not raw measurements.
# Both continuous columns are therefore binned into quartiles and cross-tabulated first;
# the test then checks independence of cholesterol level and maximum heart rate.
chol_bins = pd.qcut(df['chol'], q=4, duplicates='drop')
thalach_bins = pd.qcut(df['thalach'], q=4, duplicates='drop')
contingency = pd.crosstab(chol_bins, thalach_bins)

# stat - chi-squared statistic, p - p-value, dof - degrees of freedom,
# expected - expected counts under the independence hypothesis
stat, p, dof, expected = stats.chi2_contingency(contingency)

In [None]:
# Degrees of freedom reported by the chi-squared test
print(f'dof={dof:d}')

In [None]:
# Critical value of the chi-squared distribution at the 0.95 quantile for `dof` degrees of freedom
prob = 0.95
critical = stats.chi2.ppf(prob, dof)
print(f'probability={prob:.3f}, critical={critical:.3f}, stat={stat:.3f}')

In [None]:
# Decision rule:
#   statistic >= critical value -> significant result, reject H0 (variables are dependent);
#   statistic <  critical value -> not significant, fail to reject H0 (variables are independent).
# Author's note: H0 is not rejected here at the 0.05 significance level
# (verify against the printed output).
verdict = 'Dependent (reject H0)' if abs(stat) >= critical else 'Independent (fail to reject H0)'
print(verdict)

**Kolmogorov-Smirnov test**

In [None]:
# Kolmogorov-Smirnov test of df['chol'] against a normal distribution whose mean and
# standard deviation are taken from the sample itself.
# Reading the result: p-value > 0.05 -> the normality hypothesis is NOT rejected;
#                     p-value <= 0.05 -> the normality hypothesis is rejected.
# NOTE(review): the parameters are estimated from the same data that is being tested,
# which presumably makes the plain KS p-value too optimistic (Lilliefors case) - confirm.
stats.kstest(df['chol'], cdf='norm', args=(df['chol'].mean(), df['chol'].std()))



# 4. Box-Cox transformation (Power transform)

In [None]:
from scipy.stats import boxcox

In [None]:
# Histogram of resting blood pressure before the transformation
df.trestbps.hist()

In [None]:
# Box-Cox transform of resting blood pressure. The result is indexed as data[0] and data[1]
# in the following cells - presumably (transformed values, fitted lambda), since no lambda
# is passed; confirm against the SciPy documentation.
data=boxcox(df['trestbps'])

In [None]:
# Store the first element of the boxcox result as a new column
df['trestbps_bc']=data[0]

In [None]:
# Store the second element of the boxcox result as a new column.
# NOTE(review): this is presumably the fitted lambda (a scalar), which would make the column
# constant; it then also enters the later whole-frame statistics (df.corr, zscore) -
# confirm this is intended.
df['trestbps_y']=data[1]

In [None]:
# Show the original column next to the two columns derived from the boxcox result
df[['trestbps','trestbps_bc','trestbps_y']]

In [None]:
# Histograms of resting blood pressure before and after the Box-Cox transform
df.loc[:, ['trestbps', 'trestbps_bc']].hist()

# 5. Корреляции - Пирсон, Спирман, Кендалл

In [None]:
# Pearson correlation matrix of all columns
df.corr(method='pearson')

In [None]:
# Pearson correlation matrix rendered as an annotated heatmap
Var_Corr = df.corr(method='pearson')
plt.subplots(figsize=(15, 10))
sns.heatmap(
    Var_Corr,
    annot=True,
    xticklabels=Var_Corr.columns,
    yticklabels=Var_Corr.columns,
)

In [None]:
# Spearman rank correlation matrix of all columns
df.corr(method='spearman')

In [None]:
# Spearman correlation matrix rendered as an annotated heatmap
Var_Corr = df.corr(method='spearman')
plt.subplots(figsize=(15, 10))
sns.heatmap(
    Var_Corr,
    annot=True,
    xticklabels=Var_Corr.columns,
    yticklabels=Var_Corr.columns,
)

In [None]:
# Kendall tau correlation matrix of all columns
df.corr(method='kendall')

In [None]:
# Kendall correlation matrix rendered as an annotated heatmap
Var_Corr = df.corr(method='kendall')
plt.subplots(figsize=(15, 10))
sns.heatmap(
    Var_Corr,
    annot=True,
    xticklabels=Var_Corr.columns,
    yticklabels=Var_Corr.columns,
)

# 6. Standard score, robust standard score

In [None]:
# Compute the z score of each value relative to the mean and standard deviation
# of its own column (feature).
# axis=0 standardizes down each column; the previous axis=1 standardized across the
# different features of a single row, which mixes unrelated units.
# A column with zero variance yields NaN.
stats.zscore(df, axis=0)

In [None]:
# Robust standard score: (x - median) / (1.4826 * MAD), where MAD is the median absolute
# deviation from the column median. The factor 1.4826 makes the MAD comparable to the
# standard deviation for normally distributed data.
# This replaces DataFrame.mad(), which returned the MEAN absolute deviation (not a robust
# score) and was removed in pandas 2.0.
col_median = df.median()
col_mad = (df - col_median).abs().median()
# A zero MAD (e.g. a mostly-constant binary column) would divide by zero; mark it as NaN instead
robust_z = (df - col_median) / (1.4826 * col_mad.replace(0, np.nan))
robust_z