In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt 
import seaborn as sns 
from collections import Counter 
plt.style.use("seaborn-whitegrid")
import plotly as py 
import plotly.graph_objs as go 
import plotly.figure_factory as ff 
from plotly.offline import init_notebook_mode,iplot
init_notebook_mode(connected=True)
import missingno as msno

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# Any results you write to the current directory are saved as output.

# ABOUT THE DATA
## FEATURES

* age
* sex       : 1= male, 0= female
* cp        : Types of Chest Pain 
                -1:Typical Angina(Usually pain, compression, pressure in chest, caused by ischemia or artery spasm in the heart muscle.)
                -2:Variant Angina(It is a type of angina characterized by temporary chest pain and ST elevation in ECG.)
                -3:Non-anginal pain
                -4:Asymptomatic         
* trestbps  : Resting blood pressure
* chol      : Cholesterol measurement
* fbs       : Fasting blood sugar>120
                -1=True
                -0=False
* restecg   : Resting electrocardiographic measurement
                -0:Normal 
                -1:Having ST-T wave abnormality
                -2:Showing probable or definite left ventricular hypertrophy by Estes' criteria
* thalach   : Maximum heart rate achieved
* exang     : Exercise induced angina (chest pain caused by decreased blood flow to the heart muscle)
                -1:Yes
                -0:No
* oldpeak   : ST depression induced by exercise relative to rest 
* slope     : The slope of the peak exercise ST segment 
                -1:upsloping
                -2:flat
                -3:downsloping
* ca        : The number of major vessels (0-3)
* thal      : It is an imaging method that helps to determine whether a person has coronary artery disease (CAD) by evaluating the amount of blood reaching the heart muscle.
                -3:Normal
                -6:Fixed defect
                -7:Reversible defect
* target    : Heart Disease
                -1:Yes 
                -0:No

In [None]:
# Load the heart-disease CSV into the DataFrame that the cells below analyse.
heart_case=pd.read_csv("/kaggle/input/heart-disease-uci/heart.csv")


## General Information About Columns
Looking at the general information below, we observe that:
* There are no NaN values in the data
* There are no object-type variables
* One variable (oldpeak) is float; the others are int

In [None]:
# Print dtypes and non-null counts, then show one flag telling whether
# any cell in the frame is missing.
heart_case.info()
heart_case.isnull().to_numpy().any()

In [None]:
# The data we obtained is visualized as follows.
# Visualization of the NaN values in the data with missingno's bar chart.
msno.bar(heart_case)
plt.show()

### Pandas_Profiling library was used for obtaining more information from data.

In [None]:
# Build a pandas_profiling report for the whole frame; being the last
# expression of the cell, the report object is the cell's output.
import pandas_profiling as pp
pp.ProfileReport(heart_case)


## Analysis of Data
#### As seen above
* Numeric : 6 
* Categorical : 4 
* Boolean : 4 

#### There is no NaN data in heart_case, but there are:
   * 1 duplicate row
   * ca : 175 zero values
   * oldpeak : 99 zero values

#### Types of data 
   * oldpeak : float
   * others : int 


In [None]:
# Annotated heatmap of the pairwise correlations between all columns.
corr_fig, corr_ax = plt.subplots(figsize=(11, 11))
sns.heatmap(heart_case.corr(), annot=True, fmt='.1f', cmap="coolwarm", ax=corr_ax)
plt.show()

### Considering the correlation table above, only values greater than 0.2 or less than -0.2 are taken into consideration.

#### Direct Correlated
* age - trestbps (.3)
* age - ca (.3)
* cp - thalach (.3)
* cp - target (.4)
* thalach - slope (.4) 
* thalach - target (.4)
* exang - oldpeak (.3)
* slope - target (.3)

#### Inversely Correlated
* age - thalach (-.4)
* sex - target (-.3)
* cp - exang (-.4)
* thalach - exang (-.4)
* thalach - oldpeak (-.3)
* exang -slope (-.3)
* exang - target (-.4)
* oldpeak - slope (-.6)
* oldpeak - target (-.4)
* ca - target (-.4)

#### We can say that fbs has no connection with other features, except trestbps (.2).
#### Also we can  say that restecg has no connection with other features, except chol(-.2).
 

1. Categorical Features : sex, cp, fbs, restecg, exang, slope, thal, target, ca
1. Numerical Features : age, trestbps, chol, thalach, oldpeak

In [None]:
                        # CATEGORICAL VARIABLE
categorical_features=["sex","cp","restecg","exang","slope","thal","target","ca"]
def visualization_bar(features):
    """
    Draw a bar plot of the value counts of one categorical column.

    Parameters
    ----------
    features : str
        Name of a column of the module-level ``heart_case`` DataFrame
        (normally one entry of ``categorical_features``).

    Returns
    -------
    None
        The function shows the figure, prints the value counts and then
        prints, for every distinct value, its share of the rows rounded
        to two decimals.
    """
    seri = heart_case[features]
    # Count once; value_counts() is ordered by descending frequency.
    variable_counts = seri.value_counts()
    unique_variable = variable_counts.index

    plt.figure(figsize=(7,7))
    # x and y are passed by keyword: seaborn >= 0.12 no longer accepts the
    # data vectors positionally. Plain values for y avoid index alignment.
    sns.barplot(x=unique_variable, y=variable_counts.values)
    plt.xticks(rotation=45)
    plt.title("VISUALİATİON: *{} FEATURE".format(features.upper()))
    plt.xlabel("***{}***".format(unique_variable))
    plt.ylabel("{} 's Counts".format(features))
    plt.show()

    print("{}".format(variable_counts))

    # Share of rows per value, derived from the counts instead of
    # re-filtering the whole frame for every value.
    total = len(seri)
    for value, count in variable_counts.items():
        print("{}: {}".format(value, round(int(count) / total, 2)))


for i in categorical_features:
    visualization_bar(i)
    

In [None]:
               #      VISUALIZATION WITH PIE PLOTS FOR CATEGORICAL VARIABLE
for i in categorical_features:
    # value_counts() yields each distinct value together with its frequency,
    # so labels and wedge sizes stay paired without any index juggling
    # (and a NaN in the column can no longer break the label lookup).
    value_frequencies = heart_case[i].value_counts()
    labels = [str(value) for value in value_frequencies.index]
    size = value_frequencies.tolist()
    explode = [0] * len(labels)  # no wedge is pulled out of the pie

    f, ax = plt.subplots(figsize=(9,9))
    ax.pie(size, labels=labels, explode=explode, autopct="%1.1f%%", shadow=True)
    ax.axis("equal")  # equal aspect ratio so the pie is drawn as a circle
    plt.title("{}".format(i.upper()))
    plt.show()
    

In [None]:
# Split the rows on the "sex" column: code 1 is the male subset, code 0 the female subset.
heart_case_male = heart_case.loc[heart_case["sex"] == 1]
heart_case_female = heart_case.loc[heart_case["sex"] == 0]

In [None]:
                    # DISTRIBUTION OF CATEGORICAL DATA ACCORDING TO FEMALE DATA
# "sex" is left out here because the frame is already restricted to one sex.
categorical_features=["cp","restecg","exang","slope","thal","target","ca"]
for i in categorical_features:
    seri = heart_case_female[i]
    unique_value = seri.unique()  # distinct values in order of first appearance
    print("Mean of Female's Data: {} ".format(i) )
    mean_list = []
    for a in unique_value:
        # Share of the female rows holding this value, rounded to 2 decimals.
        mean = round(int((seri == a).sum()) / len(seri), 2)
        mean_list.append(mean)
        print("{}: {}".format(a,mean,))

    # x and y are passed by keyword: seaborn >= 0.12 no longer accepts the
    # data vectors positionally.
    sns.barplot(x=unique_value, y=mean_list)
    plt.title("{} Bar Plot".format(i.upper()))
    plt.show()

In [None]:
                        # DISTRIBUTION OF CATEGORICAL DATA ACCORDING TO MALE DATA
# "sex" is left out here because the frame is already restricted to one sex.
categorical_features=["cp","restecg","exang","slope","thal","target","ca"]
for i in categorical_features:
    seri = heart_case_male[i]
    unique_value = seri.unique()  # distinct values in order of first appearance
    print("Mean of Male's Data: {}".format(i) )
    mean_list = []
    for a in unique_value:
        # Share of the male rows holding this value, rounded to 2 decimals.
        mean = round(int((seri == a).sum()) / len(seri), 2)
        mean_list.append(mean)
        print("{}: {}".format(a,mean,))

    # x and y are passed by keyword: seaborn >= 0.12 no longer accepts the
    # data vectors positionally.
    sns.barplot(x=unique_value, y=mean_list)
    plt.title("{} Bar Plot".format(i.upper()))
    plt.show()

* Analyzing the visualizations above in general gives the following results.
* Data: 68% male / 32% female
* 47% of the chest pain (cp) values are of type cp: 0.
            The correlation between cp and sex is 0, and cp: 0 is the most frequent type for both males and females.
* Among males, the rates of being sick and not being sick are almost equal.
* Among females, 75% are patients (target = 1).

In [None]:
            # FOR FEMALE
numeriacal_features=["age", "trestbps", "chol", "thalach", "oldpeak"]
# Row masks for the two target classes within the female subset.
female_target_1 = heart_case_female["target"] == 1
female_target_0 = heart_case_female["target"] == 0
for feature in numeriacal_features:
    feature_upper = feature.upper()

    # Feature against chest-pain type for the female rows, coloured by target.
    plt.figure(figsize=(20,6))
    plt.subplot(2,2,1)
    sns.swarmplot(data=heart_case_female, x="cp", y=feature, hue="target", size=10)
    plt.title("{}-CP".format(feature_upper))
    plt.show()

    # Count of every observed value split by target: first for the female
    # rows only, then for the complete data set as a reference.
    for panel, frame in ((2, heart_case_female), (3, heart_case)):
        plt.figure(figsize=(20,6))
        plt.subplot(2,2,panel)
        sns.countplot(x=feature, data=frame, hue="target", palette="GnBu", linewidth=3)
        plt.legend(loc="upper right")
        plt.title("{}'s HISTOGRAM".format(feature_upper))
        plt.show()

    # Means: female subset, whole data set, then the female subset per target class.
    female_mean = heart_case_female[feature].mean()
    overall_mean = heart_case[feature].mean()
    female_mean_1 = heart_case_female.loc[female_target_1, feature].mean()
    female_mean_0 = heart_case_female.loc[female_target_0, feature].mean()
    print("Mean of {}: {}".format(feature_upper, round(female_mean, 2)))
    print("All Data Mean of {}: {}".format(feature_upper, round(overall_mean, 2)))
    print("Mean of {}: {} for target(1)".format(feature_upper, round(female_mean_1, 2)))
    print("Mean of {}: {} for target(0)".format(feature_upper, round(female_mean_0, 2)))
    

In [None]:
            # FOR MALE
numeriacal_features=["age", "trestbps", "chol", "thalach", "oldpeak"]
# Row masks for the two target classes within the male subset.
male_target_1 = heart_case_male["target"] == 1
male_target_0 = heart_case_male["target"] == 0
for feature in numeriacal_features:
    feature_upper = feature.upper()

    # Feature against chest-pain type for the male rows, coloured by target.
    plt.figure(figsize=(10,6))
    plt.subplot(2,1,1)
    sns.swarmplot(data=heart_case_male, x="cp", y=feature, hue="target", size=10)
    plt.title("{}-CP".format(feature_upper))
    plt.show()

    # Count of every observed value split by target: first for the male
    # rows only, then for the complete data set as a reference.
    count_panels = (
        ((10,6), (2,1,2), heart_case_male),
        ((20,6), (2,2,3), heart_case),
    )
    for fig_size, grid_position, frame in count_panels:
        plt.figure(figsize=fig_size)
        plt.subplot(*grid_position)
        sns.countplot(x=feature, data=frame, hue="target", palette="GnBu", linewidth=3)
        plt.legend(loc="upper right")
        plt.title("{}'s HISTOGRAM".format(feature_upper))
        plt.show()

    # Means: male subset, whole data set, then the male subset per target class.
    male_mean = heart_case_male[feature].mean()
    overall_mean = heart_case[feature].mean()
    male_mean_1 = heart_case_male.loc[male_target_1, feature].mean()
    male_mean_0 = heart_case_male.loc[male_target_0, feature].mean()
    print("Mean of {}: {}".format(feature_upper, round(male_mean, 2)))
    print("All Data Mean of {}: {}".format(feature_upper, round(overall_mean, 2)))
    print("Mean of {}: {} for target(1)".format(feature_upper, round(male_mean_1, 2)))
    print("Mean of {}: {} for target(0)".format(feature_upper, round(male_mean_0, 2)))

### In the plots above, almost all of the patients are observed at ca: 0.
* So we can interpret ca: 0 as patient and ca: (1,2,3) as not patient.
### Looking at the age histograms above, the tendency to be sick or not is similar for females and males.
* The peak values (max or min) are also the same for both.

In [None]:
                                # NUMERICAL VARIABLE
    
numeriacal_features=["age", "trestbps", "chol", "thalach", "oldpeak", "ca"]
# The per-sex row subsets do not depend on the feature, so build them once.
female_rows = heart_case[heart_case.sex==0]
male_rows = heart_case[heart_case.sex==1]
for feature in numeriacal_features:
    plt.figure(figsize=(11,15))

    # Panel 1: distribution of the feature over all rows.
    plt.subplot(2,2,1)
    plt.hist(heart_case[feature], color="r")
    plt.title("HISTOGRAM {}".format(feature.upper()))
    plt.xlabel("{}".format(feature.upper()))

    # Panels 2 and 3: the feature against age, female rows then male rows.
    age_panels = (
        (2, female_rows, "Comparasion Between Famale Age vs {}"),
        (3, male_rows, "Comparasion Between Male Age vs {}"),
    )
    for panel, rows, title_template in age_panels:
        plt.subplot(2,2,panel)
        plt.scatter(rows.age, rows[feature])
        plt.title(title_template.format(feature.capitalize()))
        plt.xlabel("Age")
        plt.ylabel(feature)

    # Panel 4: the feature against chest-pain type, coloured by target.
    plt.subplot(2,2,4)
    sns.swarmplot(data=heart_case, x="cp", y=feature, hue="target")
    plt.show()

In [None]:
# EXAMINE SEX/ NUMERIC DATA WITH GROUP BY
# Mean of every numeric column per sex.
group_by_numerical = (
    heart_case
    # The columns are selected with a list; a bare tuple of names is
    # rejected by current pandas.
    .groupby("sex")[["age", "trestbps", "chol", "thalach", "oldpeak", "ca"]]
    .mean()
    # Name the rows from the sex code itself (1 = male, 0 = female).
    # Assigning the names by position after sorting on age would swap
    # them whenever the female row happens to sort first.
    .rename(index={1: "male", 0: "female"})
    .loc[["male", "female"]]
)
group_by_numerical.index.name="sex"
round(group_by_numerical,2)

In [None]:
# EXAMINE SEX/ NUMERIC DATA WITH GROUP BY
# Maximum of every numeric column per sex.
group_by_numerical = (
    heart_case
    # The columns are selected with a list; a bare tuple of names is
    # rejected by current pandas.
    .groupby("sex")[["age", "trestbps", "chol", "thalach", "oldpeak", "ca"]]
    .max()
    # Name the rows from the sex code itself (1 = male, 0 = female).
    # Assigning the names by position after sorting on age would swap
    # them whenever the female row happens to sort first.
    .rename(index={1: "male", 0: "female"})
    .loc[["male", "female"]]
)
group_by_numerical.index.name="sex"
round(group_by_numerical,2)

In [None]:
# EXAMINE SEX/ NUMERIC DATA WITH GROUP BY
# Minimum of every numeric column per sex.
group_by_numerical = (
    heart_case
    # The columns are selected with a list; a bare tuple of names is
    # rejected by current pandas.
    .groupby("sex")[["age", "trestbps", "chol", "thalach", "oldpeak", "ca"]]
    .min()
    # Name the rows from the sex code itself (1 = male, 0 = female).
    # Assigning the names by position after sorting on age would swap
    # them whenever the female row happens to sort first.
    .rename(index={1: "male", 0: "female"})
    .loc[["male", "female"]]
)
group_by_numerical.index.name="sex"
round(group_by_numerical,2)

### Look at the above data
#### TRESTBPS
1.      trestbps (120-140) is scattered across ages 40-70 in females.
1.      trestbps (120-140) is concentrated between ages 50-60 in males.
#### CHOL
1. While chol is scattered with respect to age for both females and males, the visualizations suggest that there may be a relationship between chol and sex.
                     * NOTE: sex and chol will be analyzed
                   
#### THALACH 
1. While thalach is scattered with respect to age for both females and males, the visualizations suggest that there may be a relationship between thalach and sex.
                    * NOTE: sex and thalach will be analyzed
#### CA
1. In females, it can be said that ca (0,1,2,3) is generally seen between ages 60-70, while ca (0) is only seen between ages 0-50.
1. In males, it can be said that ca (0,1,2,3) is generally seen between ages 40-70, while ca (0) is only seen between ages 0-40.


In [None]:
                        #  BOX PLOT FOR NUMERICAL VARIABLE
    
numeriacal_features=["age", "trestbps", "chol", "thalach", "oldpeak", "ca"]
# One figure per numeric column, with one box for every value of "sex".
for column_name in numeriacal_features:
    heart_case.boxplot(by="sex", column=column_name)
    plt.show()


In [None]:
numeriacal_features=["age", "trestbps", "chol", "thalach", "oldpeak", "ca"]
# One figure per numeric column, with one box for every value of "cp".
for column_name in numeriacal_features:
    heart_case.boxplot(by="cp", column=column_name)
    plt.show()


### When the boxplots were drawn by sex and cp, it was observed that none of the numerical data had outlier values.


### Before visualizing with the Seaborn and Plotly libraries, and using the inferences found with Matplotlib above, we first check for outliers; if the data has outliers, we clean them and then visualize the cleaned data.

In [None]:
variable=["age","trestbps","chol","thalach","oldpeak"]
def outlier_detect(df, features, min_features=3):
    """
    Find the rows of a DataFrame that are outliers in several columns.

    A value counts as an outlier for a column when it lies more than
    1.5 * IQR below the first quartile or above the third quartile of
    that column.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to inspect.
    features : iterable of str
        Names of the numeric columns to check.
    min_features : int, optional
        Minimum number of columns in which a row must be an outlier to be
        reported. The default of 3 keeps the original rule (flagged in
        more than two columns).

    Returns
    -------
    list
        Index labels of the reported rows, in order of first detection.
    """
    outlier_index=[]
    for i in features:
        Q1, Q3 = np.percentile(df[i], [25, 75])
        outlier_step = (Q3 - Q1) * 1.5
        is_outlier = (df[i] < Q1 - outlier_step) | (df[i] > Q3 + outlier_step)
        # Only the labels are needed, so index the row labels directly
        # instead of materialising the filtered frame.
        outlier_index.extend(df.index[is_outlier.to_numpy()])

    # Number of columns in which each row label was flagged.
    outlier_index_count=Counter(outlier_index)
    multi_index=[i for i, v in outlier_index_count.items() if v >= min_features]
    return multi_index

In [None]:
# Run the detector on the five continuous columns; the returned list is the cell output.
outlier_detect(
    df=heart_case,
    features=["age", "trestbps", "chol", "thalach", "oldpeak"],
)

### The "outlier_detect" function prepared above returns [] when the variables are ["age", "trestbps", "chol", "thalach", "oldpeak"]. Therefore, the visualizations in the seaborn and plotly libraries will be done on the available data.

In [None]:
# Age is kept as is; each of the other four columns is divided by its own
# maximum so that all of them can share one y axis.
Age = heart_case["age"]
Trestbps = heart_case["trestbps"].div(heart_case["trestbps"].max())
Chol = heart_case["chol"].div(heart_case["chol"].max())
Thalach = heart_case["thalach"].div(heart_case["thalach"].max())
Oldpeak = heart_case["oldpeak"].div(heart_case["oldpeak"].max())
Age_Trestbps = pd.concat(
    [Age, Trestbps, Chol, Thalach, Oldpeak],
    axis=1,
)

In [None]:
plt.figure(figsize=(15,11))

# One point plot per scaled column over age, all drawn on the same axes.
# The first entry passes no colour, so it keeps seaborn's default.
series_styles = (
    ("trestbps", {}),
    ("chol", {"color": "r"}),
    ("thalach", {"color": "g"}),
    ("oldpeak", {"color": "cyan"}),
)
for column_name, style_kwargs in series_styles:
    sns.pointplot(data=Age_Trestbps, x="age", y=column_name, **style_kwargs)

# Hand-placed text entries acting as the legend.
legend_entries = (
    (.8, "*thalach", "g"),
    (.75, "*trestbps", "b"),
    (.7, "*chol", "r"),
    (.65, "*oldpeak", "cyan"),
)
for y_position, caption, text_colour in legend_entries:
    plt.text(45, y_position, caption, color=text_colour, style="italic", size=20)
plt.show()

### When the visualization above is analyzed, trestbps, chol, and oldpeak generally increase with age, while thalach decreases with age.
### The trestbps and chol values rise and fall together, so we can say that chol and trestbps are directly proportional.