In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        file_path = os.path.join(folder, name)
        print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Data Science Road Ahead Map - 1

This notebook will be a comprehensive guide who wants to learn data science basics.

We will insvestigate Heart Disease UCI dataset which is well known for machine learning practices.
Dataset consists of 13 attributes:

1. age
2. sex (1 = male; 0 = female)
3. chest pain type (4 values)
4. resting blood pressure
5. serum cholestoral in mg/dl
6. fasting blood sugar > 120 mg/dl (1 = true; 0 = false)
7. resting electrocardiographic results (values 0,1,2)
8. maximum heart rate achieved
9. exercise induced angina (1 = yes; 0 = no)
10. oldpeak = ST depression induced by exercise relative to rest
11. the slope of the peak exercise ST segment
12. number of major vessels (0-3) colored by flourosopy
13. thal: 3 = normal; 6 = fixed defect; 7 = reversable defect

Given the information we can start with reading data.

In [None]:
# Read the heart-disease CSV from the Kaggle input directory into a DataFrame.
heart_csv_path = '../input/heart-disease-uci/heart.csv'
data = pd.read_csv(heart_csv_path)

# 1. First,investigate your data!

In [None]:
# Print the column names, non-null counts and dtypes.
data.info()
# Peek at the first 10 rows (rendered as the cell output).
data.head(n=10)
# Tip: data.describe() gives summary statistics of the numeric columns.


For the further interpretation, it is better we change the categorical variables.

In [None]:
# Replace the numeric codes of the categorical columns with readable labels.
#
# This cell used to assign through chained indexing (data[col][mask] = label)
# and switched off pandas' chained-assignment warning. With Copy-on-Write
# enabled that pattern writes to a temporary object and leaves `data`
# untouched, so the recoding is now done with one mapping per column and
# Series.replace, and the warning no longer needs to be silenced.
#
# NOTE(review): the code ranges below (cp 1-4, slope 1-3, thal 1-3) are kept
# exactly as they were; confirm them against data[col].value_counts(), since
# the attribute list in this notebook describes thal as 3/6/7.
# NOTE(review): the fbs labels say "mg/ml" while the attribute list says
# "mg/dl"; the label text is left unchanged in case later cells match on it.
CATEGORY_LABELS = {
    'sex': {0: 'female', 1: 'male'},
    'cp': {
        1: 'typical angina',
        2: 'atypical angina',
        3: 'non-anginal pain',
        4: 'asymptomatic',
    },
    'fbs': {0: 'lower than 120mg/ml', 1: 'greater than 120mg/ml'},
    'restecg': {
        0: 'normal',
        1: 'ST-T wave abnormality',
        2: 'left ventricular hypertrophy',
    },
    'exang': {0: 'no', 1: 'yes'},
    'slope': {1: 'upsloping', 2: 'flat', 3: 'downsloping'},
    'thal': {1: 'normal', 2: 'fixed defect', 3: 'reversable defect'},
}

for column, labels in CATEGORY_LABELS.items():
    # Codes that have no entry in `labels` are left as they are, which is
    # what the element-wise assignments did as well.
    data[column] = data[column].replace(labels)

data['sex'] = data['sex'].astype('object') #we'll need as object data type
data.head(10)

It already looks better!

Let's move to investigate with some visualization.

In [None]:
# Two histograms side by side: age on the left panel, cholesterol on the right.
fig, axes = plt.subplots(1, 2)
age_panel, chol_panel = axes
data['age'].plot.hist(bins=100, color='b', ax=age_panel, title='Age')
data['chol'].plot.hist(bins=100, color='b', ax=chol_panel, title='Cholestrol')


In [None]:
# Scatter plot of cholesterol against age, one marker style per sex.
# Relies on the 'sex' column holding the string labels 'male' / 'female'.

a = data['sex'] == 'male'    # boolean mask selecting the male rows
b = data['sex'] == 'female'  # boolean mask selecting the female rows

plt.scatter(data.loc[a, 'age'], data.loc[a, 'chol'], c='b', label='Male')
# The y-values must be 'chol': this call used to pass 'age' for both axes,
# which drew the female points on a diagonal instead of showing cholesterol.
plt.scatter(data.loc[b, 'age'], data.loc[b, 'chol'], c='r', marker='^', label='female')
plt.xlabel('Age')
plt.ylabel('Cholestrol')
plt.title('Age vs Cholestrol')
plt.legend(loc='upper left')
plt.show()

In [None]:
# Line plot of the age and cholesterol columns in row order.
# Series.plot draws each column against the DataFrame index, so the x-axis is
# the row index (not age) and the y-axis carries the values of both series;
# the axis labels below say so instead of 'Age' / 'Cholestrol'.
data.age.plot(kind='line', color='b', label='Age', linewidth=1, alpha=.9, grid=True, linestyle=':')
data.chol.plot(color='r', label='Cholestrol', linewidth=1, alpha=.9, grid=True, linestyle='-')
plt.legend(loc='upper right')
plt.xlabel('Row index')
plt.ylabel('Value')
plt.title('Age and Cholestrol')
plt.show()

# 3 - Correlation is a good indicator of relatlions between variables

Especially, heart diseases are related with age and sex.

In [None]:
# Heatmap of the pairwise correlations between the numeric columns.
# The categorical columns were recoded to text labels earlier in the notebook,
# and DataFrame.corr() raises on non-numeric columns in pandas >= 2.0, so the
# frame is restricted to numeric dtypes before computing the correlations.
f, ax = plt.subplots(figsize=(10, 10))
numeric_columns = data.select_dtypes(include='number')
# Draw on the axes created above rather than relying on the current-axes state.
sns.heatmap(numeric_columns.corr(), annot=True, linewidths=.5, fmt='.1f', ax=ax)


# 4 - Filtering

In [None]:
# Select the rows whose cholesterol value is above 200.

highchl = data['chol'].gt(200)  # boolean mask, True where chol > 200
data.loc[highchl]

What about the males and females?


In [None]:
# First rows with cholesterol above 250, shown separately for each sex.
# A notebook cell renders only its last expression, so the two stand-alone
# .head() calls used here before displayed the female rows only. Stacking both
# selections into one frame (keyed by sex) makes the male rows visible too.
above_250 = data['chol'] > 250
pd.concat(
    [
        data[above_250 & (data['sex'] == 'male')].head(),
        data[above_250 & (data['sex'] == 'female')].head(),
    ],
    keys=['male', 'female'],
)


In [None]:
# While loop that prints the cholesterol value of the first rows, one per line.

i=0
# Never run past the end of the frame: a fixed bound of 150 raises as soon as
# the data has fewer rows than that.
row_limit = min(150, len(data))
while i<row_limit:
    # .iloc reads by position, so the lookup does not depend on index labels.
    print('chol is:',data['chol'].iloc[i])
    i=i+1
# NOTE(review): this prints the loop counter, not a cholesterol reading, and
# the loop above never tests for a low value - confirm the intended message.
print(i,'is low cholestrol good!')

In [None]:
# For loop over the first three rows of the 'chol' column: prints each row's
# index label followed by the row itself (iterrows yields it as a Series).

first_rows = data[['chol']].iloc[0:3]
for row_label, row in first_rows.iterrows():
    print(row_label, ':', row)