In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
from plotly import tools
from plotly.offline import init_notebook_mode, plot, iplot
import plotly.graph_objs as go
import plotly.figure_factory as ff

In [2]:
# Set up plotly's offline notebook mode before any iplot(...) call below.
# NOTE(review): connected=True presumably loads plotly.js from a CDN instead of
# embedding it in the notebook — confirm against the plotly.offline docs.
init_notebook_mode(connected=True)

# Data Analytics Assignment 1<br>
<br>
<br>


<center>Anina Zimmer</center>
<center>Anna Ruby</center>
<center>Shun-Lung Chang</center>

<br>
<br>
<br>
<br>
<br>
<br>
<center>Date: 20.09.2017</center>

In [3]:
# Column labels applied to every file via `names=`; a 'source' column is added
# afterwards so each row stays traceable once the frames are combined.
column_names = ["age", "sex", "cp", "trestbps", "chol", "fbs", "restecg", 
                "thalach", "exang", "oldpeak", "slope", "ca", "thal", "num"]

# All four files live in the same UCI repository directory.
base_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/"

cleveland = pd.read_csv(base_url + "processed.cleveland.data", names=column_names)
cleveland['source'] = 'cleveland'

# The Hungarian file is space-separated, unlike the other three.
hungarian = pd.read_csv(base_url + "reprocessed.hungarian.data", names=column_names, sep=' ')
# Drop the final row (presumably a malformed trailing record — TODO confirm).
# .copy() makes the result an independent frame, so the column assignment
# below does not write to a slice and trigger SettingWithCopyWarning.
hungarian = hungarian.iloc[:-1].copy()
hungarian['source'] = 'hungarian'

switzerland = pd.read_csv(base_url + "processed.switzerland.data", names=column_names)
switzerland['source'] = 'switzerland'

long_beach = pd.read_csv(base_url + "processed.va.data", names=column_names)
long_beach['source'] = 'long_beach'

In [4]:
# Stack the four per-source frames into one table with a fresh 0..n-1 index.
full_dat = pd.concat([cleveland, hungarian, switzerland, long_beach], ignore_index=True)
# Turn the '?' placeholder into a real missing value. np.nan is used because
# the np.NaN alias was removed in NumPy 2.0.
full_dat = full_dat.replace('?', np.nan)

We downloaded the [Heart Disease data set](http://archive.ics.uci.edu/ml/datasets/heart+Disease) using [Pandas](http://pandas.pydata.org/) from four sources: Cleveland, Hungary, Switzerland and Long Beach. After adding a `source` column, the combined data set contains 920 rows and 15 columns.

In [5]:
# Show the (rows, columns) dimensions of the combined frame.
full_dat.shape

(920, 15)

In [6]:
# Show the last five rows of the combined frame.
full_dat.tail(5)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num,source
915,54.0,0.0,4.0,127.0,333,1,1,154.0,0.0,0.0,,,,1.0,long_beach
916,62.0,1.0,1.0,,139,0,1,,,,,,,0.0,long_beach
917,55.0,1.0,4.0,122.0,223,1,1,100.0,0.0,0.0,,,6.0,2.0,long_beach
918,58.0,1.0,4.0,,385,1,2,,,,,,,0.0,long_beach
919,62.0,1.0,2.0,120.0,254,0,2,93.0,1.0,0.0,,,,1.0,long_beach


# Attributes

| Attribute | Description                                                                                                                                                                                                                                   | Type                   |
|:---------:|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------|
|    age    | Age in years                                                                                                                                                                                                                                  | Numerical (Discrete)   |
|    sex    | 1: male<br> 0: female                                                                                                                                                                                                                         | Category               |
|     cp    | Chest pain type<br> 1: typical angina<br> 2: atypical angina<br> 3: non-anginal pain<br> 4: asymptomatic                                                                                                                                      | Category or Ordinal              |
|  trestbps | Resting blood pressure (in mm Hg on admission to the hospital)                                                                                                                                                                                | Numerical (Continuous) |
|    chol   | Serum cholesterol in mg/dl                                                                                                                                                                                                                    | Numerical (Continuous) |
|    fbs    | Fasting blood sugar > 120 mg/dl<br> 1: true<br> 0: false                                                                                                                                                                                      | Category               |
|  restecg  | Resting electrocardiographic results<br> 0: normal<br> 1: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV)<br> 2: showing probable or definite left ventricular hypertrophy by Estes' criteria | Category               |
|  thalach  | Maximum heart rate achieved                                                                                                                                                                                                                   | Numerical (Continuous) |
|   exang   | Exercise induced angina<br> 1: yes<br> 0: no                                                                                                                                                                                                  | Category               |
|  oldpeak  | ST depression induced by exercise relative to rest                                                                                                                                                                                            | Numerical (Continuous) |
|   slope   | The slope of the peak exercise ST segment<br> 1: upsloping<br> 2: flat<br> 3: downsloping<br>                                                                                                                                                 | Ordinal               |
|     ca    | Number of major vessels (0-3) colored by fluoroscopy                                                                                                                                                                                          | Numerical (Discrete)   |
|    thal   | 3: normal<br> 6: fixed defect<br> 7: reversible defect                                                                                                                                                                                        | Category               |
|    num    | Diagnosis of heart disease (angiographic disease status)                                                                                                                                                                                      | Category or Ordinal              |
|    source    | Source of the data set                                                                                                                                                                                      | Category               |

# Choosing Variables
- We chose the following variables for our analysis.
    1. Source (Categorical)
    2. Age (Numerical Discrete)
    3. Thalach (Numerical Continuous)
    4. Slope (Categorical or Ordinal)
    5. Num (Categorical or Ordinal)
- Visual Encoding Strategies.
    1. Source (Bar Chart)
    2. Num (Bar Chart)
    3. Source vs. Num (Heatmap)
    4. Age vs. Num (Histogram)
    5. Heart Rate vs. Num (Box Plot)
    6. Slope vs. Num (Barplot)

# Count from Each Source

In [6]:
# Number of records contributed by each source.
# The Series method is used because the top-level pd.value_counts helper is
# deprecated in recent pandas releases; the result is the same.
source_counts = full_dat['source'].value_counts()
source_counts

cleveland      303
hungarian      294
long_beach     200
switzerland    123
Name: source, dtype: int64

In [7]:
# Bar chart of record counts per source: one bar per entry in source_counts.
trace = go.Bar(x=source_counts.index.values, y=source_counts.values)
data = [trace]

# Fixed-size square canvas (autosize disabled).
layout_options = dict(
    title='Count of Each Source',
    autosize=False,
    width=600,
    height=600,
)
layout = go.Layout(**layout_options)

fig = go.Figure(data=data, layout=layout)

In [8]:
# Display the source-count bar chart built in the previous cell.
iplot(fig)

## Observations
- The majority of the data comes from the U.S. (Cleveland and Long Beach), so the data set may be biased by higher rates of obesity and heart disease.

# Counts of Each Type of Heart Disease

In [9]:
# Collapse the five-level diagnosis code into a two-level label:
# 0 -> 'Not diseased', 1-4 -> 'Diseased'.
disease_labels = {0: 'Not diseased'}
disease_labels.update(dict.fromkeys([1, 2, 3, 4], 'Diseased'))
full_dat['diseased'] = full_dat['num'].map(disease_labels)

We created a new variable `diseased` that labels the values 1, 2, 3 and 4 of the variable `num` as 'Diseased' and the value 0 as 'Not diseased'.

In [10]:
# One-column frame ('Numbers') counting rows per 'diseased' label.
# Series.value_counts replaces the deprecated top-level pd.value_counts helper;
# to_frame('Numbers') names the column in the same step.
diseased_counts = full_dat['diseased'].value_counts().to_frame('Numbers')

In [11]:
# Combined figure: a table of the diseased / not-diseased counts plus a bar
# chart of the same numbers.
figure = ff.create_table(diseased_counts, index=True, height_constant=60)

trace = go.Bar(
    x = diseased_counts.index.values,
    y = diseased_counts['Numbers'].values,
    marker = dict(color='rgb(158,202,225)'),
    # Bind the bars to a second axis pair so they do not share axes with the table.
    xaxis='x2', yaxis='y2'
)

# NOTE(review): this in-place extend relies on figure['data'] being a mutable
# list; newer plotly releases may expose it as a tuple and deprecate go.Data.
# If the plotly version is upgraded, confirm whether figure.add_trace(trace)
# is needed here instead.
figure['data'].extend(go.Data([trace]))

# Vertical split of the canvas: first y-axis gets [0, .2], second gets [.3, 1].
figure.layout.yaxis.update({'domain': [0, .2]})
figure.layout.yaxis2.update({'domain': [.3, 1], 'title': 'count'})
figure.layout.xaxis2.update({'anchor': 'y2'})

# Update the margins to add a title and see graph x-labels. 
figure.layout.margin.update({'t':40, 'l':50})

# Descriptive title (replaces the 'TBD' placeholder) and a fixed canvas size.
figure.layout.update({
    'title': 'Diseased vs. Not Diseased Counts',
    'autosize': False,
    'width': 600,
    'height': 600,
})

In [12]:
# Display the diseased / not-diseased table-and-bar figure built in the previous cell.
iplot(figure)

In [13]:
# Keep only the diseased rows and label each diagnosis level as a 'TypeN' string.
# .loc + .assign builds a new frame, so no column is written onto a filtered
# slice of full_dat (which triggers SettingWithCopyWarning).
diseased = (
    full_dat.loc[full_dat['diseased'] == 'Diseased', ['num', 'diseased']]
    .assign(type=lambda rows: rows['num'].map({1: 'Type1', 2: 'Type2', 3: 'Type3', 4: 'Type4'}))
)

# Count rows per type, as a one-column frame ('Numbers') ordered Type1..Type4.
# Series.value_counts replaces the deprecated top-level pd.value_counts helper.
diseased_type_counts = diseased['type'].value_counts().to_frame('Numbers').sort_index()

In [14]:
# Combined figure: a table of the per-type counts plus a bar chart of the same
# numbers.
figure = ff.create_table(diseased_type_counts, index=True)

trace = go.Bar(
    x = diseased_type_counts.index.values,
    y = diseased_type_counts['Numbers'].values,
    marker = dict(color='rgb(158,202,225)'),
    # Bind the bars to a second axis pair so they do not share axes with the table.
    xaxis='x2', yaxis='y2'
)

# NOTE(review): this in-place extend relies on figure['data'] being a mutable
# list; newer plotly releases may expose it as a tuple and deprecate go.Data.
# If the plotly version is upgraded, confirm whether figure.add_trace(trace)
# is needed here instead.
figure['data'].extend(go.Data([trace]))

# Vertical split of the canvas: first y-axis gets [0, .2], second gets [.3, 1].
figure.layout.yaxis.update({'domain': [0, .2]})
figure.layout.yaxis2.update({'domain': [.3, 1], 'title': 'count'})
figure.layout.xaxis2.update({'anchor': 'y2'})

# Update the margins to add a title and see graph x-labels. 
figure.layout.margin.update({'t':30, 'l':50})

# Descriptive title (replaces the 'TBD' placeholder) and a fixed canvas size.
figure.layout.update({
    'title': 'Counts of Each Heart Disease Type',
    'autosize': False,
    'width': 600,
    'height': 600,
})

In [15]:
# Display the per-type table-and-bar figure built in the previous cell.
iplot(figure)

## Observations
- The majority of people are diseased rather than not, and the most common disease is type 1. 

# The Relationship between Source and Heart Disease

In [16]:
# Row count for every (num, source) pair, as a Series with a two-level index.
group_keys = ['num', 'source']
sh_count = full_dat.groupby(group_keys).size()

In [17]:
# Heatmap of record counts: one row per diagnosis level, one column per source.
#
# The counts are pivoted into a fixed 5 x 4 grid first. Taking the raw group
# values per diagnosis level would yield a shorter row whenever a
# (num, source) pair has no records, silently shifting cells under the wrong
# source label; the pivot fills such pairs with 0 instead.
num_levels = [0, 1, 2, 3, 4]
source_order = ['cleveland', 'hungarian', 'long_beach', 'switzerland']
sh_table = (
    sh_count.unstack('source', fill_value=0)
    .reindex(index=num_levels, columns=source_order, fill_value=0)
)

trace = go.Heatmap(z=sh_table.values.tolist(),
                   y=['Not diseased', 'Type 1', 'Type 2', 'Type 3', 'Type 4'],
                   x=['cleveland', 'hungarian', 'long beach', 'switzerland'])

data=[trace]

layout = go.Layout(
    # Descriptive title replaces the 'TBD' placeholder.
    title='Heart Disease Diagnosis by Source',
    margin={'t':50, 'l':100},
    autosize=False,
    width=800,
    height=600
)

figure = go.Figure(data=data, layout=layout)

In [18]:
# Display the diagnosis-by-source heatmap built in the previous cell.
iplot(figure)

## Observations

# The Relationship between Age and Heart Disease

In [19]:
# Split the age column by disease status for the distribution plot.
is_diseased = full_dat['diseased'] == 'Diseased'
is_not_diseased = full_dat['diseased'] == 'Not diseased'

age_diseased = full_dat.loc[is_diseased, 'age']
age_not_diseased = full_dat.loc[is_not_diseased, 'age']

In [20]:
# Distribution plot of age, one series per disease status.
# The two lists are positional: group_labels[i] names hist_data[i].
hist_data = [age_diseased, age_not_diseased]
group_labels = ['Diseased', 'Not diseased']

figure = ff.create_distplot(hist_data, group_labels)

# Descriptive title (replaces the 'TBD' placeholder) and a fixed canvas size.
figure.layout.update({
    'title': 'Age Distribution by Disease Status',
    'autosize': False,
    'width': 800,
    'height': 600,
})

In [21]:
# Display the age distribution plot built in the previous cell.
iplot(figure)

## Observations
- Clearly, people who are diseased are generally older.
- The peak age of diseased people is 58-59.

# The Relationship between Heart Rate and Heart Disease

In [22]:
# Box plot of maximum heart rate (thalach), one box per diagnosis level.
# The five traces differ only in the `num` value and the display name, so they
# are generated from one (level, label) table instead of five copied blocks.
diagnosis_labels = [
    (0, 'Not diseased'),
    (1, 'Type 1'),
    (2, 'Type 2'),
    (3, 'Type 3'),
    (4, 'Type 4'),
]

data = [
    go.Box(y=full_dat[full_dat['num'] == level]['thalach'], name=label)
    for level, label in diagnosis_labels
]

layout = go.Layout(
    yaxis=dict(
        title='Maximum Heart Rate Achieved',
        zeroline=False
    ),
    # Descriptive title replaces the 'TBD' placeholder.
    title='Maximum Heart Rate by Diagnosis',
    boxmode='group',
    autosize=False,
    width=800,
    height=600
)

figure = go.Figure(data=data, layout=layout)

In [23]:
# Display the heart-rate box plot built in the previous cell.
iplot(figure)

## Observations
- The outlier -9 should be discarded because it is an invalid response.
- The median heart rates of diseased people are notably lower than that of non-diseased people. In addition, the median heart rate decreases as the disease type worsens, except for type 4.

# The Relationship between Slope (Peak Exercise ST Segment) and Heart Disease

In [24]:
# Rows with a recorded slope, with slope cast to int so the group keys are
# plain integers. .loc + .assign builds a new frame, so no column is written
# onto a filtered slice of full_dat (which triggers SettingWithCopyWarning).
non_na_dat = (
    full_dat.loc[full_dat['slope'].notnull(), ['slope', 'num']]
    .assign(slope=lambda rows: rows['slope'].astype(int))
)

# Row count for every (slope, num) pair.
diseased_slope_counts = non_na_dat.groupby(['slope', 'num']).size()

In [25]:
# Grouped bar chart: for each slope code, the number of records per diagnosis.
#
# The counts are pivoted into a fixed grid (rows = slope code, columns =
# diagnosis level 0-4) first. Taking the raw group values per slope would yield
# fewer than five numbers whenever a (slope, num) pair has no records, so bars
# would be drawn under the wrong diagnosis label; the pivot fills such pairs
# with 0 instead.
diagnosis_names = ['Not diseased', 'Type 1', 'Type 2', 'Type 3', 'Type 4']
slope_codes = [1, 2, 3, -9]   # -9 is kept as its own series, as in the observations below
slope_table = (
    diseased_slope_counts.unstack('num', fill_value=0)
    .reindex(index=slope_codes, columns=[0, 1, 2, 3, 4], fill_value=0)
)

# One trace per slope code, generated from the table instead of four copied blocks.
data = [
    go.Bar(
        x=diagnosis_names,
        y=slope_table.loc[code].values,
        name='Slope {}'.format(code)
    )
    for code in slope_codes
]

layout = go.Layout(
    # Descriptive title replaces the 'TBD' placeholder.
    title='Slope of Peak Exercise ST Segment by Diagnosis',
    barmode='group',
    autosize=False,
    width=800,
    height=600
)

figure = go.Figure(data=data, layout=layout)

In [26]:
# Display the slope-by-diagnosis bar chart built in the previous cell.
iplot(figure)

## Observations
- For healthy people, an upward slope is the most common type, but for diseased people, of any diagnosis, a flat slope is more common. 
- Having a downward slope does not indicate heart disease because the count is approximately equal across all diseased types.
- The -9 field should be ignored. 

# Conclusions

- We chose to program in Python and Plotly to create our graphs. This allowed for flexibility and interactive plots.
- We learned when to use different types of graphs to individualize observations, while adding color and shape contrasts to highlight specific categorical and ordinal features.