# Exploring

- density distribution

<ul>
<li><a href="#hist">Histogram</a>
    <ul><li>shows distribution of values</li></ul>
</li>
<li><a href="#counts">Value Counts</a></li>
<li><a href="#scatter">Scatter Plot</a>
    <ul><li>correlation between features</li></ul>
</li>
<li><a href="#scattermatrix">Scatter Matrix</a></li>
<li><a href="#corrmatrix">Correlation Matrix</a></li>
<li><a href="#box">Box Plot</a></li>
<li><a href="#density">Density Distribution</a></li>
<li><a href="#groupby">Group by</a></li>
<li><a href="#query">Query</a></li>
</ul>

In [None]:
import pandas as pd
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

# import
df = pd.read_csv('data.csv')

<a id='hist'></a>
## Histogram

In [None]:
# Histogram grid for the DataFrame's columns on a single 8x8-inch figure.
df.hist(figsize=(8, 8));

In [None]:
# Histogram of a single column, using the Series.hist() shortcut.
df.age.hist();

In [None]:
# Same column drawn through the generic plotting accessor instead of Series.hist().
df['age'].plot.hist();

<a id='counts'></a>
## Value Counts

In [None]:
# Number of rows per distinct age value.
df.loc[:, 'age'].value_counts()

In [None]:
# Bar chart of the per-value counts of the age column.
df['age'].value_counts().plot.bar();

In [None]:
# Pie chart of the same counts, drawn on an 8x8-inch figure.
df['age'].value_counts().plot.pie(figsize=(8, 8));

<a id='scatter'></a>
## Scatter Plot

In [None]:
# Scatter plot of education (y) against age (x).
df.plot.scatter(x='age', y='education');

<a id='scattermatrix'></a>
## Scatter Matrix

In [None]:
# Pairwise scatter plots of the DataFrame's columns on a 15x15-inch figure.
# `figsize` must be passed as a keyword argument; written as `figsize(15,15)` it is a call
# to an undefined name and raises NameError before anything is plotted.
pd.plotting.scatter_matrix(df, figsize=(15, 15));

<a id='corrmatrix'></a>
## Correlation Matrix

In [None]:
# `df_reduced` was referenced here without being defined anywhere in this notebook.
# Correlation is only defined for numeric data, so it is derived from the numeric
# columns of `df`. NOTE(review): confirm this matches the column subset originally intended.
df_reduced = df.select_dtypes(include='number')
correlation = df_reduced.corr()

# Create the figure explicitly and hand its Axes to seaborn so the heatmap is drawn on
# this 10x10-inch figure and does not rely on matplotlib's implicit "current axes".
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(correlation, annot=True, square=True, cbar_kws={"shrink": 0.82}, ax=ax);

<a id='box'></a>
## Box Plot

In [None]:
# Box plot of the education column.
df['education'].plot.box();

<a id='density'></a>
## Density Distribution

In [None]:
# Density estimate per column, one subplot each, with independent x-axes.
# The original cell used `df_reduced`, which is never defined in this notebook; the numeric
# columns of `df` are selected inline so this cell does not depend on any other cell.
# NOTE(review): confirm this matches the column subset originally intended.
df.select_dtypes(include='number').plot(
    kind='density', subplots=True, sharex=False, figsize=(10, 10)
);
# Increase the vertical spacing between the stacked subplots.
plt.subplots_adjust(hspace=0.5)

<a id='groupby'></a>
## Group by

In [None]:
# Mean of every numeric column per education value.
# numeric_only=True skips non-numeric columns, which would otherwise raise TypeError on pandas >= 2.0.
df.groupby('education').mean(numeric_only=True)

In [None]:
# Mean of every numeric column per (education, age) pair; the group keys form the result's index.
# numeric_only=True skips non-numeric columns, which would otherwise raise TypeError on pandas >= 2.0.
df.groupby(['education', 'age']).mean(numeric_only=True)

In [None]:
# Same aggregation, but as_index=False keeps education and age as ordinary columns
# instead of moving them into the index.
# numeric_only=True skips non-numeric columns, which would otherwise raise TypeError on pandas >= 2.0.
df.groupby(['education', 'age'], as_index=False).mean(numeric_only=True)

In [None]:
# Mean grade per (education, age) pair, with the group keys kept as regular columns.
(
    df
    .groupby(by=['education', 'age'], as_index=False)['grade']
    .mean()
)

In [None]:
# Bin edges used to "cut" pH into groups: five edges define four bins.
# NOTE(review): these look like the min, quartiles and max of df['pH'] -- confirm with df['pH'].describe().
bin_edges = [2.72, 3.11, 3.21, 3.32, 4.010]

# Labels for the four bins, assigned in ascending pH order.
# NOTE(review): a lower pH means a MORE acidic sample, so labelling the lowest-pH bin 'low'
# may be inverted for a column called acidity_levels -- confirm the intended meaning.
bin_names = ['low', 'medium', 'moderately high', 'high']

# Creates the acidity_levels column.
# pd.cut builds right-closed intervals, so by default the first bin is (2.72, 3.11] and a
# row whose pH equals the lowest edge would get NaN. include_lowest=True closes the first
# interval on the left so that row is assigned to the first bin.
df['acidity_levels'] = pd.cut(df['pH'], bin_edges, labels=bin_names, include_lowest=True)

# Checks for successful creation of this column
df.head()

In [None]:
# Mean quality for each acidity level.
# acidity_levels is categorical (created with pd.cut). `observed` is pinned explicitly because
# its default is deprecated and changing across pandas versions: False keeps every category in
# the result (NaN for empty ones) and avoids the FutureWarning on recent pandas.
df.groupby('acidity_levels', observed=False)['quality'].mean()

<a id='query'></a>
## Query

In [None]:
# Two equivalent ways to keep only the rows whose diagnosis is "M".
# 1) Boolean-mask indexing:
df_m = df.loc[df['diagnosis'] == 'M']
# 2) The same filter as a query string (this second assignment is what df_m holds afterwards):
df_m = df.query('diagnosis == "M"')

In [None]:
# Compute the split point from the data so it cannot drift from what the comments promise
# (the threshold was previously hard-coded as 3.0).
median_sugar = df['residual_sugar'].median()

# select samples with residual sugar less than the median
low_sugar = df.query('residual_sugar < @median_sugar')

# select samples with residual sugar greater than or equal to the median
high_sugar = df.query('residual_sugar >= @median_sugar')

# get mean quality rating for the low sugar group
# (average only the quality column: a whole-frame .mean() also averages every other column
# and raises TypeError on pandas >= 2.0 when any column is non-numeric)
low_sugar['quality'].mean()

In [None]:
# Mean quality rating for the high sugar group.
# Average only the quality column: a whole-frame .mean() also averages every other column
# and raises TypeError on pandas >= 2.0 when any column is non-numeric.
high_sugar['quality'].mean()