# Drawing Conclusions Using Groupby

This notebook uses `winequality_edited.csv`, the data file you created in the previous section, *Appending Data (cont.)*.

In [1]:
# Load dataset
import pandas as pd
import numpy as np
%matplotlib inline

# Read the wine-quality table saved by the previous section
df = pd.read_csv(filepath_or_buffer='winequality_edited.csv')

### Is a certain type of wine associated with higher quality?

In [2]:
# Average quality score per wine color (red vs. white)
df.groupby('color')['quality'].mean()

color
red      5.636023
white    5.877909
Name: quality, dtype: float64

### What level of acidity receives the highest average rating?

In [3]:
# Summary statistics for pH; the min, quartiles and max serve as bin edges below
val = df['pH'].describe()
val

count    6497.000000
mean        3.218501
std         0.160787
min         2.720000
25%         3.110000
50%         3.210000
75%         3.320000
max         4.010000
Name: pH, dtype: float64

In [4]:
# Bin edges used to "cut" pH into four groups: min, 25%, 50%, 75%, max.
# Look the statistics up by label: integer keys on describe()'s string-labelled
# index rely on the deprecated positional fallback of Series.__getitem__.
bin_edges = val.loc[['min', '25%', '50%', '75%', 'max']].tolist()

In [5]:
# One label per pH bin, ordered from the lowest-pH bin to the highest-pH bin
bin_names = [
    'Very High Acid',
    'High Acid',
    'Low Acid',
    'Very Low Acid',
]

In [6]:
# Create the acidity_levels column by binning pH at the edges in bin_edges.
# include_lowest=True closes the first bin on the left; pd.cut's bins are
# otherwise right-closed only, so rows whose pH equals the lowest edge
# would fall outside every bin and be labelled NaN.
df['acidity_levels'] = pd.cut(
    df['pH'],
    bins=bin_edges,
    labels=bin_names,
    include_lowest=True,
)

# Check that the column was created
df.head()

Unnamed: 0,alcohol,chlorides,citric_acid,color,density,fixed_acidity,free_sulfur_dioxide,pH,quality,residual_sugar,sulphates,total_sulfur_dioxide,volatile_acidity,acidity_levels
0,9.4,0.076,0.0,red,0.9978,7.4,11.0,3.51,5,1.9,0.56,34.0,0.7,Very Low Acid
1,9.8,0.098,0.0,red,0.9968,7.8,25.0,3.2,5,2.6,0.68,67.0,0.88,High Acid
2,9.8,0.092,0.04,red,0.997,7.8,15.0,3.26,5,2.3,0.65,54.0,0.76,Low Acid
3,9.8,0.075,0.56,red,0.998,11.2,17.0,3.16,6,1.9,0.58,60.0,0.28,High Acid
4,9.4,0.076,0.0,red,0.9978,7.4,11.0,3.51,5,1.9,0.56,34.0,0.7,Very Low Acid


In [7]:
# Mean quality per acidity level. acidity_levels is categorical (built by
# pd.cut), so pass `observed` explicitly: newer pandas releases warn when it
# is left implicit for categorical groupers. observed=False keeps every
# labelled level in the result.
df.groupby('acidity_levels', observed=False)['quality'].mean()

acidity_levels
Very High Acid    5.783343
High Acid         5.784540
Low Acid          5.850832
Very Low Acid     5.859593
Name: quality, dtype: float64

In [8]:
# Persist the frame for the next section, without writing the row index.
# Note: this writes to the same path the notebook loaded from.
df.to_csv(path_or_buf='winequality_edited.csv', index=False)