# Data Cleaning and Analysis of Various Strains of Coffee using Python

A project for course "Data Exploration using Python" (STA 4243) at the University of Texas at San Antonio (UTSA)

Participants: Robert Hall, Max Moran, Ryan Berberek, Dulce Ximena Cid Sanabria

### Table of Contents:

1. Data and Library Importation
2. Exploratory Data Analysis & Cleaning

In [1]:
import pandas as pd
coffee = pd.read_csv('coffee_ratings.csv')

In [2]:
coffee.head()

Unnamed: 0,total_cup_points,species,owner,country_of_origin,farm_name,lot_number,mill,ico_number,company,altitude,...,color,category_two_defects,expiration,certification_body,certification_address,certification_contact,unit_of_measurement,altitude_low_meters,altitude_high_meters,altitude_mean_meters
0,90.58,Arabica,metad plc,Ethiopia,metad plc,,metad plc,2014/2015,metad agricultural developmet plc,1950-2200,...,Green,0,"April 3rd, 2016",METAD Agricultural Development plc,309fcf77415a3661ae83e027f7e5f05dad786e44,19fef5a731de2db57d16da10287413f5f99bc2dd,m,1950.0,2200.0,2075.0
1,89.92,Arabica,metad plc,Ethiopia,metad plc,,metad plc,2014/2015,metad agricultural developmet plc,1950-2200,...,Green,1,"April 3rd, 2016",METAD Agricultural Development plc,309fcf77415a3661ae83e027f7e5f05dad786e44,19fef5a731de2db57d16da10287413f5f99bc2dd,m,1950.0,2200.0,2075.0
2,89.75,Arabica,grounds for health admin,Guatemala,"san marcos barrancas ""san cristobal cuch",,,,,1600 - 1800 m,...,,0,"May 31st, 2011",Specialty Coffee Association,36d0d00a3724338ba7937c52a378d085f2172daa,0878a7d4b9d35ddbf0fe2ce69a2062cceb45a660,m,1600.0,1800.0,1700.0
3,89.0,Arabica,yidnekachew dabessa,Ethiopia,yidnekachew dabessa coffee plantation,,wolensu,,yidnekachew debessa coffee plantation,1800-2200,...,Green,2,"March 25th, 2016",METAD Agricultural Development plc,309fcf77415a3661ae83e027f7e5f05dad786e44,19fef5a731de2db57d16da10287413f5f99bc2dd,m,1800.0,2200.0,2000.0
4,88.83,Arabica,metad plc,Ethiopia,metad plc,,metad plc,2014/2015,metad agricultural developmet plc,1950-2200,...,Green,2,"April 3rd, 2016",METAD Agricultural Development plc,309fcf77415a3661ae83e027f7e5f05dad786e44,19fef5a731de2db57d16da10287413f5f99bc2dd,m,1950.0,2200.0,2075.0


In [3]:
coffee.dtypes

total_cup_points         float64
species                   object
owner                     object
country_of_origin         object
farm_name                 object
lot_number                object
mill                      object
ico_number                object
company                   object
altitude                  object
region                    object
producer                  object
number_of_bags             int64
bag_weight                object
in_country_partner        object
harvest_year              object
grading_date              object
owner_1                   object
variety                   object
processing_method         object
aroma                    float64
flavor                   float64
aftertaste               float64
acidity                  float64
body                     float64
balance                  float64
uniformity               float64
clean_cup                float64
sweetness                float64
cupper_points            float64
moisture  

In [4]:
coffee.columns

Index(['total_cup_points', 'species', 'owner', 'country_of_origin',
       'farm_name', 'lot_number', 'mill', 'ico_number', 'company', 'altitude',
       'region', 'producer', 'number_of_bags', 'bag_weight',
       'in_country_partner', 'harvest_year', 'grading_date', 'owner_1',
       'variety', 'processing_method', 'aroma', 'flavor', 'aftertaste',
       'acidity', 'body', 'balance', 'uniformity', 'clean_cup', 'sweetness',
       'cupper_points', 'moisture', 'category_one_defects', 'quakers', 'color',
       'category_two_defects', 'expiration', 'certification_body',
       'certification_address', 'certification_contact', 'unit_of_measurement',
       'altitude_low_meters', 'altitude_high_meters', 'altitude_mean_meters'],
      dtype='object')

In [5]:
for i in coffee.columns:
    if pd.api.types.is_numeric_dtype(coffee[str(i)]):
        print(f"Column {str(i)} Maximum: {coffee[str(i)].max()}")
        print(f"Column {str(i)} Minimum: {coffee[str(i)].min()}")
        print('\n')
    else:
        continue


Column total_cup_points Maximum: 90.58
Column total_cup_points Minimum: 0.0


Column number_of_bags Maximum: 1062
Column number_of_bags Minimum: 0


Column aroma Maximum: 8.75
Column aroma Minimum: 0.0


Column flavor Maximum: 8.83
Column flavor Minimum: 0.0


Column aftertaste Maximum: 8.67
Column aftertaste Minimum: 0.0


Column acidity Maximum: 8.75
Column acidity Minimum: 0.0


Column body Maximum: 8.58
Column body Minimum: 0.0


Column balance Maximum: 8.75
Column balance Minimum: 0.0


Column uniformity Maximum: 10.0
Column uniformity Minimum: 0.0


Column clean_cup Maximum: 10.0
Column clean_cup Minimum: 0.0


Column sweetness Maximum: 10.0
Column sweetness Minimum: 0.0


Column cupper_points Maximum: 10.0
Column cupper_points Minimum: 0.0


Column moisture Maximum: 0.28
Column moisture Minimum: 0.0


Column category_one_defects Maximum: 63
Column category_one_defects Minimum: 0


Column quakers Maximum: 11.0
Column quakers Minimum: 0.0


Column category_two_defects Maximum: 55


In [6]:
print(coffee['species'].value_counts())

species
Arabica    1311
Robusta      28
Name: count, dtype: int64


## Quality

### Features:

- Aftertaste 
- Aroma 
- Acidity 
- Body 
- Balance 
- Clean Cup 
- Uniformity 
- Sweetness
- Moisture

### Questions:

* What palate-related variable has the highest correlation with score?
* Are there statistically significant diferences in taste quantifications with respect to diferent countries of origin?
* Is there a correlation between altitude and certain taste quantifications?

In [8]:
coffee.columns

Index(['total_cup_points', 'species', 'owner', 'country_of_origin',
       'farm_name', 'lot_number', 'mill', 'ico_number', 'company', 'altitude',
       'region', 'producer', 'number_of_bags', 'bag_weight',
       'in_country_partner', 'harvest_year', 'grading_date', 'owner_1',
       'variety', 'processing_method', 'aroma', 'flavor', 'aftertaste',
       'acidity', 'body', 'balance', 'uniformity', 'clean_cup', 'sweetness',
       'cupper_points', 'moisture', 'category_one_defects', 'quakers', 'color',
       'category_two_defects', 'expiration', 'certification_body',
       'certification_address', 'certification_contact', 'unit_of_measurement',
       'altitude_low_meters', 'altitude_high_meters', 'altitude_mean_meters'],
      dtype='object')

In [11]:
coffee_salient = coffee[['total_cup_points', 'aftertaste', 'aroma', 'acidity', 'body', 'balance', 'clean_cup', 'uniformity', 'sweetness', 'moisture']]

In [12]:
coffee_salient.head()

Unnamed: 0,total_cup_points,aftertaste,aroma,acidity,body,balance,clean_cup,uniformity,sweetness,moisture
0,90.58,8.67,8.67,8.75,8.5,8.42,10.0,10.0,10.0,0.12
1,89.92,8.5,8.75,8.58,8.42,8.42,10.0,10.0,10.0,0.12
2,89.75,8.42,8.42,8.42,8.33,8.42,10.0,10.0,10.0,0.0
3,89.0,8.42,8.17,8.42,8.5,8.25,10.0,10.0,10.0,0.11
4,88.83,8.25,8.25,8.5,8.42,8.33,10.0,10.0,10.0,0.12


In [16]:
coffee_salient.isnull().sum()

total_cup_points    0
aftertaste          0
aroma               0
acidity             0
body                0
balance             0
clean_cup           0
uniformity          0
sweetness           0
moisture            0
dtype: int64

In [17]:
features = coffee_salient[['aftertaste', 'aroma', 'acidity', 'body', 'balance', 'clean_cup', 'uniformity', 'sweetness', 'moisture']]
labels = coffee_salient[['total_cup_points']]

In [21]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(features, labels, train_size=0.8, test_size=0.2)

In [22]:
from sklearn.linear_model import LinearRegression
mlr = LinearRegression()
model = mlr.fit(x_train, y_train)
points_pred = model.predict(x_test)

In [23]:
print(features.columns)


Index(['aftertaste', 'aroma', 'acidity', 'body', 'balance', 'clean_cup',
       'uniformity', 'sweetness', 'moisture'],
      dtype='object')


In [25]:
print(model.coef_)

[[ 1.96699286  1.37423496  1.3754272   1.04205129  1.27401336  1.05337295
   1.00893026  0.94918875 -0.0555578 ]]


In [33]:
coefs = []
for subset in model.coef_:
    for coef in subset:
        coefs.append(round(coef, 4))

print(coefs)

cols = [col for col in features.columns]
print(cols)

feature_coefs = pd.DataFrame({'features': cols,
                              'coefficients': coefs})

feature_coefs.sort_values('coefficients', ascending=False)
feature_coefs

[1.967, 1.3742, 1.3754, 1.0421, 1.274, 1.0534, 1.0089, 0.9492, -0.0556]
['aftertaste', 'aroma', 'acidity', 'body', 'balance', 'clean_cup', 'uniformity', 'sweetness', 'moisture']


Unnamed: 0,features,coefficients
0,aftertaste,1.967
1,aroma,1.3742
2,acidity,1.3754
3,body,1.0421
4,balance,1.274
5,clean_cup,1.0534
6,uniformity,1.0089
7,sweetness,0.9492
8,moisture,-0.0556
