In [2]:
# preamble imports
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import statistics

# Assignment 1 - Descriptive Statistics

#### Patrick Pfenning
#### 01/11/23

Basic Python
---

In [3]:
"""Using the statistics module"""
# Sample data set; the following cells in this section all read `data`
data = [1, 2, 2, 3, 3, 3, 4, 4, 4, 4]
# Arithmetic mean: the ten values sum to 30, so the mean is 3
statistics.mean(data)

3

In [4]:
# Median: `data` has an even number of values, so statistics.median
# averages the two middle values (3 and 3), giving a float
statistics.median(data)

3.0

In [5]:
# Mode: the most frequent value (4 occurs four times in `data`)
statistics.mode(data)

4

In [6]:
# Summarize data
from dataclasses import dataclass, field

"""Python 3.10 has no describe function, so I made my own class to do the same."""

@dataclass
class DescribeResults:
    """Descriptive statistics for a sequence of numbers.

    Every summary field is computed once, in ``__post_init__``, from the
    ``data`` passed to the constructor. ``data`` is excluded from the repr so
    that displaying an instance shows only the computed statistics.

    Attributes:
        data: the observations to summarize.
        mean: arithmetic mean (``statistics.mean``).
        median: median (``statistics.median``).
        mode: most common value (``statistics.mode``).
        variance: sample variance (``statistics.variance``).
        stdev: sample standard deviation (``statistics.stdev``).
        minmax: ``(smallest, largest)`` value.
        sum: total of all values.

    Raises:
        statistics.StatisticsError: if ``data`` is empty, or has fewer than
            two values (the sample variance needs at least two).
    """
    data: list = field(repr=False, default_factory=list)
    # derived fields: not accepted by __init__, filled in by __post_init__
    mean: float = field(init=False)
    median: float = field(init=False)
    mode: float = field(init=False)
    variance: float = field(init=False)
    stdev: float = field(init=False)
    minmax: tuple = field(init=False)
    sum: float = field(init=False)

    def __post_init__(self):
        """Compute every summary field from ``self.data``."""
        # Always read the instance's own data. Referring to a bare `data`
        # here would silently pick up whatever global of that name exists
        # in the notebook instead of the values passed to the constructor.
        values = self.data
        self.mean = statistics.mean(values)
        self.median = statistics.median(values)
        self.mode = statistics.mode(values)
        self.variance = statistics.variance(values)
        self.stdev = statistics.stdev(values)
        self.minmax = (min(values), max(values))
        # `sum` on the right-hand side is the builtin: class-level field
        # names are not visible as bare names inside a method body.
        self.sum = sum(values)

# Summarize the sample data; the repr lists only the computed statistics
# because the `data` field is declared with repr=False
DescribeResults(data)

DescribeResults(mean=3, median=3.0, mode=4, variance=1.1111111111111112, stdev=1.0540925533894598, minmax=(1, 4), sum=30)

In [7]:
"""Using the numpy module"""
# Arithmetic mean via numpy; the result is a float (3.0), whereas
# statistics.mean returned the int 3 for the same data
np.mean(data)

3.0

In [8]:
# Median via numpy; agrees with statistics.median for this data
np.median(data)

3.0

In [9]:
# Standard deviation via numpy.
# NOTE: np.std defaults to ddof=0 (population formula, divide by n), which
# gives 1.0 here. statistics.stdev, used in DescribeResults, is the sample
# formula (divide by n - 1) and gave ~1.054 for the same data. Pass ddof=1
# to np.std to reproduce the sample value.
np.std(data)

1.0

Data Preparation
---

Before telling our story, we must first clean our data.

In [10]:
# load the wine data set and key each row by its unique_id
df = pd.read_csv('../data/wine.csv')
df = df.set_index('unique_id')
# preview the first five wines, one per column, so every feature fits on screen
df.iloc[:5].T

unique_id,593,617,782,990,822
class,1.0,1.0,1.0,1.0,1.0
alcohol_percentage,14.23,13.2,13.16,14.37,13.24
malic_acid,1.71,1.78,2.36,1.95,2.59
ash,2.43,2.14,2.67,2.5,2.87
alcalinity,15.6,11.2,18.6,16.8,21.0
magnesium,127.0,100.0,101.0,113.0,118.0
phenols,2.8,2.65,2.8,3.85,2.8
flavanoids,3.06,2.76,3.24,3.49,2.69
nonflavanoids,0.28,0.26,0.3,0.24,0.39
proanthocyanins,2.29,1.28,2.81,2.18,1.82


In [11]:
# z-score standardization: rescale each feature to mean 0 and sample std 1.
# This only shifts and scales the values; it does not change the shape of
# each feature's distribution.
# NOTE(review): the positional slice leaves out the first and last columns,
# assumed to be 'class' and 'price_usd' - confirm against the CSV layout.
df_norm = df.iloc[:, 1:-1]
df_norm = df_norm.sub(df_norm.mean(), axis='columns').div(df_norm.std(), axis='columns')
# carry the label and the price over unscaled, then restore the original column order
df_norm[['class', 'price_usd']] = df[['class', 'price_usd']]
df_norm = df_norm.loc[:, df.columns]
df_norm.iloc[:5].T

unique_id,593,617,782,990,822
class,1.0,1.0,1.0,1.0,1.0
alcohol_percentage,1.514341,0.245597,0.196325,1.686791,0.294868
malic_acid,-0.560668,-0.498009,0.021172,-0.345835,0.227053
ash,0.2314,-0.825667,1.106214,0.486554,1.835226
alcalinity,-1.166303,-2.483841,-0.267982,-0.806975,0.450674
magnesium,1.908522,0.018094,0.08811,0.9283,1.278379
phenols,0.806722,0.567048,0.806722,2.484437,0.806722
flavanoids,1.031908,0.731565,1.212114,1.462399,0.661485
nonflavanoids,-0.657708,-0.818411,-0.497005,-0.979113,0.226158
proanthocyanins,1.221438,-0.543189,2.129959,1.029251,0.400275


In [12]:
# Summary statistics for each wine class, printed one table per class.
# The group key is discarded (`_`), so the tables are not labeled;
# groupby sorts keys by default, so they print in ascending class order.
for _, grp in df.groupby('class'):
    # drop the first column ('class' in the preview of df earlier in the
    # notebook): it is constant within a group, so describing it adds nothing
    grp = grp.iloc[:, 1:]
    print(grp.describe())

       alcohol_percentage  malic_acid        ash  alcalinity   magnesium  \
count           59.000000   59.000000  59.000000   59.000000   59.000000   
mean            13.744746    2.010678   2.455593   17.037288  106.338983   
std              0.462125    0.688549   0.227166    2.546322   10.498949   
min             12.850000    1.350000   2.040000   11.200000   89.000000   
25%             13.400000    1.665000   2.295000   16.000000   98.000000   
50%             13.750000    1.770000   2.440000   16.800000  104.000000   
75%             14.100000    1.935000   2.615000   18.700000  114.000000   
max             14.830000    4.040000   3.220000   25.000000  132.000000   

         phenols  flavanoids  nonflavanoids  proanthocyanins      color  \
count  59.000000   59.000000      59.000000        59.000000  59.000000   
mean    2.840169    2.982373       0.290000         1.899322   5.528305   
std     0.338961    0.397494       0.070049         0.412109   1.238573   
min     2.20000