<a href="https://colab.research.google.com/github/slegro97/confidence-intervals/blob/main/Confidence_Intervals.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Libraries and Data

In [None]:
# Switch the working directory to the project folder so the relative CSV path used below resolves.
# NOTE(review): hard-coded Google Drive path; assumes Drive is already mounted at /content/drive (Colab) — confirm before Run All.
%cd /content/drive/MyDrive/PythonProjects/Statistics with Python/Inferential Statistics/Confidence Intervals

/content/drive/MyDrive/PythonProjects/Statistics with Python/Inferential Statistics/Confidence Intervals


In [None]:
# Libraries
import pandas as pd
import scipy.stats as st
import math as m
import statsmodels.stats.api as sm
import numpy as np

In [None]:
# Data
# Load the pizza-restaurant orders; the relative path depends on the working directory set by the %cd cell.
df = pd.read_csv('pizza_restaurant.csv')
df.head()  # preview the first rows as the cell output

Unnamed: 0,Product Name,Crust,Toppings,Price,Delivery Time,# pizzas the customer ordered before
0,Pepperoni,Cheese_and_Garlic,4,17,26.3,4
1,Hawaiian,Cheese_and_Chili,4,17,27.8,4
2,Calzone,Cheese_and_Garlic,3,20,31.5,7
3,Margherita,Cheese,4,23,20.8,7
4,Calzone,Cheese_and_Garlic,4,19,27.7,8


In [None]:
# Summary Statistics
# Count, mean, std, min, quartiles and max for each numeric column.
df.describe()

Unnamed: 0,Toppings,Price,Delivery Time,# pizzas the customer ordered before
count,1000.0,1000.0,1000.0,1000.0
mean,3.965,19.342,25.0611,5.659
std,1.021185,3.345479,2.490397,2.459831
min,1.0,12.0,17.8,0.0
25%,3.0,17.0,23.3,4.0
50%,4.0,19.0,25.1,5.0
75%,5.0,21.0,26.7,7.0
max,7.0,33.0,32.4,15.0


# Standard Error of the Sample Mean

In [None]:
# Using the formula: SD / sqrt(n) -> Price
# .std() is the sample SD (pandas default ddof=1); .count() is the number of non-missing prices.
print(f"Std Err of Sample Mean: {df.Price.std() / m.sqrt(df.Price.count())}")

Std Err of Sample Mean: 0.105793327900337


In [None]:
# Using the function:
# st.sem computes SD / sqrt(n) in one call (its default ddof=1 matches pandas' .std()),
# so the printed value equals the manual formula.
print(f'Std Err of Sample Mean: {st.sem(df.Price)}')

Std Err of Sample Mean: 0.105793327900337


# Z-Score and Standardization

In [None]:
# Using the formula: (value - avg)/SD -> Delivery Time
# Centre on the column mean, then rescale by the sample SD (pandas .std(), ddof=1).
df['Delivery Time Standardized'] = (
    df['Delivery Time']
    .sub(df['Delivery Time'].mean())
    .div(df['Delivery Time'].std())
)

In [None]:
# Using Sklearn
# preprocessing.scale centres to mean 0 and divides by the population SD (ddof=0),
# unlike pandas' .std() (ddof=1) used for the manual column, so the two results differ slightly.
from sklearn import preprocessing
df['Delivery Time Standardized2'] = preprocessing.scale(df['Delivery Time'])

In [None]:
# Compare the standardization methods
# The last two columns hold the manual (pandas) and sklearn z-scores side by side.
df.head()

Unnamed: 0,Product Name,Crust,Toppings,Price,Delivery Time,# pizzas the customer ordered before,Delivery Time Standardized,Delivery Time Standardized2
0,Pepperoni,Cheese_and_Garlic,4,17,26.3,4,0.497471,0.49772
1,Hawaiian,Cheese_and_Chili,4,17,27.8,4,1.099784,1.100335
2,Calzone,Cheese_and_Garlic,3,20,31.5,7,2.585491,2.586785
3,Margherita,Cheese,4,23,20.8,7,-1.711012,-1.711868
4,Calzone,Cheese_and_Garlic,4,19,27.7,8,1.05963,1.06016


In [None]:
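# The two columns differ only slightly (by a factor of sqrt(n / (n - 1)) ≈ 1.0005 for n = 1000), which is negligible here.
# The gap comes from the SD: pandas' .std() divides by n - 1 (sample SD, ddof=1),
# whereas sklearn's preprocessing.scale divides by n (population SD, ddof=0).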

# Confidence Levels

In [None]:
# st.norm.ppf is the inverse CDF of the standard normal: it maps a lower-tail
# (cumulative) probability to the z-score below which that share of the distribution lies.
# NOTE(review): the values in `cl` are therefore cumulative probabilities, not significance
# levels; e.g. a two-sided 95% interval (alpha = 0.05) uses the 0.025 and 0.975 entries (z = ∓1.96).
cl = [0.005, 0.025, 0.05, 0.95, 0.975, 0.995]
for alpha in cl:
  # rounded to 2 decimals to match the usual z-table values
  print(f'The corresponding z-score for alpha = {alpha} is {round(st.norm.ppf(alpha), 2)}')

The corresponding z-score for alpha = 0.005 is -2.58
The corresponding z-score for alpha = 0.025 is -1.96
The corresponding z-score for alpha = 0.05 is -1.64
The corresponding z-score for alpha = 0.95 is 1.64
The corresponding z-score for alpha = 0.975 is 1.96
The corresponding z-score for alpha = 0.995 is 2.58


# Confidence Intervals for Large Samples

In [None]:
# Large-sample 95% CI for the mean Price: a normal distribution centred on the
# sample mean with the standard error of the mean as its scale.
print(f'The mean Price is {df.Price.mean()}')
CI_price = st.norm(loc=df.Price.mean(), scale=st.sem(df.Price)).interval(0.95)
print(f'The 95% Confidence Interval for the Population Mean of the Price is: {CI_price}')

The mean Price is 19.342
The 95% Confidence Interval for the Population Mean of the Price is: (19.134648887510703, 19.549351112489294)


In [None]:
# Using ChatGPT, create a function that, for each numerical variable with n > 30,
# computes the 95% CI for the mean

# Select numerical columns
# NOTE(review): when the standardization cells above have already run, df also holds
# 'Delivery Time Standardized' and 'Delivery Time Standardized2', which are numeric and
# will be selected here too; the recorded output below lists only the four original columns.
numerical_columns = df.select_dtypes(include=[np.number])

# Function to calculate 95% confidence intervals
def calculate_ci_95(df, confidence=0.95):
    """Return large-sample (normal) confidence intervals for column means.

    For every column of ``df`` with more than 30 non-missing observations the
    interval ``mean ± z * SEM`` is computed from the standard normal
    distribution.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are all numeric (e.g. the result of
        ``select_dtypes(include=[np.number])``).
    confidence : float, default 0.95
        Confidence level of the interval, strictly between 0 and 1.

    Returns
    -------
    dict
        Maps column name -> ``(lower, upper)``. Columns with 30 or fewer
        non-missing values are omitted.
    """
    ci_dict = {}

    for column in df.columns:
        # Drop missing values once so that n, the mean and the SEM all describe
        # the same observations; st.sem would otherwise propagate a single NaN
        # into a (nan, nan) interval even though count()/mean() ignore it.
        values = df[column].dropna()
        if len(values) > 30:  # normal approximation only for large samples
            ci_dict[column] = st.norm.interval(
                confidence, loc=values.mean(), scale=st.sem(values)
            )

    return ci_dict

# Apply the function
# Result is a dict mapping each qualifying column name to its (lower, upper) bounds.
ci_95_results = calculate_ci_95(numerical_columns)
print(ci_95_results)


{'Toppings': (3.9017074909279676, 4.028292509072032), 'Price': (19.134648887510703, 19.549351112489294), 'Delivery Time': (24.9067464105456, 25.2154535894544), '# pizzas the customer ordered before': (5.5065408812039385, 5.811459118796061)}


# Confidence Intervals for Small Samples

In [None]:
# Take a sample from the data
# random_state pins the draw so that Restart & Run All yields the same 20 rows
# (and therefore the same small-sample intervals) on every run.
sample = df.sample(20, random_state=42)
sample.describe()

Unnamed: 0,Toppings,Price,Delivery Time,# pizzas the customer ordered before
count,20.0,20.0,20.0,20.0
mean,4.75,19.5,25.805,6.25
std,1.019546,2.544344,2.495781,3.006572
min,3.0,16.0,19.8,2.0
25%,4.0,18.0,24.95,3.0
50%,5.0,18.5,25.75,6.0
75%,5.0,21.25,27.175,9.0
max,7.0,26.0,31.1,11.0


In [None]:
# Confidence intervals for small sample (n=20) using T-distribution
# Rebinds numerical_columns (previously the numeric columns of the full df) to those of the 20-row sample.
numerical_columns = sample.select_dtypes(include=[np.number])

# Function to calculate 95% confidence intervals
# NOTE: this reuses the name of the normal-based calculate_ci_95 from the
# large-sample section, so that earlier definition is replaced once this cell runs.
def calculate_ci_95(df, confidence=0.95):
    """Return small-sample (Student-t) confidence intervals for column means.

    For every column of ``df`` with between 2 and 30 non-missing observations
    the interval ``mean ± t * SEM`` is computed from the t-distribution with
    ``n - 1`` degrees of freedom.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are all numeric (e.g. the result of
        ``select_dtypes(include=[np.number])``).
    confidence : float, default 0.95
        Confidence level of the interval, strictly between 0 and 1.

    Returns
    -------
    dict
        Maps column name -> ``(lower, upper)``. Columns with more than 30 or
        fewer than 2 non-missing values are omitted.
    """
    ci_dict = {}

    for column in df.columns:
        # Use only the non-missing values so that n, the mean, the SEM and the
        # degrees of freedom all refer to the same observations.
        values = df[column].dropna()
        n = len(values)
        # Small samples only (n <= 30 complements the n > 30 rule of the
        # large-sample version); n < 2 has no sample SD, so no interval exists.
        if 2 <= n <= 30:
            ci = st.t.interval(confidence = confidence,
                               df = n - 1,  # taken from the argument, not from a global frame
                               loc = values.mean(),
                               scale = st.sem(values))
            ci_dict[column] = ci  # Store CI

    return ci_dict

# Apply the function
# Result is a dict mapping each qualifying sample column to its (lower, upper) t-interval bounds.
ci_95_results = calculate_ci_95(numerical_columns)
print(ci_95_results)

{'Toppings': (4.272837867057115, 5.227162132942885), 'Price': (18.309210553592383, 20.690789446407617), 'Delivery Time': (24.63693870056247, 26.97306129943752), '# pizzas the customer ordered before': (4.842881107351889, 7.657118892648111)}
