<a href="https://colab.research.google.com/github/ravindesigner/Cafe-Data-Set/blob/main/Cafe_Data_Set.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<a href="https://github.com/ravindesigner/Cafe-Data-Set" target="_parent"><img width="200px" src="https://i.ibb.co/42rWk0B/github-logo-png-1.png" alt="open in github" style="margin-right: 425px" border="0"></a>

<div>
    <h1 style="color: #31708F; text-align:center; font-size:50px; font-weight: bolder;">Cafe Data Set</h1><br>
</div>

<div class="alert alert-block alert-info">
    <p style="font-size:16px;">Our project collects key customer details for a <b>Café</b>, such as <b>Customer ID</b>, <b>Age</b>, <b>Gender</b>, <b>Beverage Ordered</b>, <b>Size</b>, <b>Total Amount Spent</b>, <b>Payment Method</b>, <b>Transaction ID</b>, <b>Staff Name</b>, and <b>Staff ID</b>. These insights are crucial for analyzing Coffee Shop operations and customer preferences.
</p>
</div>

***

<div class="alert alert-block alert-info">
        <p style="font-size:16px;">The dataset consists of <b>500 rows</b>, each representing a single transaction in a <b>cafe</b>. The columns in the dataset are:</p>
    <ul style="font-size:14px;">
        <li><b>Customer ID:</b> A unique identifier for each customer.</li><br>
        <li><b>Age:</b> The age of the customer.</li><br>
        <li><b>Gender:</b> The gender of the customer (e.g., Male, Female, Other).</li><br>
        <li><b>Beverage Ordered:</b> The type of beverage ordered (e.g., Coffee, Tea, Juice).</li><br>
        <li><b>Size:</b> The size of the beverage ordered (e.g., Small, Medium, Large).</li><br>
        <li><b>Price:</b> The total amount of money spent in the transaction.</li><br>
        <li><b>Payment Method:</b> The method of payment used (e.g., Cash, Credit Card, Mobile Payment).</li><br>
        <li><b>Transaction ID:</b> A unique identifier for each transaction.</li><br>
        <li><b>Staff Name:</b> The name of the staff member who handled the transaction.</li><br>
        <li><b>Staff ID:</b> A unique identifier for each staff member.</li>
    </ul>
</div>

***

<div class="alert alert-block alert-info">
    <p style="font-size:16px;"><b>Start</b></p>
</div>

In [None]:
#Load necessary packages
import pandas as pd
import numpy as np
import scipy.stats as st
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the cafe transactions CSV (path is relative to the notebook's working directory)
mydata = pd.read_csv('Cafe-Data-Set.csv')

In [None]:
mydata.head(5)  # preview the first five rows

In [None]:
mydata.tail(5)  # preview the last five rows

In [None]:
mydata.columns  # list the column labels

In [None]:
mydata.info()  # print each column's dtype and non-null count

In [None]:
mydata.describe()  # summary statistics (count, mean, std, min, quartiles, max)

---

<div class="alert alert-block alert-info">
    <p style="font-size:16px;"><b>Graph</b></p>
</div>

In [None]:
# Render matplotlib figures inline in the notebook output
import matplotlib.pyplot as plt
%matplotlib inline

mydata.hist(figsize=(10,10))  # one histogram per numeric column on a 10x10 inch figure
plt.show()  # render the figure

In [None]:
# Histogram of the 'Age' column alone on a 5x5 inch figure
mydata.hist(column=['Age'], figsize=(5, 5))
plt.show()

In [None]:
# Contingency table: one row per 'Gender', one column per 'Size', cells hold counts
pd.crosstab(index=mydata['Gender'], columns=mydata['Size'])

In [None]:
# Contingency table: one row per 'Gender', one column per 'Beverage Ordered', cells hold counts
pd.crosstab(index=mydata['Gender'], columns=mydata['Beverage Ordered'])

In [None]:
# Count of transactions per 'Gender', with one bar per 'Size', on an 8x6 inch figure
plt.figure(figsize=(8, 6))
sns.countplot(data=mydata, x="Gender", hue="Size")

In [None]:
# Grid of pairwise scatter plots (with per-variable distributions on the diagonal)
# for the numeric columns of the dataset
sns.pairplot(mydata)

---

<div class="alert alert-block alert-info">
    <p style="font-size:16px;"><b>Standard Deviation</b></p>
</div>

In [None]:
mydata['Age'].std()  # sample standard deviation of Age (pandas default ddof=1)

---

<div class="alert alert-block alert-info">
    <p style="font-size:16px;"><b>Mean</b></p>
</div>

In [None]:
mydata['Age'].mean()  # arithmetic mean of Age

In [None]:
# Density-scaled histogram of Age with a KDE overlay.
# sns.distplot is deprecated (and removed in recent seaborn releases); histplot with
# stat='density' and kde=True is the documented equivalent.
sns.histplot(mydata['Age'], kde=True, stat='density', kde_kws=dict(cut=3))

In [None]:
# Histograms of 'Age', one panel per 'Gender' group, on a 5x5 inch figure
mydata.hist(by='Gender', column='Age', figsize=(5, 5))

In [None]:
# Same per-'Gender' histograms of 'Age', this time at the default figure size
mydata.hist(by='Gender', column='Age')

In [None]:
# Histograms of 'Size', one panel per 'Gender' group, on a 10x5 inch figure
# NOTE(review): 'Size' is described above as Small/Medium/Large; a count/bar plot may suit
# categorical values better than a histogram — confirm this renders as intended.
mydata.hist(by='Gender', column='Size', figsize=(10, 5))

In [None]:
# Wide 15x5 inch canvas so every beverage category fits on the x-axis
plt.figure(figsize=(15, 5))

# Box plot of 'Age' for each 'Beverage Ordered' category, with boxes at half width
sns.boxplot(data=mydata, x="Beverage Ordered", y="Age", width=0.5)

---

In [None]:
#Load necessary packages
import numpy as np
import scipy.stats as st
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Work with the 'Age' column on its own
x = mydata['Age']

In [None]:
# Spot-check the values at positions 1 through 14 (the slice end, 15, is exclusive)
x[1:15]

In [None]:
# Histogram of x drawn in the notebook's blue accent colour (#31708F)
fig, ax = plt.subplots()
_ = ax.hist(x, color='#31708F')

---

<div class="alert alert-block alert-info">
    <p style="font-size:16px;"><b>Mean</b></p>
</div>

In [None]:
# Sample mean of x, kept in xbar for the plots further down
xbar = x.mean()
xbar

---

<div class="alert alert-block alert-info">
    <p style="font-size:16px;"><b>Figures</b></p>
</div>

In [None]:
# Histogram of x with the mean marked by an orange vertical line
fig, ax = plt.subplots()
ax.axvline(x.mean(), color='orange')
_ = ax.hist(x, color='#31708F')

---

<div class="alert alert-block alert-info">
    <p style="font-size:16px;"><b>Median</b></p>
</div>

In [None]:
np.median(x)  # median of x

In [None]:
# Histogram of x with the mean (orange) and the median (red) marked
fig, ax = plt.subplots()
ax.axvline(np.mean(x), color='orange')
ax.axvline(np.median(x), color='red')
_ = ax.hist(x, color='#31708F')

***

<div class="alert alert-block alert-info">
    <p style="font-size:16px;"><b>Variance</b></p>
</div>

In [None]:
x.var()  # sample variance of x (pandas default ddof=1)

In [None]:
# The square root of the variance is the standard deviation
x.var() ** 0.5

---

<div class="alert alert-block alert-info">
    <p style="font-size:16px;"><b>Standard Deviation</b></p>
</div>

In [None]:
# Sample standard deviation of x (pandas default ddof=1), kept in sigma for the plot below
sigma = x.std()
sigma

In [None]:
# Histogram of x with the mean and the ±1σ / ±1.5σ boundaries overlaid
fig, ax = plt.subplots()
ax.axvline(xbar, color='orange')                    # mean
ax.axvline(xbar + 1.5 * sigma, color='#31708F')     # mean + 1.5 sigma
ax.axvline(xbar + sigma, color='olivedrab')         # mean + 1 sigma
ax.axvline(xbar - sigma, color='olivedrab')         # mean - 1 sigma
ax.axvline(xbar - 1.5 * sigma, color='#31708F')     # mean - 1.5 sigma
_ = ax.hist(x, color='lightgray')

***

<div class="alert alert-block alert-info">
    <p style="font-size:16px;"><b>Gaussian Distribution</b></p>
    <p style="font-size:14px;">Named after Carl Friedrich Gauss; also known as the <b>normal distribution</b>.</p>
</div>

In [None]:
# Re-bind x to the 'Age' column for this section
x = mydata['Age']

In [None]:
sns.set_style('ticks')  # use seaborn's 'ticks' style for the plots that follow

In [None]:
# Histogram of x with a kernel density estimate overlaid
_ = sns.displot(x, kde=True)

In [None]:
x.mean()  # mean of x

In [None]:
x.std()  # sample standard deviation of x (pandas default ddof=1)

<div class="alert alert-block alert-info">
    <p style="font-size:16px;"><b>The Central Limit Theorem</b></p>
</div>

In [None]:
# Draw a random sample of 15 values from x, with replacement
# (unseeded, so the sample differs on every run)
x_sample = np.random.choice(x, size=15, replace=True)
x_sample

In [None]:
# Mean of the random sample drawn in the previous cell
x_sample.mean()

---

<div class="alert alert-block alert-info">
    <p style="font-size:16px;"><b>Sampling Distributions</b></p>
</div>

In [None]:
def sample_mean_calculator(input_dist, sample_size, n_samples, rng=None):
    """Draw repeated random samples (with replacement) and return each sample's mean.

    Parameters
    ----------
    input_dist : array-like
        One-dimensional population to sample from (e.g. a pandas Series or NumPy array).
    sample_size : int
        Number of observations drawn, with replacement, for each sample.
    n_samples : int
        Number of samples to draw, i.e. the length of the returned list.
    rng : numpy.random.Generator, optional
        Generator used for the draws, for reproducible results. When omitted,
        NumPy's global random state is used, exactly as before.

    Returns
    -------
    list
        ``n_samples`` sample means, in the order the samples were drawn.
    """
    # Convert once up front rather than on every draw.
    values = np.asarray(input_dist)
    draw = np.random.choice if rng is None else rng.choice
    return [
        draw(values, size=sample_size, replace=True).mean()
        for _ in range(n_samples)
    ]

In [None]:
# Sampling distribution of the mean: 1000 samples of size 10 drawn from x
sample_means = sample_mean_calculator(x, 10, 1000)
sns.displot(data=sample_means, color='green', kde=True)  # histogram with KDE overlay
_ = plt.xlim(21, 48)  # fix the x-axis range to 21-48

In [None]:
# Sampling distribution of the mean: 100 samples of size 10 drawn from x
sns.displot(sample_mean_calculator(x, 10, 100), color='green', kde=True)
_ = plt.xlim(0, 50)  # fix the x-axis range to 0-50

In [None]:
# Sampling distribution of the mean: 1000 samples of size 100 drawn from x
sns.displot(sample_mean_calculator(x, 100, 1000), color='green', kde=True)
_ = plt.xlim(0, 50)  # same x-axis range as the previous plot, for comparison

---

<div class="alert alert-block alert-info">
    <p style="font-size:16px;"><b>Sampling from a skewed distribution</b></p>
</div>

In [None]:
# Draw 10,000 values from a skew-normal distribution with shape parameter 10
s = st.skewnorm.rvs(10, size=10000)

In [None]:
# Histogram of the skewed draws with a KDE overlay
_ = sns.displot(s, kde=True)

In [None]:
# Sampling distribution of the mean: 1000 samples of size 10 drawn from the skewed data s
_ = sns.displot(sample_mean_calculator(s, 10, 1000), color='green', kde=True)

In [None]:
# Sampling distribution of the mean: 500 samples of size 20 drawn from x, in red with a KDE overlay
sns.displot(sample_mean_calculator(x, 20, 500), color='red', kde=True)
_ = plt.xlim(20, 45)  # fix the x-axis range to 20-45

---

<div class="alert alert-block alert-info">
    <p style="font-size:16px;"><b>Sampling from a multimodal distribution</b></p>
</div>

In [None]:
# Draw a fresh set of 10,000 values from a skew-normal distribution (shape parameter 10)
s = st.skewnorm.rvs(10, size=10000)

# Histogram of the draws with a KDE overlay
_ = sns.displot(s, kde=True)

In [None]:
# Bimodal data: 5000 draws from a standard normal plus 1000 draws from a normal centred at 4.0
m = np.concatenate((np.random.normal(size=5000), np.random.normal(loc=4.0, size=1000)))
_ = sns.displot(m, kde=True)  # histogram with KDE overlay

In [None]:
# Trimodal data: three normal components centred at 0, 4 and 8 with shrinking sizes
m = np.concatenate((
    np.random.normal(size=5000),
    np.random.normal(loc=4.0, size=2000),
    np.random.normal(loc=8.0, size=1000),
))
_ = sns.displot(m, kde=True)

In [None]:
# Sampling distribution of the mean: 1000 samples of size 1000 drawn from the multimodal data m
_ = sns.displot(sample_mean_calculator(m, 1000, 1000), color='#31708F', kde=True)

---

<div class="alert alert-block alert-info">
    <p style="font-size:16px;"><b>Sampling from Uniform</b></p>
</div>

In [None]:
# Bind the 'Age' column of mydata to u for this section
u = mydata['Age']

In [None]:
# Histogram of u (no KDE overlay)
_ = sns.displot(u)

In [None]:
# Sampling distribution of the mean: 1000 samples of size 100 drawn from u,
# plotted in the notebook's blue accent colour (#31708F) with a KDE overlay
_ = sns.displot(sample_mean_calculator(u, 100, 1000), color='#31708F', kde=True)

***

<div class="alert alert-block alert-info">
    <p style="font-size:16px;"><b>Z-Scores</b></p>
</div>

In [None]:
# Re-bind x to the 'Age' column of mydata for the z-score section
x = mydata.Age

In [None]:
# Reference value x_i = 42, with mu and sigma taken from the data:
# mu is the mean of x and sigma its sample standard deviation (pandas default ddof=1)
x_i = 42
mu = x.mean()
sigma = x.std()

In [None]:
# Histogram of x with the mean, the ±1σ and ±2σ boundaries, and the reference value x_i marked
sns.displot(x, color='gray')
# displot opens its own figure, so set the limits on the current axes; the `ax` variable
# still points at a figure created in an earlier cell and would leave this plot untouched.
plt.xlim(0, 100)
plt.axvline(mu, color='orange')  # mean
for v in [-2, -1, 1, 2]:  # mu ± 1 and ± 2 standard deviations
    plt.axvline(mu + v * sigma, color='olivedrab')
_ = plt.axvline(x_i, color='#31708F')  # the value whose z-score is computed below

***

In [None]:
# z-score of x_i: its distance from the mean in units of standard deviations,
# z = (x_i - mu) / sigma
z = (x_i - mu) / sigma
z

In [None]:
# Same z-score computed with NumPy's mean/std helpers: z = (x_i - mean(x)) / std(x)
# NOTE(review): np.std defaults to ddof=0 (population) whereas x.std() in the earlier cell
# defaults to ddof=1 (sample), so this value can differ slightly — confirm which is intended.
z = (x_i - np.mean(x)) / np.std(x)
z

In [None]:
# Number of observations in x that are strictly greater than x_i
count = int((x > x_i).sum())

In [None]:
# Percentage of observations in x that exceed x_i: count * 100 / len(x)
(count * 100) / len(x)

In [None]:
# 99th percentile of x
np.percentile(x, 99)

In [None]:
# Switch to a synthetic example with fixed parameters: mu = 90, sigma = 2
# (this overwrites the mu and sigma computed from x above)
mu = 90
sigma = 2

In [None]:
# Draw 100 random values from a normal distribution with mean mu and standard deviation sigma
y = np.random.normal(mu, sigma, 100)

In [None]:
sns.displot(y, color='gray')  # histogram of y in gray
plt.axvline(mu, color='orange')  # mean (mu) in orange
for v in [-2, -1, 1, 2]:  # mu ± 1 and ± 2 standard deviations in olivedrab
    plt.axvline(mu + v * sigma, color='olivedrab')
_ = plt.axvline(mu, color='#31708F')  # mu again in '#31708F', drawn over the orange line

***

<div class="alert alert-block alert-info">
    <p style="font-size:16px;"><b>P-Values</b></p>
</div>

In [None]:
# Probability that a standard normal value falls below -1.5 standard deviations
p_below = st.norm.cdf(-1.5)
p_below

In [None]:
# Scale the probability to an expected count out of 500 observations
p_below * 500

In [None]:
# Histogram of y in gray
sns.displot(y, color='gray')

# Mark the lower cutoff mu - 1.5 * sigma in '#31708F'
_ = plt.axvline(mu - 1.5 * sigma, color='#31708F')

In [None]:
# Probability that a standard normal value falls below +1.5 standard deviations
st.norm.cdf(1.5)

In [None]:
# Probability that a standard normal value falls above +1.5 standard deviations
# (complement of the CDF)
p_above = 1 - st.norm.cdf(1.5)
p_above

In [None]:
# Scale the probability to an expected count out of 500 observations
p_above * 500

In [None]:
# Histogram of y in gray
sns.displot(y, color='gray')

# Mark the upper cutoff mu + 1.5 * sigma in '#31708F'
_ = plt.axvline(mu + 1.5 * sigma, color='#31708F')

In [None]:
# Two-tailed probability: a standard normal value lying more than 1.5 standard deviations
# from the mean in either direction
p_outside = p_below + p_above
p_outside

In [None]:
# Scale the two-tailed probability to an expected count out of 500 observations
p_outside * 500

In [None]:
# Histogram of y in gray
sns.displot(y, color='gray')

# Mark both cutoffs, mu + 1.5 * sigma and mu - 1.5 * sigma, in '#31708F'
plt.axvline(mu + 1.5 * sigma, color='#31708F')
_ = plt.axvline(mu - 1.5 * sigma, color='#31708F')

In [None]:
# z-scores at the 2.5th and 97.5th percentiles of a standard normal distribution.
# Both are computed in one vectorised call so that both appear in the cell output;
# with two separate expressions only the last one is displayed.
st.norm.ppf([.025, .975])

In [None]:
# Histogram of y in gray
sns.displot(y, color='gray')

# Mark mu ± 1.96 * sigma in dark red: the cutoffs matching the 2.5th / 97.5th percentile
# z-scores computed in the previous cell (the code previously re-plotted ±1.5 sigma).
plt.axvline(mu + 1.96 * sigma, color='darkred')
_ = plt.axvline(mu - 1.96 * sigma, color='darkred')

In [None]:
# z-scores at the 0.8th and 99.2nd percentiles of a standard normal distribution.
# Both are computed in one vectorised call so that both appear in the cell output;
# with two separate expressions only the last one is displayed.
st.norm.ppf([.008, .992])

In [None]:
# Histogram of y in gray
sns.displot(y, color='gray')

# Mark mu ± 1.96 * sigma in green
plt.axvline(mu + 1.96 * sigma, color='green')
plt.axvline(mu - 1.96 * sigma, color='green')

# Mark mu ± 2.56 * sigma in '#31708F'
# NOTE(review): the preceding cell evaluates ppf(.008) / ppf(.992), which is not ±2.56 —
# confirm which multiplier is intended here.
plt.axvline(mu + 2.56 * sigma, color='#31708F')
_ = plt.axvline(mu - 2.56 * sigma, color='#31708F')

---

<div class="alert alert-block alert-info">
    <p style="font-size:16px;"><b>Penguins</b></p>
</div>

In [None]:
# Load seaborn's 'penguins' example dataset and drop the rows that have missing values
penguins = sns.load_dataset('penguins').dropna()

In [None]:
penguins  # display the cleaned frame

In [None]:
# Distinct species in the penguins dataset and the number of rows for each
np.unique(penguins.species, return_counts=True)

In [None]:
# Keep only the rows whose species is 'Chinstrap'
Chinstrap = penguins[penguins.species == 'Chinstrap']

In [None]:
Chinstrap  # display the Chinstrap subset

In [None]:
# Islands on which Chinstrap penguins were recorded, with the row count for each
np.unique(Chinstrap.island, return_counts=True)

In [None]:
# Sex categories among the Chinstrap penguins, with the row count for each
np.unique(Chinstrap.sex, return_counts=True)

In [None]:
# Box plot of Chinstrap body mass (grams) per island, split by sex
_ = sns.boxplot(x='island', y='body_mass_g', hue='sex', data=Chinstrap)

In [None]:
# Body mass of the female Chinstrap penguins as a NumPy array,
# divided by 1000 to convert the 'body_mass_g' values from grams to kilograms
fem = Chinstrap[Chinstrap.sex == 'Female']['body_mass_g'].to_numpy()/1000
fem

In [None]:
# Body mass of the male Chinstrap penguins as a NumPy array,
# divided by 1000 to convert the 'body_mass_g' values from grams to kilograms
mal = Chinstrap[Chinstrap.sex == 'Male']['body_mass_g'].to_numpy()/1000
mal

In [None]:
# Mean body mass (kg) of the female Chinstrap penguins.
# Uses `fem` from the cell above; the previous `f` was never defined and raised NameError.
fbar = fem.mean()
fbar

In [None]:
# Mean body mass (kg) of the male Chinstrap penguins.
# Uses `mal` from the cell above; `m` is the synthetic multimodal array from the
# sampling section, so averaging it silently gave an unrelated number.
mbar = mal.mean()
mbar