In [1]:
import pandas as pd

# Toy supplier table, stored column-wise: each list holds one attribute,
# with its four entries lined up in supplier order A, B, C, D.
data = dict([
    ('Supplier', ['A', 'B', 'C', 'D']),
    ('Supply Capacity', [500, 600, 400, 300]),
    ('Cost per kg', [20, 15, 18, 22]),
    ('Carbon Footprint per kg', [4, 3, 5, 2.5]),
    ('Quality Score', [90, 85, 80, 88]),
    ('Wage per kg', [6, 5, 4, 5.5]),
])

# Dict keys become the column labels, in insertion order.
df = pd.DataFrame.from_dict(data)

# Write the table to disk; index=False keeps the positional row index out
# of the file so it contains only the six data columns.
df.to_csv(path_or_buf='supplier_data.csv', index=False)

print("CSV file 'supplier_data.csv' created successfully!")

CSV file 'supplier_data.csv' created successfully!


In [24]:
import numpy as np
import pandas as pd
from faker import Faker

# Fix the global NumPy seed so every NumPy draw below repeats from run to run
np.random.seed(77)

# How many supplier rows to simulate
n_suppliers = 30

# Target mean of each simulated metric; the unit is part of the column label
means = {
    'Cost per bag (euros)': 150,  # Euros per 60kg bag
    'Wage per day (euros)': 9,  # Euros per day
    'Water usage (liters per bag)': 3600,  # Liters used per 60kg bag
    'Quality of coffee (total cup points)': 80,  # Quality score out of 100
    'Yield (bags per ha)': 43  # Bags (60kg) per hectare
}

# Standard deviation of each metric as a fraction of its mean:
# 5% for the quality score, 10% for every other metric
relative_spread = {
    'Cost per bag (euros)': 0.10,
    'Wage per day (euros)': 0.10,
    'Water usage (liters per bag)': 0.10,
    'Quality of coffee (total cup points)': 0.05,
    'Yield (bags per ha)': 0.10
}
std_devs = {label: means[label] * relative_spread[label] for label in means}

# Pairwise correlations; rows and columns follow the key order of `means`.
# Water usage is negatively correlated with cost per bag, and yield is
# uncorrelated with every other metric.
correlations = [
    [1.0, 0.4, -0.3, 0.6, 0.0],  # Cost per bag (euros)
    [0.4, 1.0, 0.3, 0.3, 0.0],  # Wage per day (euros)
    [-0.3, 0.3, 1.0, 0.3, 0.0],  # Water usage (liters per bag)
    [0.6, 0.3, 0.3, 1.0, 0.0],  # Quality of coffee (total cup points)
    [0.0, 0.0, 0.0, 0.0, 1.0]   # Yield (bags per ha)
]

# Column labels (dict insertion order) fix the axis order used below
columns = list(means)

# Line the means and standard deviations up as vectors in column order
mean_values = np.array([means[label] for label in columns])
std_dev_values = np.array([std_devs[label] for label in columns])

# cov[i, j] = sd_i * sd_j * corr[i, j]
covariance_matrix = (
    std_dev_values[:, np.newaxis] * std_dev_values[np.newaxis, :] * np.asarray(correlations)
)

# One multivariate-normal draw per supplier (rows) across the five metrics (columns)
correlated_data = np.random.multivariate_normal(
    mean=mean_values, cov=covariance_matrix, size=n_suppliers
)

# Label the columns of the sample
correlated_data_df = pd.DataFrame(correlated_data, columns=columns)

# Keep the bounded metrics inside their allowed (lower, upper) range
clip_bounds = {
    'Water usage (liters per bag)': (0, 10000),
    'Quality of coffee (total cup points)': (0, 100),
}
for clip_label, (clip_min, clip_max) in clip_bounds.items():
    correlated_data_df[clip_label] = correlated_data_df[clip_label].clip(clip_min, clip_max)

# Use Faker to generate additional non-correlated data
fake = Faker()

# Faker draws from its own random generator, which np.random.seed() does not
# touch. Seed it as well (same value as the NumPy seed) so the Faker-generated
# supplier IDs are reproducible from run to run, like the NumPy-generated data.
Faker.seed(77)

def generate_farm_size():
    """Draw one random farm size in hectares.

    With probability 0.8 the size is uniform on [0, 5) ha; otherwise it is
    uniform on [5, 10) ha. Uses the global NumPy random state.

    Returns:
        float: the sampled farm size in hectares.
    """
    # Draw the band selector first, then the size, so the global random
    # stream is consumed in a fixed order (selector, then size).
    in_small_band = np.random.rand() < 0.8
    if in_small_band:
        return np.random.uniform(0, 5)
    return np.random.uniform(5, 10)

# One farm size per supplier, drawn from generate_farm_size()
farm_sizes = [generate_farm_size() for _supplier in range(n_suppliers)]

# One distinct four-digit identifier (1000-9999) per supplier
supplier_ids = [fake.unique.random_int(min=1000, max=9999) for _supplier in range(n_suppliers)]

# Descriptive columns that are independent of the correlated metrics;
# every supplier is placed in the same country
additional_data = {
    'Supplier ID': supplier_ids,
    'Country': ['Brazil' for _supplier in range(n_suppliers)],
    'Farm size (ha)': farm_sizes,
}

additional_data_df = pd.DataFrame.from_dict(additional_data)

# Place the descriptive columns to the left of the correlated metrics;
# rows are matched on the index
final_data_df = pd.concat([additional_data_df, correlated_data_df], axis='columns')
final_data_df

Unnamed: 0,Supplier ID,Country,Farm size (ha),Cost per bag (euros),Wage per day (euros),Water usage (liters per bag),Quality of coffee (total cup points),Yield (bags per ha)
0,2215,Brazil,8.661316,139.347218,8.333851,3680.683411,79.240432,40.507042
1,9076,Brazil,4.076465,154.167936,8.407694,2889.035717,80.605521,36.224038
2,7252,Brazil,2.812779,132.432416,9.248791,3709.936647,74.615181,38.48889
3,7044,Brazil,6.195632,160.770117,10.171791,4211.292315,85.698427,42.186605
4,1320,Brazil,6.203762,160.571009,10.839115,3560.954866,81.1559,38.675014
5,7187,Brazil,7.744058,153.531658,9.84469,4234.077032,82.501903,40.719994
6,2969,Brazil,2.545213,177.931658,10.015965,3160.342432,83.160985,44.719786
7,2230,Brazil,4.247926,142.524424,10.192133,4002.812655,78.731798,44.182758
8,9483,Brazil,1.022368,165.491351,8.711219,3522.130037,84.085576,42.275953
9,4180,Brazil,2.919738,152.700233,9.786891,3554.263745,80.426174,43.305123


In [26]:
# Export the combined supplier table to CSV, leaving the row index out of the file

final_data_df.to_csv(path_or_buf='coffee_supplier_data.csv', index=False)