<a href="https://colab.research.google.com/github/rybott/FEFF_War_Games/blob/main/Company_Generator.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Modules Used

In [2]:
import pandas as pd
import numpy as np
from datetime import timedelta
import random
import duckdb as ddb

# Further Analysis
import matplotlib.pyplot as plt
# import plotly.express as px

# Generating the Sales Data for the Company

In [3]:
def generate_sales_data(num_records, start_date, end_date, regions_df, products_df):
    # Lists to store generated data
    order_dates = []
    shipment_dates = []
    regions = []
    countries = []
    product_types = []
    unit_prices = []
    unit_costs = []
    quantities = []

    # Generate data for each record
    for _ in range(num_records):
        # Randomly select region and corresponding country
        region_row = regions_df.sample().iloc[0]
        regions.append(region_row['StoreID'])
        countries.append(region_row['StoreName'])

        # Randomly select product and corresponding unit price and cost
        product_row = products_df.sample().iloc[0]
        product_types.append(product_row['Product'])
        unit_prices.append(product_row['Unit Price'])
        unit_costs.append(product_row['Unit Cost'])

        # Random order date between start and end date
        random_date = start_date + timedelta(days=np.random.randint(0, (end_date-start_date).days))
        order_dates.append(random_date)
        # Ship date between 1 and 50 days after order date
        shipment_dates.append(random_date + timedelta(days=np.random.randint(1, 51)))

        # Random quantity between 1000 and 10000
        quantities.append(np.random.randint(1000, 10001))

    # Create dataframe
    sales_df = pd.DataFrame({
        'Order_Date': order_dates,
        'Shipment_Date': shipment_dates,
        'StoreID': regions,
        'StoreName': countries,
        'Product_Type': product_types,
        'Unit_Price': unit_prices,
        'Unit_Cost': unit_costs,
        'Quantity': quantities,
        'Revenue': np.array(unit_prices) * np.array(quantities),
        'Total_Cost': np.array(unit_costs) * np.array(quantities)
    })

    sales_df['Total Profit'] = sales_df['Revenue'] - sales_df['Total_Cost']

    return sales_df

# Example usage:
regions_data = {
    'StoreID': [1,2,3,4,5,6,7,8,9,10],
    'StoreName': ["StoreNY","StoreLA","StorePA","StoreKY","StoreWA","StoreDC","StoreFL","StoreAL","StoreTX","StoreMN",]
}
regions_df = pd.DataFrame(regions_data)

products_data = {
    'Product': ["Office Supplies","Vegetables","Fruits","Cosmetics","Cereal","Baby Food","Beverages","Snacks","Clothes","Household","Personal Care","Meat"],
    'Unit Price': [651.21,154.06,9.33,437.2,205.7,255.28,47.45,152.58,109.28,668.27,81.73,421.89],
    'Unit Cost': [524.96,90.93,6.92,263.33,117.11,159.42,31.79,97.44,35.84,502.54,56.67,364.69]
}

products_df = pd.DataFrame(products_data)

start_date = pd.Timestamp('2020-01-01')
end_date = pd.Timestamp('2022-01-01')
num_records = 1000

sales_df = generate_sales_data(num_records, start_date, end_date, regions_df, products_df)
print(sales_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   Order_Date     1000 non-null   datetime64[ns]
 1   Shipment_Date  1000 non-null   datetime64[ns]
 2   StoreID        1000 non-null   int64         
 3   StoreName      1000 non-null   object        
 4   Product_Type   1000 non-null   object        
 5   Unit_Price     1000 non-null   float64       
 6   Unit_Cost      1000 non-null   float64       
 7   Quantity       1000 non-null   int64         
 8   Revenue        1000 non-null   float64       
 9   Total_Cost     1000 non-null   float64       
 10  Total Profit   1000 non-null   float64       
dtypes: datetime64[ns](2), float64(5), int64(2), object(2)
memory usage: 86.1+ KB
None


# Create Purchase Order Data

In [4]:
# df = pd.read_excel('Sales_Data_500k.xlsx')
data = sales_df.copy()

df = data.copy()
print(df.info())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   Order_Date     1000 non-null   datetime64[ns]
 1   Shipment_Date  1000 non-null   datetime64[ns]
 2   StoreID        1000 non-null   int64         
 3   StoreName      1000 non-null   object        
 4   Product_Type   1000 non-null   object        
 5   Unit_Price     1000 non-null   float64       
 6   Unit_Cost      1000 non-null   float64       
 7   Quantity       1000 non-null   int64         
 8   Revenue        1000 non-null   float64       
 9   Total_Cost     1000 non-null   float64       
 10  Total Profit   1000 non-null   float64       
dtypes: datetime64[ns](2), float64(5), int64(2), object(2)
memory usage: 86.1+ KB
None


### Data Analysis #1
Checking for any errors or irregulairities
I am also Learning SQL Currently

In [None]:
qry1 = '''
SELECT DISTINCT Product_Name
FROM df
'''

qry2='''
SELECT MAX(Order_Date),MIN(Order_Date)
FROM df
'''

qry4 = '''
SELECT Quantity, Count(Quantity) as sold
FROM df
GROUP BY Quantity
ORDER BY Quantity
'''

qry3 = '''
SELECT Revenue, Count(Revenue) as Rev
FROM df
GROUP BY Revenue
ORDER BY Revenue
'''

df3 = ddb.sql(qry4).df()
print(df3)

In [None]:
plt.plot(df3.index, df3["sold"])
plt.show()

## Creating the Purchase Order

In [5]:
# Load your sales data (assuming it's in a CSV file)
sales_data = df
sales_data = sales_data.sort_values(by=['Order_Date'])


# Define product types
product_types = ['Cereal', 'Snacks', 'Beverages', 'Baby Food', 'Meat', 'Fruits', 'Vegetables', 'Personal Care', 'Cosmetics', 'Household', 'Office Supplies', 'Clothes']

# Initialize empty dataframes for purchase orders and inventory
purchase_orders = pd.DataFrame(columns=['OrderID', 'Product', 'Vendor', 'Quantity', 'OrderDate'])
Inventory_dict = {'Cereal':0, 'Snacks':0, 'Beverages':0, 'Baby Food':0, 'Meat':0, 'Fruits':0, 'Vegetables':0, 'Personal Care':0, 'Cosmetics':0, 'Household':0, 'Office Supplies':0, 'Clothes':0}
Vendor_dict = {'Cereal':'Foodco', 'Snacks':'Foodco', 'Beverages':'Foodco', 'Baby Food':'Foodco', 'Meat':'Farmco', 'Fruits':'Farmco', 'Vegetables':'Farmco', 'Personal Care':'Beautyco', 'Cosmetics':'Beautyco', 'Household':'Homeco', 'Office Supplies':'Homeco', 'Clothes':'Fashionco'}

# Purchase Order lists
Product_Ordered = []
Quantity_Purchased = []
Date_Purchased = []
Vendor = []


# Define a function to generate purchase orders

for index, row in sales_data.iterrows():
  x = 0
  product = row['Product_Type']
  quantity_sold = row['Quantity']
  order_date = row['Order_Date']
  Unit_Cost = row['Unit_Cost']

  if product in Inventory_dict.keys():
   if quantity_sold > Inventory_dict[product]:
    purchase_quantity = (quantity_sold - Inventory_dict[product]) * 1.05
    Inventory_dict[product] = (purchase_quantity + Inventory_dict[product]) - quantity_sold
    Product_Ordered.append(product)
    Quantity_Purchased.append(purchase_quantity)
    Date_Purchased.append(order_date)
    Ven = Vendor_dict[product]
    Vendor.append(Ven)
  else:
    Inventory_dict[product] = Inventory_dict[product] - quantity_sold

PO_df = pd.DataFrame(
    {'Vendor': Vendor,
     'Quantity': Quantity_Purchased,
     'Product': Product_Ordered,
     'Order_Date': Date_Purchased,
     'Unit_Cost': Unit_Cost
    })

Qry_PO ='''
    SELECT Vendor,
    Product,
    SUM(Quantity),
    YEAR(Order_Date) AS Year,
    QUARTER(Order_Date) AS Quarter,
    CAST(
      CASE
        WHEN QUARTER(Order_Date) = 1
          THEN CONCAT(YEAR(Order_Date), '-01-01')
        WHEN QUARTER(Order_Date) = 2
          THEN CONCAT(YEAR(Order_Date), '-04-01')
        WHEN QUARTER(Order_Date) = 3
          THEN CONCAT(YEAR(Order_Date), '-07-01')
        WHEN QUARTER(Order_Date) = 4
          THEN CONCAT(YEAR(Order_Date), '-10-01')
      END AS DATE) AS Purchase_Date
    FROM PO_df
    GROUP BY Vendor, Product, YEAR(Order_Date), QUARTER(Order_Date)
    ORDER BY Year, Quarter
'''

df_PO = ddb.sql(Qry_PO).df()

df_PO = df_PO.merge(products_df[['Product','Unit Cost']], on ='Product', how = 'left')

print(df_PO.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 96 entries, 0 to 95
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   Vendor         96 non-null     object        
 1   Product        96 non-null     object        
 2   sum(Quantity)  96 non-null     float64       
 3   Year           96 non-null     int64         
 4   Quarter        96 non-null     int64         
 5   Purchase_Date  96 non-null     datetime64[ns]
 6   Unit Cost      96 non-null     float64       
dtypes: datetime64[ns](1), float64(2), int64(2), object(2)
memory usage: 6.0+ KB
None


In [None]:
with pd.ExcelWriter('POs.xlsx') as writer:
    df_PO.to_excel(writer)

# Creating Inventory
Inventory Tbl
- InventoryID (Added after)
- Product
- Date 
    - Date Purchased or Date Sold
    - For POs this is the first of the month every quarter
- Quantity
- Unit Cost
    - Cost to obtain 
    - For POs this is a bulk purchase price
- Unit Price
    - Price sold at


In [6]:
Inv_DEC = pd.DataFrame({"Date" : df['Order_Date'],"Product":df["Product_Type"],"Quantity":df["Quantity"],"Unit_Cost":df["Unit_Cost"]})
Inv_INC = pd.DataFrame({"Date": df_PO['Purchase_Date'],"Product":df_PO['Product'],"Quantity":df_PO['sum(Quantity)'],"Unit_Cost":df_PO["Unit Cost"],"Vendor":df_PO['Vendor']})

df_INV = Inv_DEC.append(Inv_INC,ignore_index=True)

print(df_INV.info())
print(df_INV.head(10))



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1096 entries, 0 to 1095
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   Date       1096 non-null   datetime64[ns]
 1   Product    1096 non-null   object        
 2   Quantity   1096 non-null   float64       
 3   Unit_Cost  1096 non-null   float64       
 4   Vendor     96 non-null     object        
dtypes: datetime64[ns](1), float64(2), object(2)
memory usage: 42.9+ KB
None
        Date          Product  Quantity  Unit_Cost Vendor
0 2020-10-18             Meat    2354.0     364.69    NaN
1 2020-01-10             Meat    4477.0     364.69    NaN
2 2021-01-02        Cosmetics    4151.0     263.33    NaN
3 2020-02-27        Cosmetics    9114.0     263.33    NaN
4 2021-04-07  Office Supplies    4355.0     524.96    NaN
5 2021-11-29           Fruits    9841.0       6.92    NaN
6 2020-01-11             Meat    1805.0     364.69    NaN
7 2021-11-01        Hous

  df_INV = Inv_DEC.append(Inv_INC,ignore_index=True)


# Exporting to HTML

In [None]:
%%shell
jupyter nbconvert --to html /content/Company_Generator.ipynb

[NbConvertApp] Converting notebook /content/Company_Generator.ipynb to html
[NbConvertApp] Writing 614871 bytes to /content/Company_Generator.html




## Fraud in this Cycle

### For Sales
- Missing Inventory (Inventory less than sales because of larceny)
- Excessive Inventory (Inventory greater becasue of fictious sales)
### For POs / Inventory
- Missing Inventory (Inventory less than POs amount due to theft)
### Other 
- Errors in Expected Margin calculations
    - I have set the expected margin of excess inventory to 1.05x sales. So an ML model will be able to monitor quartely levels and determine if that ratio is within an acceptable range

## *Further Considerations*
- LIFO and FIFO Inventory by varying the Unit Cost
- Cost Accounting by varying Unit Price, determining what new price shoudl be based on market factors
     - Add things like market flucuation, maybe natural disasters, continuity
     - Continuity = This is lacking in my current generator because each order is indidepent, which is realistic, as trends in the quantity and frequency of sales are present within product groups as well as YOY

# Trying Expenses

### Finding Expenses as a % of revenue
***Percentages obtained from ChatGPT*** \
For Numbers I make up this is a good benchmark
- COGS (55.2%-38.7%)
- Rent & Utilities (5.5%-5.2%)
- Salaries & Wages (11%-15.5%)
- Marketing & Advertising (5.5%-7.8%)
- Depreciation & Amortization (2.2%-2.6%)
- Shipping & Freight (3.3%-5.2%)
- Packaging (1.1%-2.6%)
- Taxes (5.5%-7.8%)
- Licenses & Fees (1.1%-1.5%)
- Maintenance & Repairs (1.1%-2.6%)
- R&D (2.2%-5.2%)
- Insurance (1.1%-2.6%)
- Interest Expense (1.1%-2.6%)
- Miscellaneous Expenses (1.1%-1.5%)

Profit Margin: 4% (Average for other Retail Stores)

In [None]:
# Total Revenue Per year
qry_Trev = '''
    SELECT YEAR(Shipment_Date) AS Year, SUM(Revenue) AS Revenue
    FROM sales_df
    GROUP BY Year
    Order By Year
'''

# Total Revenue Per Product Per year
qry_rev = '''
    SELECT YEAR(Shipment_Date) AS Year, Product_Type AS Product, SUM(Revenue) AS Revenue
    FROM sales_df
    GROUP BY Year, Product
    Order By Year
'''

df_Trev = ddb.sql(qry_Trev).df()
df_rev = ddb.sql(qry_rev).df()

Rev_dict = dict(zip(df_Trev['Year'],df_Trev['Revenue']))

# Expenses from total Revenue
- Each Expense category will be assigned a % of total revenue that is <= 100% of Revenue LESS random profit margin (0-6% skewed towards 4%)
- Then a random number of transaction equaling the dollar ammount allocated to that expense category will be created
- Some expenses will be have to have continuality, where one year impacts the value of the next and some epenses such as COGS have to be dependent on the sales quanitity as well.

### steps
1. First Take your Total Revenue
2. Then you can decide what expenses you want to use, this will be decided by the scenario that the user chooses.  
3. Each scenario will also include a profit margin, and you'll subtract Total Rev and Profit Margin to get *Exp_Budget*
3. Then you can take *Exp_Budget* and for each category
  *Exp_Total_Year* = Exp_Budget * % allcated
4. Finally, create all of the corresponding expenses for each year within their exp category.


# Scenario 1

In [None]:
Expenses = {
    'COGS': (45,55),
    'Rent': (6,10),
    'Insurance': (2,4),
    'Wages': (15,20),
    'Ads': (6,8)
}

Exp_dict = {}
Exp_list = []


for year in Rev_dict.keys():
  Rev = Rev_dict[year]
  Year = year
  COGS = (random.randrange(Expenses['COGS'][0],Expenses['COGS'][1])/100)*Rev
  Rent = (random.randrange(Expenses['Rent'][0],Expenses['Rent'][1])/100)*Rev
  Insurance = (random.randrange(Expenses['Insurance'][0],Expenses['Insurance'][1])/100)*Rev
  Wages = (random.randrange(Expenses['Wages'][0],Expenses['Wages'][1])/100)*Rev
  Ads = (random.randrange(Expenses['Wages'][0],Expenses['Wages'][1])/100)*Rev
  PM = Rev - (COGS+Rent+Insurance+Wages+Ads)
  Exp_dict = {'Year':Year,'Rev':Rev,'COGS':COGS,'Rent':Rent,'Insurance':Insurance,'Wages':Wages,'Ads':Ads,'Profit_Margin':PM}
  Exp_list.append(Exp_dict)

Exp_df = pd.DataFrame(Exp_list)

print(Exp_df.info())

# Main Dataframes in one place

In [None]:
# Sales Data
sales_df
# Purchase Order Data
df_PO
# Inventory Data
df_INV