# This code creates fake data based upon some requirements that are listed at the bottom of the notebook.

In [2]:
# install these packages only if needed. If not needed keep them commented!
# !pip install faker

In [3]:
import pandas as pd
import numpy as np
from faker import Faker
import random
from datetime import datetime, timedelta

# Initialize Faker
fake = Faker()

# Parameters
num_records = 10000  # Number of records to generate

### Generate Transaction Data (and view some rows of the data)

In [5]:
# Generate Transaction Data
transaction_data = []
for _ in range(num_records):
    transaction_id = fake.uuid4()
    timestamp = fake.date_time_this_year()
    amount = round(random.uniform(5.0, 500.0), 2)  # Amount between $5 and $500
    payment_method = random.choice(['Credit Card', 'Cash', 'Debit Card', 'PayPal'])
    store_location = random.choice(['Online', 'Store A', 'Store B', 'Store C'])
    
    transaction_data.append([transaction_id, timestamp, amount, payment_method, store_location])

In [6]:
# display(transaction_data[:10])
df_test_transaction = pd.DataFrame(transaction_data[:15], columns=['transaction_id', 'timestamp', 'amount', 'payment_method','store_location'])
display(df_test_transaction)

Unnamed: 0,transaction_id,timestamp,amount,payment_method,store_location
0,81e3cbd5-1987-48fe-a557-c725bcfb490c,2025-01-01 21:19:25,280.03,Debit Card,Store C
1,da6150ac-9861-4314-8976-f0e51f81f245,2025-01-04 08:24:16,456.83,Debit Card,Store B
2,c9c87273-ab75-461e-a106-a623fc7afd87,2025-01-01 16:49:31,107.43,PayPal,Store B
3,39878eff-b3dd-4914-bbc7-23fe2ef607ec,2025-01-05 09:49:14,118.59,Debit Card,Store C
4,22a33b68-f6c8-46ae-9ca2-2b12ddddbd64,2025-01-03 23:21:30,327.2,Debit Card,Store C
5,418bab99-51c2-4095-8e95-521a45c555f5,2025-01-05 13:40:50,94.84,Debit Card,Store B
6,91540fa9-f0d3-42ed-a9c9-72d5c44201c3,2025-01-05 18:48:41,238.52,Debit Card,Online
7,d230b999-d5c5-4262-863c-4ec2fe6390c3,2025-01-04 22:45:32,428.15,PayPal,Store C
8,47c2d617-8764-42bf-8f0b-258aa3c10738,2025-01-01 22:53:40,314.62,Credit Card,Store A
9,b70c68a0-57c4-4574-b7ae-48d5801aa155,2025-01-01 08:28:21,101.14,Cash,Store A


### Generate Customer Data (and view some rows of the data)

In [8]:
# Generate Customer Data
customer_data = []
for _ in range(num_records):
    customer_id = fake.uuid4()
    age = random.randint(18, 70)  # Age between 18 and 70
    gender = random.choice(['Male', 'Female'])
    income_level = random.choice(['Low', 'Medium', 'High'])
    
    purchase_history = random.sample(range(1, 101), random.randint(1, 10))  # Random previous transactions
    
    customer_data.append([customer_id, age, gender, income_level, purchase_history])

In [9]:
# display(transaction_data[:10])
df_test_customer = pd.DataFrame(customer_data[:15], columns=['customer_id', 'age', 'gender', 'income_level','purchase_history'])
display(df_test_customer)

Unnamed: 0,customer_id,age,gender,income_level,purchase_history
0,3661fff4-ad69-4bc8-9611-45bf7d2844b1,54,Female,Medium,"[82, 9, 10, 72, 7]"
1,5c698b1a-850b-4b6c-9555-a55962d24549,65,Female,Medium,"[58, 73, 21, 78, 12, 91]"
2,b86a4b42-d8b3-4115-9095-53c3b27e2198,62,Female,Medium,"[18, 5, 12, 93, 71, 79, 53, 57]"
3,d1f4583c-d5f9-4d64-95a8-88b6e5177e42,53,Male,High,"[32, 58, 1, 3, 76, 27, 34, 16, 55]"
4,1bc14837-792f-4461-9476-d1cb6dd5978c,43,Male,High,"[84, 24, 19, 54, 48, 96]"
5,fcdaaa8b-d09f-4cee-85f0-4b5b7fd8d33e,25,Male,High,"[36, 58, 55, 18, 44, 4, 12, 21, 31, 82]"
6,0ecd23d9-7d50-4a3a-aa88-d28aa91b93e8,65,Male,High,"[44, 96, 43, 48, 69]"
7,07de887d-3ce1-4116-853f-cebe474f87a2,25,Female,High,"[30, 52, 78, 65, 16, 3, 76, 63, 69]"
8,65d58bef-f764-4438-8d8a-cd69ffb25ffc,45,Female,Low,"[13, 19, 4, 98]"
9,b31e391c-a5b3-4d89-b7a4-29537002baa3,33,Male,Medium,"[96, 64, 56, 72, 74, 85]"


### Generate Product Data (and view some rpws pf the data)

In [11]:
# Generate Product Data
product_data = []
for _ in range(num_records):
    product_id = fake.uuid4()
    category = random.choice(['Electronics', 'Groceries', 'Clothing', 'Home', 'Toys'])
    price = round(random.uniform(1.0, 100.0), 2)  # Price between $1 and $100
    discount = random.choice([0, round(random.uniform(5, 50), 2)])  # Discount between $0 and $50
    
    product_data.append([product_id, category, price, discount])

In [12]:
# display(product_data[:10])
df_test_product = pd.DataFrame(product_data[:15], columns=['product_id', 'category', 'price', 'discount'])
display(df_test_product)

Unnamed: 0,product_id,category,price,discount
0,e64e85cd-1206-4680-84fd-37bf047b2d72,Home,29.51,9.96
1,1ae96db1-87c9-4ae7-9a76-32d116bb3c2e,Electronics,52.74,0.0
2,a617ef7f-dd9b-4898-8a04-e803362096fb,Groceries,60.79,0.0
3,8a552913-7a30-42e4-806d-52ab97df7428,Toys,11.82,25.66
4,89f40f91-9509-41c2-8c8f-8caec776b3e0,Electronics,8.27,16.3
5,9564da5a-9c62-4956-8416-beac6c4c331c,Electronics,67.9,0.0
6,0e166dac-9985-479b-bce8-2c9ea96177ff,Electronics,55.62,41.26
7,5e0de4a3-a7f8-46ee-adfb-aaacc51c73f4,Clothing,35.73,0.0
8,802e19b3-2d42-49df-95fe-6a3408936525,Clothing,79.04,17.38
9,1565d5b5-e346-4fe4-a22e-939c94a515f5,Groceries,92.53,0.0


### Generate Behavioral Data (and view some rows of the data)

In [14]:
# Generate Behavioral Data
behavioral_data = []
for _ in range(num_records):
    browsing_history = random.sample(['page1', 'page2', 'page3', 'page4', 'page5'], random.randint(1, 3))
    cart_abandonment_rate = random.randint(0, 5)  # Number of abandoned carts
    
    behavioral_data.append([browsing_history, cart_abandonment_rate])

In [15]:
# display(behavioral_data[:10])
df_test_behavioral = pd.DataFrame(behavioral_data[:15], columns=['browsing_history', 'cart_abandonment_rate'])
display(df_test_behavioral)

Unnamed: 0,browsing_history,cart_abandonment_rate
0,"[page2, page5]",2
1,[page2],0
2,[page4],2
3,"[page3, page4]",3
4,[page3],4
5,[page3],4
6,"[page1, page3]",0
7,"[page1, page5, page3]",1
8,"[page2, page1, page3]",0
9,"[page4, page2]",0


### Generate Anomalies and Flags (and view some rows of the data)

In [17]:
# Generate Anomalies and Flags
anomalies_data = []
for _ in range(num_records):
    fraud_flag = random.choice([0, 1])  # 0 = Not Fraudulent, 1 = Fraudulent
    high_risk_indicator = random.choice([0, 1])  # 0 = Low Risk, 1 = High Risk
    
    anomalies_data.append([fraud_flag, high_risk_indicator])

In [18]:
# display(anomalies_data[:10])
df_test_anomalies = pd.DataFrame(anomalies_data[:15], columns=['fraud_flag', 'high_risk_indicator'])
display(df_test_anomalies)

Unnamed: 0,fraud_flag,high_risk_indicator
0,0,1
1,1,0
2,1,1
3,1,1
4,1,0
5,0,0
6,1,1
7,1,1
8,0,1
9,0,0


### Generate External Data (and view some rows of the data)

In [20]:
# Generate External Data
external_data = []
for _ in range(num_records):
    geolocation = fake.local_latlng()  # Get random lat/lng
    market_trend = random.choice(['Stable', 'Increasing', 'Decreasing'])
    
    external_data.append([geolocation, market_trend])

In [21]:
# display(external_data[:10])
df_test_external = pd.DataFrame(external_data[:15], columns=['geolocation', 'market_trend'])
display(df_test_external)

Unnamed: 0,geolocation,market_trend
0,"(35.74788, -95.36969, Muskogee, US, America/Ch...",Increasing
1,"(38.91817, -78.19444, Front Royal, US, America...",Decreasing
2,"(41.27621, -72.86843, East Haven, US, America/...",Stable
3,"(44.99012, -123.02621, Keizer, US, America/Los...",Stable
4,"(46.32374, -120.00865, Sunnyside, US, America/...",Decreasing
5,"(40.24537, -75.64963, Pottstown, US, America/N...",Stable
6,"(30.5427, -97.54667, Hutto, US, America/Chicago)",Stable
7,"(41.47892, -87.45476, Schererville, US, Americ...",Decreasing
8,"(20.88953, -156.47432, Kahului, US, Pacific/Ho...",Increasing
9,"(35.05266, -78.87836, Fayetteville, US, Americ...",Decreasing


### Combine all datasets into a single dataframe (and view some rows of the dataframe)

In [48]:
# Combine all data into a single DataFrame
data = pd.DataFrame(transaction_data, columns=['Transaction ID', 'Timestamp', 'Amount', 'Payment Method', 'Store Location'])
data[['Customer ID', 'Age', 'Gender', 'Income Level', 'Purchase History']] = pd.DataFrame(customer_data, index=data.index)
data[['Product ID', 'Category', 'Price', 'Discount']] = pd.DataFrame(product_data, index=data.index)
data[['Browsing History', 'Cart Abandonment Rate']] = pd.DataFrame(behavioral_data, index=data.index)
data[['Fraud Flag', 'High-Risk Indicator']] = pd.DataFrame(anomalies_data, index=data.index)
data[['Geolocation', 'Market Trend']] = pd.DataFrame(external_data, index=data_df.index)

# Display the first few rows of the dataset
print(type(data))
display(data.head())

<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,Transaction ID,Timestamp,Amount,Payment Method,Store Location,Customer ID,Age,Gender,Income Level,Purchase History,Product ID,Category,Price,Discount,Browsing History,Cart Abandonment Rate,Fraud Flag,High-Risk Indicator,Geolocation,Market Trend
0,81e3cbd5-1987-48fe-a557-c725bcfb490c,2025-01-01 21:19:25,280.03,Debit Card,Store C,3661fff4-ad69-4bc8-9611-45bf7d2844b1,54,Female,Medium,"[82, 9, 10, 72, 7]",e64e85cd-1206-4680-84fd-37bf047b2d72,Home,29.51,9.96,"[page2, page5]",2,0,1,"(35.74788, -95.36969, Muskogee, US, America/Ch...",Increasing
1,da6150ac-9861-4314-8976-f0e51f81f245,2025-01-04 08:24:16,456.83,Debit Card,Store B,5c698b1a-850b-4b6c-9555-a55962d24549,65,Female,Medium,"[58, 73, 21, 78, 12, 91]",1ae96db1-87c9-4ae7-9a76-32d116bb3c2e,Electronics,52.74,0.0,[page2],0,1,0,"(38.91817, -78.19444, Front Royal, US, America...",Decreasing
2,c9c87273-ab75-461e-a106-a623fc7afd87,2025-01-01 16:49:31,107.43,PayPal,Store B,b86a4b42-d8b3-4115-9095-53c3b27e2198,62,Female,Medium,"[18, 5, 12, 93, 71, 79, 53, 57]",a617ef7f-dd9b-4898-8a04-e803362096fb,Groceries,60.79,0.0,[page4],2,1,1,"(41.27621, -72.86843, East Haven, US, America/...",Stable
3,39878eff-b3dd-4914-bbc7-23fe2ef607ec,2025-01-05 09:49:14,118.59,Debit Card,Store C,d1f4583c-d5f9-4d64-95a8-88b6e5177e42,53,Male,High,"[32, 58, 1, 3, 76, 27, 34, 16, 55]",8a552913-7a30-42e4-806d-52ab97df7428,Toys,11.82,25.66,"[page3, page4]",3,1,1,"(44.99012, -123.02621, Keizer, US, America/Los...",Stable
4,22a33b68-f6c8-46ae-9ca2-2b12ddddbd64,2025-01-03 23:21:30,327.2,Debit Card,Store C,1bc14837-792f-4461-9476-d1cb6dd5978c,43,Male,High,"[84, 24, 19, 54, 48, 96]",89f40f91-9509-41c2-8c8f-8caec776b3e0,Electronics,8.27,16.3,[page3],4,1,0,"(46.32374, -120.00865, Sunnyside, US, America/...",Decreasing


### Move the DataFrame as a CSV file into a folder

In [52]:
# save the dataframe to a csv file

import os

csv_filename = 'fake_retail_data.csv'
destination_path = r'C:\Users\\' + r'ramani.natarajan' + r'\Documents\Experiments\Retail_FraudWasteAbuse\Data\\'

csv_filepath = os.path.join(destination_path, csv_filename)
data.to_csv(csv_filepath, index=False)

# print(destination_path + csv_filename)

  data.to_csv(csv_filepath, csv_filename, index=False)


TypeError: "delimiter" must be a 1-character string

# The data fields are defined as:

Transaction Data:

- Transaction ID: Unique identifier for each transaction.
- Timestamp: Date and time of the transaction.
- Amount: Total value of the transaction.
- Payment Method: Credit card, cash, etc.
- Store Location: Physical or online store identifier.
    
Customer Data:

- Customer ID: Unique identifier for each customer.
- Demographic Information: Age, gender, income level, etc.
- Purchase History: Previous transactions and patterns.
    
Product Data:

- Product ID: Unique identifier for each product.
- Category: Type of product (e.g., electronics, groceries).
- Price: Retail price of the product.
Discounts/Promotions: Any applied discounts.
                               
Behavioral Data:

- Browsing History: Pages viewed online before purchase.
- Cart Abandonment Rates: Instances where customers leave items in their cart without purchasing.
                               
Anomalies and Flags:

- Fraud Flags: Historical labels indicating whether a transaction was fraudulent.
- High-Risk Indicators: Metrics that denote high-risk transactions (e.g., unusually large purchases).
                               
External Data:

- Geolocation Data: Customer location at the time of purchase.
- Market Trends: Data on market conditions that might affect purchasing behavior