In [1]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta

USERS = ['Alice', 'Bob', 'Charlie', 'David', 'Eve']
TYPES = ['deposit', 'withdrawal', 'transfer']
START_DATE = datetime(2026, 1, 1)

# Number of synthetic transactions to generate
TOTAL_ROWS = 1000

# Fixed seed so that Restart & Run All reproduces the same synthetic data
SEED = 42
random.seed(SEED)

raw_data = []

for i in range(TOTAL_ROWS):
    user = random.choice(USERS)
    # Named txn_type (not `type`) so the builtin type() is not shadowed
    txn_type = random.choice(TYPES)
    amount = round(random.uniform(10, 1000), 2)

    # Random timestamp in [START_DATE, START_DATE + 41 days).
    # randint is inclusive on both ends, so seconds stops at 86399;
    # 86400 would roll over into the following day.
    days = random.randint(0, 40)
    seconds = random.randint(0, 86399)
    timestamp = START_DATE + timedelta(days=days, seconds=seconds)

    # Injecting messy data. The checks run in order and overlap:
    # every multiple of 100 is also a multiple of 50, so each outlier row
    # also has a missing user, and on multiples of 300 the outlier value
    # replaces the negated amount.
    if i % 50 == 0:
        user = np.nan # missing user
    if i % 75 == 0:
        amount = amount * -1 # negative amount
    if i % 100 == 0:
        amount = 999999.99 # outlier amount

    # One record per transaction, in the column order used for the DataFrame
    raw_data.append([timestamp, user, txn_type, amount])

print(f"Generated {len(raw_data)} rows of data")

# Converting to DataFrame
df = pd.DataFrame(raw_data, columns=['timestamp', 'user', 'type', 'amount'])
df.head()


Generated 1000 rows of data


Unnamed: 0,timestamp,user,type,amount
0,2026-01-10 22:23:04,,withdrawal,999999.99
1,2026-01-19 09:09:08,Eve,withdrawal,892.61
2,2026-01-28 13:17:09,Alice,withdrawal,611.57
3,2026-02-08 06:19:06,David,withdrawal,961.28
4,2026-01-12 09:16:39,Bob,withdrawal,99.2


In [2]:
# Inspect column dtypes and non-null counts of the freshly generated frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   timestamp  1000 non-null   datetime64[ns]
 1   user       980 non-null    object        
 2   type       1000 non-null   object        
 3   amount     1000 non-null   float64       
dtypes: datetime64[ns](1), float64(1), object(2)
memory usage: 31.4+ KB


In [3]:
# Summary statistics of the raw data, before any cleaning is applied
df.describe()

Unnamed: 0,timestamp,amount
count,1000,1000.0
mean,2026-01-21 07:27:15.819000064,10491.89051
min,2026-01-01 00:20:47,-989.92
25%,2026-01-11 02:52:27.750000128,238.0275
50%,2026-01-20 14:40:55,502.985
75%,2026-01-31 13:09:22.500000,767.465
max,2026-02-10 23:33:56,999999.99
std,,99499.546816


In [4]:
# Removing rows with missing user.
# NOTE: the generator puts every 999999.99 outlier on a row that also has a
# missing user (multiples of 100 are multiples of 50), so this drop is also
# what removes the outliers; there is no separate outlier filter.
# Rebinding instead of inplace=True; .copy() yields an independent frame so
# later column assignments are not treated as writes to a slice.
df = df.dropna(subset=['user']).copy()
assert df['user'].notna().all(), "rows with a missing user remain"

In [5]:
# Re-check dtypes and non-null counts after dropping rows with a missing user
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 980 entries, 1 to 999
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   timestamp  980 non-null    datetime64[ns]
 1   user       980 non-null    object        
 2   type       980 non-null    object        
 3   amount     980 non-null    float64       
dtypes: datetime64[ns](1), float64(1), object(2)
memory usage: 38.3+ KB


In [6]:
# Negative amounts are treated as sign errors: keep the magnitude, drop the sign
df = df.assign(amount=df['amount'].abs())

In [7]:
# Summary statistics after cleaning, for comparison with the raw describe()
df.describe()

Unnamed: 0,timestamp,amount
count,980,980.0
mean,2026-01-21 07:30:39.610204416,510.104439
min,2026-01-01 00:20:47,11.15
25%,2026-01-11 02:48:40.500000,250.085
50%,2026-01-20 17:03:18.500000,504.675
75%,2026-01-31 13:09:22.500000,766.045
max,2026-02-10 23:33:56,998.49
std,,290.474034


In [8]:
# Total transaction amount for each transaction type
type_summary = df['amount'].groupby(df['type']).sum()
print(type_summary)

type
deposit       175653.99
transfer      167836.42
withdrawal    156411.94
Name: amount, dtype: float64


In [9]:
# Total transaction amount per user, ranked from highest to lowest
user_summary = (
    df['amount']
    .groupby(df['user'])
    .sum()
    .sort_values(ascending=False)
)
print(user_summary)

user
Charlie    114948.99
Bob        102504.42
David      100188.48
Eve         92941.95
Alice       89318.51
Name: amount, dtype: float64


In [10]:
# Sample variance of the cleaned amounts (ddof=1 is the pandas default)
print(df['amount'].var(ddof=1))

84375.1645828307


In [11]:
# Derive calendar features from the transaction timestamp:
#   day_of_week - 0 = Monday ... 6 = Sunday
#   hour_of_day - 0-23
#   is_weekend  - 1 for Saturday/Sunday (day_of_week >= 5), else 0
df = df.assign(
    day_of_week=df['timestamp'].dt.dayofweek,
    hour_of_day=df['timestamp'].dt.hour,
    # Vectorised comparison instead of a per-row Python lambda; the callable
    # receives the frame with day_of_week already added. int64 matches the
    # dtype the lambda-based map produced.
    is_weekend=lambda frame: frame['day_of_week'].ge(5).astype('int64'),
)
df.head()

Unnamed: 0,timestamp,user,type,amount,day_of_week,hour_of_day,is_weekend
1,2026-01-19 09:09:08,Eve,withdrawal,892.61,0,9,0
2,2026-01-28 13:17:09,Alice,withdrawal,611.57,2,13,0
3,2026-02-08 06:19:06,David,withdrawal,961.28,6,6,1
4,2026-01-12 09:16:39,Bob,withdrawal,99.2,0,9,0
5,2026-01-10 17:19:15,Eve,transfer,974.38,5,17,1


In [12]:
# Total transaction amount per day of week, largest first
dow_summary = df.groupby('day_of_week')['amount'].sum().sort_values(ascending=False)

# Lookup from weekday number to weekday name (0 = Monday ... 6 = Sunday),
# used to make the summary index readable
DAY_NAMES = {0:'Monday', 1:'Tuesday', 2:'Wednesday', 3:'Thursday', 4:'Friday', 5:'Saturday', 6:'Sunday'}
dow_summary.index = dow_summary.index.map(DAY_NAMES)
print(dow_summary)

day_of_week
Monday       74901.78
Sunday       74854.34
Tuesday      74849.05
Thursday     74198.82
Friday       72692.82
Saturday     68535.76
Wednesday    59869.78
Name: amount, dtype: float64
