### Import User_defined_Notebook into Gold_Layer

In [0]:
%run "/Workspace/Users/rajesh_1718880886941@npmavericsystems.onmicrosoft.com/Capstone project/User_Defined_Functions"

In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

from pyspark.sql.window import Window
from datetime import datetime, timedelta
import pandas as pd
 

In [0]:
# Read the table "cleaned_transformed_cust" from the "silver_schema" database
gold_cust_df = spark.read.table("silver_schema.cleaned_transformed_cust")


In [0]:
# Read the table "cleaned_transformed_branch" from the "silver_schema" database
gold_branch_df = spark.read.table("silver_schema.cleaned_transformed_branch")

In [0]:
# Read the table "cleaned_transformed_trans" from the "silver_schema" database
gold_trans_df = spark.read.table("silver_schema.cleaned_transformed_trans")


## Aggregation

# Creating Segment Table

In [0]:
# Fixed current date
current_date = pd.Timestamp("2023-06-01")

# High_Value: Customers with high transaction volume
high_value_threshold = 10000
high_value_customers = gold_trans_df.groupBy("customer_id").agg(
    sum("amount_usd").alias("total_amount")
)
high_value_customers = (
    high_value_customers.filter(col("total_amount") > high_value_threshold)
    .select("customer_id")
    .withColumn("segment_name", lit("High_Value"))
    .withColumn("segment_description", lit("Customers with high transaction volume"))
    .withColumn("last_update", lit(current_date))
)

# New_User: Customers who joined in the last 30 days
new_user_customers = (
    gold_cust_df.filter(col("join_date") > current_date - timedelta(days=30))
    .select("customer_id")
    .withColumn("segment_name", lit("New_User"))
    .withColumn("segment_description", lit("Customers who joined in last 30 days"))
    .withColumn("last_update", lit(current_date))
)

# Inactive: Customers with no transactions in last 90 days
recent_transactions = (
    gold_trans_df.filter(
        col("transaction_datetime") > current_date - timedelta(days=90)
    )
    .select("customer_id")
    .distinct()
)
inactive_customers = (
    gold_cust_df.join(recent_transactions, on="customer_id", how="left_anti")
    .select("customer_id")
    .withColumn("segment_name", lit("Inactive"))
    .withColumn("segment_description", lit("No transactions in last 90 days"))
    .withColumn("last_update", lit(current_date))
)

# Credit_Risk: Customers with low credit scores (assuming credit_score < 600)
credit_risk_customers = (
    gold_cust_df.filter(col("credit_score") < 600)
    .select("customer_id")
    .withColumn("segment_name", lit("Credit_Risk"))
    .withColumn("segment_description", lit("Customers with low credit scores"))
    .withColumn("last_update", lit(current_date))
)

# Loyal: Customers with consistent activity for over 5 years
loyal_customers = (
    gold_cust_df.filter(col("join_date") < current_date - timedelta(days=5 * 365))
    .select("customer_id")
    .withColumn("segment_name", lit("Loyal"))
    .withColumn("segment_description", lit("Consistent activity for over 5 years"))
    .withColumn("last_update", lit(current_date))
)

# Combine all segments
customer_segmentation_df = (
    high_value_customers.union(new_user_customers)
    .union(inactive_customers)
    .union(credit_risk_customers)
    .union(loyal_customers)
)

# Add segment_id
window_spec = Window.orderBy("customer_id")
customer_segmentation_df = customer_segmentation_df.withColumn("segment_id",
    concat(lit("S"), lpad(row_number().over(window_spec).cast("string"), 4, "0"))
)
 
# Show the result
customer_segmentation_df = customer_segmentation_df.select(
    "segment_id", "customer_id", "segment_name", "segment_description", "last_update"
)

customer_segmentation_df.display()

segment_id,customer_id,segment_name,segment_description,last_update
S0001,C1000,High_Value,Customers with high transaction volume,2023-06-01T00:00:00Z
S0002,C1000,Loyal,Consistent activity for over 5 years,2023-06-01T00:00:00Z
S0003,C1001,High_Value,Customers with high transaction volume,2023-06-01T00:00:00Z
S0004,C1001,Loyal,Consistent activity for over 5 years,2023-06-01T00:00:00Z
S0005,C1002,High_Value,Customers with high transaction volume,2023-06-01T00:00:00Z
S0006,C1002,Loyal,Consistent activity for over 5 years,2023-06-01T00:00:00Z
S0007,C1003,High_Value,Customers with high transaction volume,2023-06-01T00:00:00Z
S0008,C1003,Loyal,Consistent activity for over 5 years,2023-06-01T00:00:00Z
S0009,C1004,High_Value,Customers with high transaction volume,2023-06-01T00:00:00Z
S0010,C1004,Loyal,Consistent activity for over 5 years,2023-06-01T00:00:00Z


In [0]:
# Analyzing the customer_segmentation DataFrame for statistics like :
# Number of rows, Column names, Distinct counts for each column,Data types, Null count, Duplicate count

customer_segmentation_basic_info=analyze_dataframe(customer_segmentation_df)

[1;30m
Number of rows:[0m 1304
[1;30mNumber of columns:[0m 5
[1;30m
Column names:[0m ['segment_id', 'customer_id', 'segment_name', 'segment_description', 'last_update']
[1;30m
Distinct counts for each column:[0m
segment_id: 1304
customer_id: 846
segment_name: 4
segment_description: 4
last_update: 1
[1;30m
Data types:[0m
segment_id: StringType()
customer_id: StringType()
segment_name: StringType()
segment_description: StringType()
last_update: TimestampType()
[1;30m
Null values count and % Null values:[0m
segment_id: 0 (0.00%)
customer_id: 0 (0.00%)
segment_name: 0 (0.00%)
segment_description: 0 (0.00%)
last_update: 0 (0.00%)
[1;30m
Duplicate Data Details:[0m
No duplicate rows found.


In [0]:
customer_segmentation_df.write.format("delta").mode("overwrite").saveAsTable("gold_schema.cust_segment")

In [0]:
# Group by "segment_name" and aggregate the count of each segment
segment_count_df = customer_segmentation_df.groupBy("segment_name").agg(count("segment_name").alias("count"))
 
segment_count_df.show()

segment_count_df.write.format("delta").mode("overwrite").saveAsTable("gold_schema.agg_segment_count")

+------------+-----+
|segment_name|count|
+------------+-----+
|  High_Value|  780|
|    Inactive|    4|
| Credit_Risk|  276|
|       Loyal|  244|
+------------+-----+



##Fraud Flag

In [0]:
# Function to detect unusual amounts
def detect_unusual_amount(df):
    return df.withColumn(
        "unusual_amount",
        when(col("amount_usd") > 50000, lit(0.75)).otherwise(lit(None))
    )

# Function to detect velocity of transactions
def detect_velocity_check(df):
    monthly_counts = df.groupBy("year", "month", "customer_id") \
        .agg(count("transaction_id").alias("monthly_count"))
    
    df_with_counts = df.join(monthly_counts, on=["month", "year", "customer_id"], how="left")
    
    return df_with_counts.withColumn(
        "velocity_check",
        when(col("monthly_count") > 7, lit(0.60)).otherwise(lit(None))
    )

# Function to detect watchlist matches
def detect_watchlist_match(df):
    return df.withColumn(
        "watchlist_match",
        when(col("amount_usd") > 30000, lit(0.90)).otherwise(lit(None))
    )

# Function to detect pattern anomalies
def detect_pattern_anomaly(df):
    return df.withColumn(
        "pattern_anomaly",
        when(col("amount_usd") > 100000, lit(0.85)).otherwise(lit(None))
    )

# Apply fraud detection functions to the DataFrame
transaction_df = detect_unusual_amount(gold_trans_df)
transaction_df = detect_velocity_check(transaction_df)
transaction_df = detect_watchlist_match(transaction_df)
transaction_df = detect_pattern_anomaly(transaction_df)

# Combine fraud flags into a single column and set the confidence score
fraud_flags_df = transaction_df.select(
    col("transaction_id"),
    col("customer_id"),
    col("month"),
    col("year"),
    when(col("unusual_amount").isNotNull(), lit("unusual_amount"))
    .when(col("velocity_check").isNotNull(), lit("velocity_check"))
    .when(col("watchlist_match").isNotNull(), lit("watchlist_match"))
    .when(col("pattern_anomaly").isNotNull(), lit("pattern_anomaly"))
    .alias("flag_type"),
    when(col("unusual_amount").isNotNull(), col("unusual_amount"))
    .when(col("velocity_check").isNotNull(), col("velocity_check"))
    .when(col("watchlist_match").isNotNull(), col("watchlist_match"))
    .when(col("pattern_anomaly").isNotNull(), col("pattern_anomaly"))
    .alias("confidence_score"),
    col("transaction_datetime")
).filter(col("flag_type").isNotNull())

# Add flag_id to the DataFrame
window_spec = Window.orderBy("transaction_datetime")

fraud_flags_df = fraud_flags_df.withColumn("flag_id",
    concat(lit("F"), lpad(row_number().over(window_spec).cast("string"), 4, "0"))
)


# Show the result
fraud_flags_df = fraud_flags_df.select(
    "flag_id", "customer_id", "transaction_id", "year", "month","flag_type","confidence_score","transaction_datetime")
    
# Display the final DataFrame with fraud flags
fraud_flags_df.display()


flag_id,customer_id,transaction_id,year,month,flag_type,confidence_score,transaction_datetime
F0001,C1102,T5004,2018,Jan,velocity_check,0.6,2018-01-01T05:14:00Z
F0002,C1108,T5010,2018,Jan,velocity_check,0.6,2018-01-01T12:12:00Z
F0003,C1040,T5012,2018,Jan,unusual_amount,0.75,2018-01-01T14:31:00Z
F0004,C1051,T5015,2018,Jan,velocity_check,0.6,2018-01-01T17:06:00Z
F0005,C1034,T5019,2018,Jan,velocity_check,0.6,2018-01-01T20:56:00Z
F0006,C1102,T5021,2018,Jan,unusual_amount,0.75,2018-01-01T23:07:00Z
F0007,C1174,T5022,2018,Jan,watchlist_match,0.9,2018-01-01T23:48:00Z
F0008,C1039,T5035,2018,Jan,watchlist_match,0.9,2018-01-02T12:28:00Z
F0009,C1075,T5042,2018,Jan,velocity_check,0.6,2018-01-02T20:04:00Z
F0010,C1075,T5051,2018,Jan,velocity_check,0.6,2018-01-03T06:25:00Z


In [0]:
fraud_flags_df.write.format("delta").mode("append").saveAsTable("gold_schema.fraud_flags")

In [0]:
# Perform the full join
merged_df = gold_cust_df.join(gold_trans_df, "customer_id", "outer").join(gold_branch_df, "branch_id", "outer")

#Saving Fraud Table
merged_df.write.format("delta").mode("append").saveAsTable("gold_schema.merged_table")

In [0]:
customer_segment_counts = customer_segmentation_df.groupBy("segment_name").count()
customer_segment_counts.display()

segment_name,count
High_Value,780
Inactive,4
Credit_Risk,276
Loyal,244


In [0]:
segment_details = customer_segmentation_df.groupBy("segment_name").agg(
    countDistinct("customer_id").alias("customer_count"),
    first("segment_description").alias("segment_description"),
    first("last_update").alias("last_update")
)


segment_details.display()

segment_name,customer_count,segment_description,last_update
Credit_Risk,276,Customers with low credit scores,2023-06-01T00:00:00Z
High_Value,780,Customers with high transaction volume,2023-06-01T00:00:00Z
Inactive,4,No transactions in last 90 days,2023-06-01T00:00:00Z
Loyal,244,Consistent activity for over 5 years,2023-06-01T00:00:00Z


In [0]:
last_update_by_segment = customer_segmentation_df.groupBy("segment_name").agg(
    max("last_update").alias("last_update")
)
last_update_by_segment.display()

segment_name,last_update
High_Value,2023-06-01T00:00:00Z
Inactive,2023-06-01T00:00:00Z
Credit_Risk,2023-06-01T00:00:00Z
Loyal,2023-06-01T00:00:00Z


In [0]:
segment_transaction_summary = (
    gold_trans_df
    .join(customer_segmentation_df, on="customer_id", how="inner")
    .groupBy("segment_name")
    .agg(
        avg("amount_usd").alias("average_transaction_amount"),
        sum("amount_usd").alias("total_transaction_amount")
    )
)
segment_transaction_summary.display()

segment_name,average_transaction_amount,total_transaction_amount
High_Value,3411.1760992229256,146179129.38000005
Inactive,4404.730227272728,387616.26000000007
Credit_Risk,3192.907850560756,44697517.00000002
Loyal,3031.5554491654707,65569512.809999965


In [0]:
segment_transaction_count = (
    gold_trans_df
    .join(customer_segmentation_df, on="customer_id", how="inner")
    .groupBy("segment_name")
    .agg(
        count("transaction_id").alias("transaction_count")
    )
)
segment_transaction_count.display()

segment_name,transaction_count
High_Value,42853
Inactive,88
Credit_Risk,13999
Loyal,21629


In [0]:
segment_total_transaction_amount = (
    gold_trans_df
    .join(customer_segmentation_df, on="customer_id", how="inner")
    .groupBy("segment_name")
    .agg(
        sum("amount_usd").alias("total_transaction_amount")
    )
)
segment_total_transaction_amount.display()

segment_name,total_transaction_amount
High_Value,146179129.38000005
Inactive,387616.26000000007
Credit_Risk,44697517.00000002
Loyal,65569512.809999965


In [0]:

# Corrected code without the incorrect retention rate calculation
retention_analysis = (
    gold_cust_df
    .join(customer_segmentation_df, on="customer_id", how="inner")
    .groupBy("segment_name")
    .agg(
        countDistinct("customer_id").alias("active_customer_count")
    )
)

retention_analysis.display()

segment_name,active_customer_count
High_Value,780
Loyal,244
Credit_Risk,276
Inactive,4


In [0]:
monthly_summary = (gold_trans_df
                   .groupBy("year","month","customer_id")
                   .agg(
                       count("transaction_id").alias("transaction_count"),
                       round(sum("amount_usd"),2).alias("total_transaction_amount")
                   )).orderBy("year","month", "customer_id")

monthly_summary.display()

year,month,customer_id,transaction_count,total_transaction_amount
2018,Apr,C1000,2,54.52
2018,Apr,C1002,4,205.86
2018,Apr,C1003,4,218.44
2018,Apr,C1004,4,121.5
2018,Apr,C1005,8,368.71
2018,Apr,C1006,6,222.67
2018,Apr,C1007,4,358.35
2018,Apr,C1008,3,55.37
2018,Apr,C1009,3,153.98
2018,Apr,C1010,2,123.22


In [0]:
customer_transaction_counts = (gold_trans_df
                               .groupBy("customer_id")
                               .agg(count("transaction_id").alias("transaction_count")))

# Show the result
customer_transaction_counts.display()

customer_id,transaction_count
C1571,34
C1100,87
C1602,38
C1524,39
C1804,23
C1842,12
C1875,9
C1305,62
C1628,41
C1774,29


In [0]:
# Aggregations by Branch with Amounts
branch_aggregations_df = gold_trans_df.groupBy("branch_id").agg(
    round(sum("Amount_USD"), 2).alias("total_amount"),
    round(avg("Amount_USD"), 2).alias("average_amount"),
    min("Amount_USD").alias("min_amount"),
    max("Amount_USD").alias("max_amount"),
    count("Amount_USD").alias("transaction_count")
)

branch_aggregations_df.display()

branch_id,total_amount,average_amount,min_amount,max_amount,transaction_count
B0014,6551369.85,2089.08,0.65,64817.88,3136
B0004,9845848.31,3110.85,1.08,106988.73,3165
B0010,11126891.83,3528.99,1.28,127709.76,3153
B0006,9829209.14,3197.53,1.08,107443.91,3074
B0012,11493339.92,3557.21,1.08,107061.38,3231
B0013,8583319.53,2719.68,1.0,99092.28,3156
B0001,8589440.57,2618.73,1.0,99922.3,3280
B0008,12184930.22,3939.52,1.28,127516.86,3093
B0007,8920035.27,2826.37,1.08,107575.03,3156
B0003,10147381.45,3135.78,1.0,99961.89,3236


In [0]:
customer_aggregations_df = gold_trans_df.groupBy("customer_id").agg(
    round(sum("Amount_USD"), 2).alias("total_amount"),
    round(avg("Amount_USD"), 2).alias("average_amount"),
    min("Amount_USD").alias("min_amount"),
    max("Amount_USD").alias("max_amount"),
    count("Amount_USD").alias("transaction_count")
)

customer_aggregations_df.display()

customer_id,total_amount,average_amount,min_amount,max_amount,transaction_count
C1571,190560.74,5604.73,2.03,100679.34,34
C1100,128756.78,1479.96,1.08,76238.05,87
C1602,177119.87,4661.05,1.0,73983.77,38
C1524,117369.54,3009.48,1.08,102060.16,39
C1804,997.34,43.36,1.66,225.87,23
C1842,493.21,41.1,1.0,104.36,12
C1875,621.63,69.07,4.69,140.19,9
C1305,312724.29,5043.94,1.47,94885.35,62
C1628,79424.06,1937.17,1.0,77704.38,41
C1774,34000.47,1172.43,1.28,32872.43,29


In [0]:
transaction_with_location_df = gold_trans_df.join(
    gold_branch_df.select("branch_id", "location", "timezone"),
    on="branch_id",
    how="left"
)


# Aggregations by Location with Rounded Amounts
location_aggregations_df = transaction_with_location_df.groupBy("location").agg(
    round(sum("Amount_USD"), 2).alias("total_amount"),
    round(avg("Amount_USD"), 2).alias("average_amount"),
    min("Amount_USD").alias("min_amount"),
    max("Amount_USD").alias("max_amount"),
    count("Amount_USD").alias("transaction_count")
)

location_aggregations_df.display()

location,total_amount,average_amount,min_amount,max_amount,transaction_count
Phoenix,8920035.27,2826.37,1.08,107575.03,3156
Madrid,9845848.31,3110.85,1.08,106988.73,3165
Dallas,10147381.45,3135.78,1.0,99961.89,3236
Philadelphia,11267000.04,3501.24,1.28,127855.51,3218
Los Angeles,9829209.14,3197.53,1.08,107443.91,3074
San Diego,6551369.85,2089.08,0.65,64817.88,3136
London,8583319.53,2719.68,1.0,99092.28,3156
Brisbane,9285745.62,3013.87,1.08,107255.61,3081
Perth,8589440.57,2618.73,1.0,99922.3,3280
Paris,11493339.92,3557.21,1.08,107061.38,3231


In [0]:
timezone_aggregations_df = transaction_with_location_df.groupBy("timezone").agg(
    round(sum("Amount_USD"), 2).alias("total_amount"),
    round(avg("Amount_USD"), 2).alias("average_amount"),
    min("Amount_USD").alias("min_amount"),
    max("Amount_USD").alias("max_amount"),
    count("Amount_USD").alias("transaction_count")
)

timezone_aggregations_df.display()

timezone,total_amount,average_amount,min_amount,max_amount,transaction_count
AWST,8589440.57,2618.73,1.0,99922.3,3280
CST,27230607.17,2836.52,1.0,99961.89,9600
CET,32829137.12,3426.13,1.08,127779.79,9582
MST,8920035.27,2826.37,1.08,107575.03,3156
PST,27507470.82,2937.89,0.65,127709.76,9363
EST,11267000.04,3501.24,1.28,127855.51,3218
GMT,8583319.53,2719.68,1.0,99092.28,3156
AEST,21470675.84,3477.6,1.08,127516.86,6174


In [0]:
# Aggregations by Currency with Amounts
currency_aggregations_df = gold_trans_df.groupBy("currency").agg(
    round(sum("Amount_USD"), 2).alias("total_amount"),
    round(avg("Amount_USD"), 2).alias("average_amount"),
    min("Amount_USD").alias("min_amount"),
    max("Amount_USD").alias("max_amount"),
    count("Amount_USD").alias("transaction_count")
)

currency_aggregations_df.display()


currency,total_amount,average_amount,min_amount,max_amount,transaction_count
GBP,46068770.98,3641.8,1.28,127855.51,12650
EUR,49374178.26,3143.45,1.08,107575.03,15707
AUD,6551369.85,2089.08,0.65,64817.88,3136
USD,44403367.27,2768.98,1.0,99961.89,16036


## Transaction table aggregations

In [0]:
channel_aggregations_df = gold_trans_df.groupBy("channel").agg(
    round(sum("Amount_USD"), 2).alias("total_amount"),
    round(avg("Amount_USD"), 2).alias("average_amount"),
    min("Amount_USD").alias("min_amount"),
    max("Amount_USD").alias("max_amount"),
    count("Amount_USD").alias("transaction_count")
)
display(channel_aggregations_df)

channel,total_amount,average_amount,min_amount,max_amount,transaction_count
MOBILE,45119157.35,3058.1,0.65,127855.51,14754
ATM,46298215.52,3102.89,0.65,126302.98,14921
BRANCH,8757269.39,2990.87,0.65,126878.64,2928
WEB,46223044.1,3096.81,0.65,127728.54,14926


In [0]:
# Aggregations by Transaction Type with Rounded Amounts
transaction_type_aggregations_df = gold_trans_df.groupBy("transaction_type").agg(
    round(sum("Amount_USD"), 2).alias("total_amount"),
    round(avg("Amount_USD"), 2).alias("average_amount"),
    min("Amount_USD").alias("min_amount"),
    max("Amount_USD").alias("max_amount"),
    count("Amount_USD").alias("transaction_count")
)

transaction_type_aggregations_df.display()


transaction_type,total_amount,average_amount,min_amount,max_amount,transaction_count
TRANSFER,19541925.29,3231.14,0.65,127855.51,6048
DEPOSIT,34591608.31,2927.77,0.65,127779.79,11815
WITHDRAWAL,54079917.25,3015.5,0.65,126944.5,17934
PAYMENT,38184235.51,3254.71,0.65,126834.37,11732


In [0]:
# Aggregations by Year and Month with Amounts
year_month_aggregations_df = gold_trans_df.groupBy("year", "month").agg(
    round(sum("Amount_USD"), 2).alias("total_amount"),
    round(avg("Amount_USD"), 2).alias("average_amount"),
    min("Amount_USD").alias("min_amount"),
    max("Amount_USD").alias("max_amount"),
    count("Amount_USD").alias("transaction_count")
)

year_month_aggregations_df.display()


year,month,total_amount,average_amount,min_amount,max_amount,transaction_count
2020,Mar,1821337.74,2718.41,1.0,117937.33,670
2022,Jun,2293374.57,3433.2,1.0,117127.51,668
2023,Feb,1713886.43,2755.44,0.65,118887.65,622
2018,Nov,2548461.72,3747.74,0.65,122115.8,680
2019,Jan,2196071.33,3137.24,1.0,118988.24,700
2021,Apr,2097758.42,3107.79,0.65,127200.03,675
2023,Aug,1829579.61,2624.93,0.65,97873.98,697
2021,Jan,2762511.85,3952.09,0.65,117623.13,699
2019,Jul,2845829.79,4136.38,0.68,122347.81,688
2019,Sep,1845849.77,2746.8,0.65,127709.76,672


In [0]:
# Aggregations by Status with Rounded Amounts
status_aggregations_df = gold_trans_df.groupBy("status").agg(
    round(sum("Amount_USD"), 2).alias("total_amount"),
    round(avg("Amount_USD"), 2).alias("average_amount"),
    min("Amount_USD").alias("min_amount"),
    max("Amount_USD").alias("max_amount"),
    count("Amount_USD").alias("transaction_count")
)

status_aggregations_df.display()


status,total_amount,average_amount,min_amount,max_amount,transaction_count
completed,123686232.63,3064.2,0.65,127779.79,40365
denied,7169038.13,2938.13,0.65,124292.01,2440
pending,15542415.6,3290.1,0.65,127855.51,4724


In [0]:
# Aggregations by Channel and Transaction Type with Rounded Amounts
channel_type_aggregations_df = gold_trans_df.groupBy("channel", "transaction_type").agg(
    round(sum("Amount_USD"), 2).alias("total_amount"),
    round(avg("Amount_USD"), 2).alias("average_amount"),
    min("Amount_USD").alias("min_amount"),
    max("Amount_USD").alias("max_amount"),
    count("Amount_USD").alias("transaction_count")
)

channel_type_aggregations_df.display()


channel,transaction_type,total_amount,average_amount,min_amount,max_amount,transaction_count
BRANCH,TRANSFER,2887416.76,3084.85,0.65,126878.64,936
MOBILE,PAYMENT,11922261.68,3175.04,0.65,126834.37,3755
WEB,TRANSFER,5476856.37,3165.81,0.65,127728.54,1730
ATM,PAYMENT,12787426.75,3409.98,0.65,123043.98,3750
MOBILE,WITHDRAWAL,17091138.53,3037.88,0.65,125260.02,5626
WEB,PAYMENT,11855739.54,3195.62,0.65,119766.75,3710
MOBILE,DEPOSIT,10553652.57,2826.37,0.65,127779.79,3734
WEB,WITHDRAWAL,16844306.67,2931.48,0.65,126944.5,5746
WEB,DEPOSIT,12046141.52,3220.89,0.65,127709.76,3740
BRANCH,PAYMENT,1618807.54,3131.16,1.0,105048.15,517


In [0]:
# Define bins for transaction amount ranges
bins = [0, 50, 100, 500, 1000, 5000, 10000, float('inf')]
labels = ['0-50', '50-100', '100-500', '500-1000', '1000-5000', '5000-10000', '10000+']

# Create a new column for amount range
from pyspark.sql.functions import expr

transaction_frequency_df = gold_trans_df.withColumn(
    "amount_range",
    expr(f"CASE "
         f"WHEN Amount_USD <= {bins[1]} THEN '{labels[0]}' "
         f"WHEN Amount_USD <= {bins[2]} THEN '{labels[1]}' "
         f"WHEN Amount_USD <= {bins[3]} THEN '{labels[2]}' "
         f"WHEN Amount_USD <= {bins[4]} THEN '{labels[3]}' "
         f"WHEN Amount_USD <= {bins[5]} THEN '{labels[4]}' "
         f"WHEN Amount_USD <= {bins[6]} THEN '{labels[5]}' "
         f"ELSE '{labels[6]}' "  # This line replaces the check against float('inf')
         f"END AS amount_range")
).groupBy("amount_range").count().alias("transaction_count") 

transaction_frequency_df.display()


amount_range,count
5000-10000,5
0-50,28161
50-100,13147
100-500,3836
10000+,2380


In [0]:
# Calculate total amount USD
total_amount = gold_trans_df.agg(sum("Amount_USD")).collect()[0][0]

# Calculate percentage of total amount by channel
percentage_df = gold_trans_df.groupBy("channel").agg(
    round(sum("Amount_USD"), 2).alias("total_amount_usd")
).withColumn("percentage_of_total", round((col("total_amount_usd") / total_amount) * 100, 2))

percentage_df.show()


+-------+----------------+-------------------+
|channel|total_amount_usd|percentage_of_total|
+-------+----------------+-------------------+
| MOBILE|   4.511915735E7|              30.82|
|    ATM|   4.629821552E7|              31.62|
| BRANCH|      8757269.39|               5.98|
|    WEB|    4.62230441E7|              31.57|
+-------+----------------+-------------------+



In [0]:
currency_analysis = gold_trans_df.groupBy("currency") \
        .agg(
            sum("Amount_USD").alias("total_amount_by_currency"),
            avg("Amount_USD").alias("avg_amount_by_currency")
        )

currency_analysis.display()


currency,total_amount_by_currency,avg_amount_by_currency
GBP,46068770.97999996,3641.800077470352
EUR,49374178.259999976,3143.4505799961785
AUD,6551369.849999993,2089.0847735969364
USD,44403367.270000026,2768.980248815168
