In [3]:
import os
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from pyspark.sql import SparkSession
import plotly.figure_factory as ff
from plotly.subplots import make_subplots
from pyspark.sql.functions import regexp_extract, col

In [4]:
# Location of the MySQL Connector/J jar handed to Spark through `spark.jars`.
# The default is the original Homebrew install path; set SPARK_MYSQL_JAR to run
# the notebook on a machine where the jar lives somewhere else.
mysql_connector_jar = os.environ.get(
    "SPARK_MYSQL_JAR",
    "/opt/homebrew/Cellar/apache-spark/3.3.1/libexec/jars/mysql-connector-j-8.0.32.jar",
)

# Build (or reuse) the Spark session that every query in this notebook runs on.
spark = (
    SparkSession.builder
    .appName("capstone_analysis")
    .config("spark.jars", mysql_connector_jar)
    .getOrCreate()
)

23/02/27 15:47:07 WARN Utils: Your hostname, Sules-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 192.168.0.17 instead (on interface en0)
23/02/27 15:47:07 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
23/02/27 15:47:08 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [5]:
# The MySQL root password is read from the environment so it never appears in
# the notebook itself.
mysql_pwd = os.environ.get("mysql_root_p")
if mysql_pwd is None:
    # Stop here with a clear message: otherwise None is passed on as the
    # "password" entry of the JDBC connection properties in the next cell.
    raise RuntimeError(
        "Environment variable 'mysql_root_p' is not set; "
        "export it before starting the notebook kernel."
    )

In [6]:
# Read the four source tables from MySQL over JDBC and expose each one as a
# Spark SQL temporary view.

JDBC_URL = "jdbc:mysql://localhost:3306/creditcard_capstone"
JDBC_PROPERTIES = {
    "user": "root",
    "password": mysql_pwd,
    "driver": "com.mysql.cj.jdbc.Driver",
}


def _read_mysql_table(table_name):
    """Load one table of the creditcard_capstone database as a Spark DataFrame."""
    return spark.read.jdbc(JDBC_URL, table_name, properties=JDBC_PROPERTIES)


# createOrReplaceTempView (rather than createTempView) keeps this cell
# re-runnable: createTempView raises if the view name is already registered.
credit = _read_mysql_table("CDW_SAPP_CREDIT_CARD")
credit.createOrReplaceTempView("credit_tb")

branch = _read_mysql_table("CDW_SAPP_BRANCH")
branch.createOrReplaceTempView("branch_tb")

customer = _read_mysql_table("CDW_SAPP_CUSTOMER")
customer.createOrReplaceTempView("customer_tb")

loan = _read_mysql_table("CDW_SAPP_loan_application")
loan.createOrReplaceTempView("loan_tb")


### Req. 3 Data Analysis and Visualization

1. Find and plot which transaction type has a high rate of transactions.

2. Find and plot which state has a high number of customers.

3. Find and plot the sum of all transactions for each customer, and which customer has the highest transaction amount.

1. Find and plot which transaction type has a high rate of transactions.

All transaction types occur at a similar rate; "Bills" is slightly higher than the others.

In [7]:
# Number of transactions per transaction type.
# The display names are assigned with SQL aliases, so the result does not
# depend on the column name Spark auto-generates for count(*).
pdf1 = spark.sql(
    "SELECT TRANSACTION_TYPE AS `Transaction Type`, COUNT(*) AS `Rate` "
    "FROM credit_tb GROUP BY TRANSACTION_TYPE"
).toPandas()

                                                                                

In [8]:
# Two panels side by side: a bar chart of the counts on the left and the same
# numbers as a table on the right.
fig = make_subplots(rows=1, cols=2, specs=[[{"type": "bar"}, {"type": "table"}]])

rate_bars = go.Bar(x=pdf1["Transaction Type"], y=pdf1["Rate"])
fig.add_trace(rate_bars, row=1, col=1)

rate_table = go.Table(
    header=dict(values=pdf1.columns, font=dict(size=14), align="left"),
    cells=dict(
        values=[pdf1[column_name].tolist() for column_name in pdf1.columns],
        align="left",
    ),
)
fig.add_trace(rate_table, row=1, col=2)

fig.update_layout(
    title_text="Frequency of Credit Transactions",
    xaxis_title="Transaction Type",
    yaxis_title="Total Transactions",
    # The y-axis is zoomed to this window instead of starting at zero.
    yaxis_range=[6000, 7000],
    showlegend=False,
    width=800,
    height=400,
)
fig.show()

2. Find and plot which state has a high number of customers.

* New York State has the highest number of customers

In [9]:
# Customers per state, largest first; LIMIT 1 keeps only the leading state.
top_state_query = "Select CUST_STATE AS State, count(*) AS `Number of Customers` from customer_tb group by CUST_STATE ORDER BY `Number of customers` DESC limit 1"
state = spark.sql(top_state_query).collect()
top_state_row = state[0]
print(f"State with the highest number of customers is {top_state_row.State}.")

State with the highest number of customers is NY.


In [10]:
# Customers per state for every state, ordered from most to fewest customers.
customers_per_state_query = "Select CUST_STATE AS State, count(*) AS `Number of Customers` from customer_tb group by CUST_STATE ORDER BY `Number of customers` DESC"
pdf2 = spark.sql(customers_per_state_query).toPandas()

In [11]:
# One bar per state, in the order delivered by the query (descending count).
fig = px.bar(
    data_frame=pdf2,
    x="State",
    y="Number of Customers",
    title='Distribution of Number of Customers per State',
)
fig.show()

3. Find and plot the sum of all transactions for each customer, and which customer has the highest transaction amount.


In [12]:
# Total spending per customer (credit transactions joined to customers on
# SSN), sorted descending; LIMIT 1 leaves only the top spender.
top_spender_query = "Select  credit_tb.CUST_SSN, customer_tb.FIRST_NAME, customer_tb.LAST_NAME, ROUND(SUM(credit_tb.TRANSACTION_VALUE),2) as Spending from credit_tb \
    JOIN customer_tb ON credit_tb.CUST_SSN = customer_tb.SSN group by credit_tb.CUST_SSN,customer_tb.FIRST_NAME, customer_tb.LAST_NAME ORDER BY Spending DESC LIMIT 1;"
highest_tran_value = spark.sql(top_spender_query).collect()
top_spender = highest_tran_value[0]
print(f"Customer with highest transaction amount is {top_spender.FIRST_NAME} {top_spender.LAST_NAME}.")

Customer with highest transaction amount is Ty Daly.


In [13]:
# Same per-customer spending aggregation as above, keeping the ten largest
# totals (unrounded) for plotting.
top_ten_spenders_query = "Select  credit_tb.CUST_SSN, customer_tb.FIRST_NAME, customer_tb.LAST_NAME, SUM(credit_tb.TRANSACTION_VALUE) as Spending from credit_tb \
    JOIN customer_tb ON credit_tb.CUST_SSN = customer_tb.SSN group by credit_tb.CUST_SSN,customer_tb.FIRST_NAME, customer_tb.LAST_NAME ORDER BY Spending DESC LIMIT 10;"
pdf3 = spark.sql(top_ten_spenders_query).toPandas()

In [14]:
# Bar chart of the ten highest-spending customers, with the top spender
# called out by an arrow annotation.
fig = px.bar(
    pdf3,
    x="FIRST_NAME",
    y="Spending",
    title='Customer Spending (USD) - Top 10',
    labels={"FIRST_NAME": "First Name"},
)
fig.update_traces(width=0.5)

# pdf3 is ordered by Spending descending, so its first row is the top spender.
# Taking the annotation text and position from that row keeps the label
# correct when the underlying data changes.
if not pdf3.empty:
    top_customer = pdf3.iloc[0]
    fig.add_annotation(
        x=top_customer["FIRST_NAME"],
        y=float(top_customer["Spending"]),
        text=f"{top_customer['FIRST_NAME']} {top_customer['LAST_NAME']}",
        showarrow=True,
        arrowhead=1,
    )

fig.update_layout(
    # The y-axis is zoomed to this window instead of starting at zero.
    yaxis_range=[5000, 5800],
    autosize=False,
    width=600,
    height=500,
    barmode='group',
    xaxis_tickangle=-45,
)
fig.show()



### Req. 5 Data Analysis and Visualization for Loan Application
1.   Find and plot the percentage of applications approved for self-employed applicants.

2.    Find the percentage of rejection for married male applicants.

3.   Find and plot the top three months with the largest transaction data.

4.    Find and plot which branch processed the highest total dollar value of healthcare transactions.

1.   Find and plot the percentage of applications approved for self-employed applicants.

In [15]:
# Count loan applications for every (Self_Employed, Application_Status) pair
# and bring the small result to pandas.
status_by_employment = (
    loan.select("Self_Employed", "Application_Status")
    .groupBy("Self_Employed", "Application_Status")
    .count()
)
dfl1 = status_by_employment.toPandas()
dfl1

Unnamed: 0,Self_Employed,Application_Status,count
0,Yes,Y,46
1,No,Y,301
2,Yes,N,24
3,No,N,140


In [16]:
def _status_percentages(counts, self_employed):
    """Return the rows of `counts` for one Self_Employed value with a Percentage column.

    Percentage is each row's share (0-100, rounded to one decimal) of the total
    `count` within that Self_Employed group. `.assign` builds a new frame, so
    nothing is written onto a filtered slice of `counts`.
    """
    subset = counts.loc[counts["Self_Employed"] == self_employed]
    with_share = subset.assign(
        Percentage=lambda frame: 100 * frame["count"] / frame["count"].sum()
    )
    return with_share.round({"Percentage": 1})


df_se = _status_percentages(dfl1, "Yes")
df_notse = _status_percentages(dfl1, "No")
# Self-employed rows first, then the rest; original index labels are preserved.
df_perc = pd.concat([df_se, df_notse], axis=0)

In [17]:
# Display the combined table: counts plus within-group percentages for
# self-employed and non-self-employed applicants.
df_perc

Unnamed: 0,Self_Employed,Application_Status,count,Percentage
0,Yes,Y,46,65.7
2,Yes,N,24,34.3
1,No,Y,301,68.3
3,No,N,140,31.7


In [26]:
# Pick the "self-employed and approved" row by its values rather than by index
# label: the labels come from the row order of a Spark groupBy, which is not
# guaranteed to be the same on every run.
is_self_employed_approved = (df_perc["Self_Employed"] == "Yes") & (
    df_perc["Application_Status"] == "Y"
)
self_employed_approved_pct = df_perc.loc[is_self_employed_approved, "Percentage"].iloc[0]
print(f"The percentage of applications approved for self-employed applicants is {self_employed_approved_pct}")

The percentage of applications approved for self-employed applicants is 65.7


In [27]:
# Stacked bars: share of approved (Y) and rejected (N) applications for
# self-employed and non-self-employed applicants.
fig = px.bar(
    df_perc,
    x="Self_Employed",
    y="Percentage",
    color="Application_Status",
    # Spelling of the title and axis label corrected (they are shown on the chart).
    title="Loan Approval Percentages for Self-Employed",
    labels={
        "Self_Employed": "Self-Employment Status",
        "Percentage": "Percentage of Application Status",
    },
)
fig.update_traces(width=0.5)
fig.update_layout(
    yaxis_range=[0, 120],
    autosize=False,
    width=600,
    height=500,
)
fig.show()


In [20]:

# Stacked bars: number of approved (Y) and rejected (N) applications for
# self-employed and non-self-employed applicants.
fig = px.bar(
    df_perc,
    x="Self_Employed",
    y="count",
    color="Application_Status",
    title="Loan Approvals for Self-Employed",
    # Spelling of the axis label corrected (it is shown on the chart).
    labels={
        "Self_Employed": "Self-Employment Status",
        "count": "Count of loan applications",
    },
)
fig.update_traces(width=0.5)
fig.update_layout(
    yaxis_range=[0, 500],
    autosize=False,
    width=600,
    height=500,
)
fig.show()

2.    Find the percentage of rejection for married male applicants.

In [21]:
# Fraction of married male applicants whose application was rejected.
# A single aggregation over the married-male rows yields both status counts.
married_male = loan.select("Married", "Gender", "Application_Status").filter(
    (loan["Married"] == 'Yes') & (loan["Gender"] == 'Male')
)
status_rows = (
    married_male
    .groupBy("Married", "Gender", "Application_Status")
    .count()
    .collect()
)

# Keep `approved` / `rejected` as lists of Row objects, as before.
approved = [row for row in status_rows if row["Application_Status"] == 'Y']
rejected = [row for row in status_rows if row["Application_Status"] == 'N']

# A status with no rows counts as zero instead of raising IndexError.
approved_count = approved[0]['count'] if approved else 0
rejected_count = rejected[0]['count'] if rejected else 0
decided_count = approved_count + rejected_count

# Cell output: rejected / (approved + rejected), or NaN when there are no rows.
rejected_count / decided_count if decided_count else float("nan")

0.28431372549019607

In [22]:
# Application counts for every (Married, Gender, Application_Status)
# combination, with display-friendly column names, converted to pandas.
status_by_group = (
    loan.select("Married", "Gender", "Application_Status")
    .groupBy("Married", "Gender", "Application_Status")
    .count()
    .withColumnRenamed("Application_Status", "Application Status")
    .withColumnRenamed("count", "Count")
)
df_p = status_by_group.toPandas()
df_p

Unnamed: 0,Married,Gender,Application Status,Count
0,No,Female,Y,40
1,Yes,Male,Y,219
2,Yes,Male,N,87
3,Yes,Female,Y,17
4,No,Female,N,26
5,No,Male,N,43
6,Yes,Female,N,8
7,No,Male,Y,71


In [28]:
# Rejection percentage for married male applicants, computed from df_p so the
# subtitle always matches the plotted data.
married_male = df_p[(df_p["Married"] == "Yes") & (df_p["Gender"] == "Male")]
married_male_total = married_male["Count"].sum()
married_male_rejected = married_male.loc[
    married_male["Application Status"] == "N", "Count"
].sum()
if married_male_total:
    rejection_note = f"{100 * married_male_rejected / married_male_total:.0f}%"
else:
    rejection_note = "n/a"

# Stacked bars per gender, one facet per marital status. The previous `labels`
# mapping named columns that are not in df_p and had no effect, so it is gone.
fig = px.bar(
    df_p,
    x="Gender",
    y="Count",
    color="Application Status",
    facet_col="Married",
    title=(
        "Loan Approvals based on Gender and Marital Status <br>"
        f"<sup>Percentage of rejection for married male applicants: {rejection_note}</sup>"
    ),
)
fig.update_traces(width=0.5)
fig.update_layout(
    yaxis_range=[0, 350],
    autosize=False,
    width=700,
    height=500,
)
fig.show()

3. Find and plot the top three months with the largest transaction data.

In [148]:
# Add a MONTH column holding capture group 2 of the pattern: the two digits
# that follow the first four digits of TIMEID.
# NOTE(review): presumably TIMEID is formatted YYYYMMDD - confirm against the table.
credit_month = credit.withColumn(
    'MONTH',
    regexp_extract(col('TIMEID'), r"(\d{4})(\d{2})(\d{2})", 2),
)
# createOrReplaceTempView keeps the cell re-runnable; createTempView raises if
# the view name is already registered.
credit_month.createOrReplaceTempView("credit_month_tb")


In [161]:
# Top three months ranked two ways: by number of transactions and by total
# transaction value.
top_months_by_count_query = "SELECT MONTH as Month, COUNT(TRANSACTION_ID) AS `Number of Transactions` FROM credit_month_tb GROUP BY MONTH ORDER BY `Number of Transactions` DESC Limit 3"
top_months_by_value_query = "SELECT MONTH as Month, ROUND(SUM(TRANSACTION_VALUE),2) AS `Total Transaction Value` FROM credit_month_tb GROUP BY MONTH ORDER BY `Total Transaction Value` DESC Limit 3"

trans_count = spark.sql(top_months_by_count_query).toPandas()
trans_value = spark.sql(top_months_by_value_query).toPandas()

In [207]:
def _frame_as_table(frame):
    """Build a left-aligned Plotly table trace showing every column of `frame`."""
    return go.Table(
        header=dict(values=frame.columns, font=dict(size=14), align="left"),
        cells=dict(
            values=[frame[column_name].tolist() for column_name in frame.columns],
            align="left",
        ),
    )


# 2x2 grid: bar charts on the top row, the matching tables underneath.
fig = make_subplots(
    rows=2,
    cols=2,
    specs=[
        [{"type": "bar"}, {"type": "bar"}],
        [{"type": "table"}, {"type": "table"}],
    ],
    subplot_titles=('Number of Transactions',  'Total Value of Transactions'),
)

count_bars = go.Bar(x=trans_count["Month"], y=trans_count["Number of Transactions"])
value_bars = go.Bar(x=trans_value["Month"], y=trans_value["Total Transaction Value"])
fig.add_trace(count_bars, row=1, col=1)
fig.add_trace(value_bars, row=1, col=2)
# Applied while the figure holds only the two bar traces.
fig.update_traces(width=0.5)

fig.add_trace(_frame_as_table(trans_count), row=2, col=1)
fig.add_trace(_frame_as_table(trans_value), row=2, col=2)

fig.update_layout(
    title_text="Top Three Months with the largest Transactions",
    showlegend=False,
    width=900,
    height=600,
    # Only the first (count) y-axis is zoomed; the value axis autoscales.
    yaxis_range=[3000, 4500],
    xaxis_title='Month',
    xaxis2_title='Month',
    yaxis_title='Transaction Count',
    yaxis2_title='Transaction Value',
)
fig.show()

4.    Find and plot which branch processed the highest total dollar value of healthcare transactions.

In [238]:
# Total healthcare transaction value per branch, keeping the five branches
# with the HIGHEST totals. The query previously sorted ascending before
# LIMIT 5, which selected the five lowest totals instead.
df_health = spark.sql(
    "SELECT branch_tb.BRANCH_CODE AS `Branch Code`, "
    "ROUND(SUM(credit_tb.TRANSACTION_VALUE), 2) AS `Total Healthcare Transactions` "
    "FROM credit_tb JOIN branch_tb ON credit_tb.BRANCH_CODE = branch_tb.BRANCH_CODE "
    "WHERE credit_tb.TRANSACTION_TYPE = 'Healthcare' "
    "GROUP BY branch_tb.BRANCH_CODE "
    "ORDER BY `Total Healthcare Transactions` DESC LIMIT 5"
).toPandas()

# Keep the ascending row order the previous query produced (largest total last).
df_health = df_health.sort_values("Total Healthcare Transactions").reset_index(drop=True)
# Branch codes as strings so they can serve as labels rather than quantities.
df_health["Branch"] = df_health["Branch Code"].astype("str")

In [256]:
# Horizontal bar chart: one bar per branch in df_health, bar length = total
# healthcare transaction value.
fig = go.Figure()
fig.add_trace(
    go.Bar(
        y=df_health["Branch"],
        x=df_health["Total Healthcare Transactions"],
        orientation="h",
    )
)
fig.update_layout(
    height=400,
    width=600,
    showlegend=False,
    title_text="Branches with the Highest Healthcare Transactions",
    # No fixed x-axis window: a hardcoded range clips the bars whenever the
    # totals fall outside it, so the axis is left to autoscale.
    xaxis_title="Transaction totals (USD) ",
    yaxis_title="Branch Codes",
    # Branch codes are labels; force a categorical axis so numeric-looking
    # strings are not placed on a linear scale.
    yaxis_type="category",
)
fig.show()

23/02/26 09:53:28 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 911214 ms exceeds timeout 120000 ms
23/02/26 09:53:28 WARN SparkContext: Killing executors is not supported by current scheduler.
