In [0]:
#Loading Data
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("Case Study").getOrCreate()

loan_file_path = "/FileStore/tables/loan.csv"

loan_df = spark.read.format("csv") \
    .option("header", "true") \
    .option("inferSchema", "true") \
    .load(loan_file_path)

loan_df.show()
loan_df.createOrReplaceTempView("loan_data")



+-----------+---+------+-------------------+--------------+-----------+------+-----------+-------------+------------------+-----------+-------+------------+----------------+------------------+
|Customer_ID|Age|Gender|         Occupation|Marital Status|Family Size|Income|Expenditure|Use Frequency|     Loan Category|Loan Amount|Overdue| Debt Record| Returned Cheque| Dishonour of Bill|
+-----------+---+------+-------------------+--------------+-----------+------+-----------+-------------+------------------+-----------+-------+------------+----------------+------------------+
|    IB14001| 30|  MALE|       BANK MANAGER|        SINGLE|          4| 50000|      22199|            6|           HOUSING| 10,00,000 |      5|      42,898|               6|                 9|
|    IB14008| 44|  MALE|          PROFESSOR|       MARRIED|          6| 51000|      19999|            4|          SHOPPING|     50,000|      3|      33,999|               1|                 5|
|    IB14012| 30|FEMALE|           

In [0]:
 #Spark SQL

In [0]:
#Filter
#Find people with Loan Amount greater than 10,00,000
from pyspark.sql import functions as F

loan_df = loan_df.withColumn("Loan Amount", F.regexp_replace("Loan Amount", ",", "").cast("double"))

loan_df.createOrReplaceTempView("loan_data")

filtered_data = spark.sql("""
    SELECT * FROM loan_data
    WHERE `Loan Amount` > 10000
""")
filtered_data.show()


+-----------+---+------+-------------------+--------------+-----------+------+-----------+-------------+------------------+-----------+-------+------------+----------------+------------------+
|Customer_ID|Age|Gender|         Occupation|Marital Status|Family Size|Income|Expenditure|Use Frequency|     Loan Category|Loan Amount|Overdue| Debt Record| Returned Cheque| Dishonour of Bill|
+-----------+---+------+-------------------+--------------+-----------+------+-----------+-------------+------------------+-----------+-------+------------+----------------+------------------+
|    IB14001| 30|  MALE|       BANK MANAGER|        SINGLE|          4| 50000|      22199|            6|           HOUSING|  1000000.0|      5|      42,898|               6|                 9|
|    IB14008| 44|  MALE|          PROFESSOR|       MARRIED|          6| 51000|      19999|            4|          SHOPPING|    50000.0|      3|      33,999|               1|                 5|
|    IB14012| 30|FEMALE|           

In [0]:
 #inner join

# Sample loan approval data (For demonstration of joins)
loan_approval_data = [
    ('IB14001', 'Approved'),
    ('IB14008', 'Denied'),
    ('IB14012', 'Approved'),
    ('IB14018', 'Approved'),
    ('IB14022', 'Denied'),
    ('IB14024', 'Approved'),
]

#DataFrame for loan approval data
loan_approval_columns = ['Customer_ID', 'Approval_Status']
loan_approval_df = spark.createDataFrame(loan_approval_data, loan_approval_columns)

loan_approval_df.createOrReplaceTempView("loan_approval_data")

# Perform an inner join between loan_data and loan_approval_data
joined_data = spark.sql("""
    SELECT a.*, b.Approval_Status
    FROM loan_data a
    JOIN loan_approval_data b
    ON a.Customer_ID = b.Customer_ID
""")
joined_data.show()


+-----------+---+------+------------+--------------+-----------+------+-----------+-------------+-------------+-----------+-------+------------+----------------+------------------+---------------+
|Customer_ID|Age|Gender|  Occupation|Marital Status|Family Size|Income|Expenditure|Use Frequency|Loan Category|Loan Amount|Overdue| Debt Record| Returned Cheque| Dishonour of Bill|Approval_Status|
+-----------+---+------+------------+--------------+-----------+------+-----------+-------------+-------------+-----------+-------+------------+----------------+------------------+---------------+
|    IB14001| 30|  MALE|BANK MANAGER|        SINGLE|          4| 50000|      22199|            6|      HOUSING|  1000000.0|      5|      42,898|               6|                 9|       Approved|
|    IB14008| 44|  MALE|   PROFESSOR|       MARRIED|          6| 51000|      19999|            4|     SHOPPING|    50000.0|      3|      33,999|               1|                 5|         Denied|
|    IB14012| 3

In [0]:
# Perform a left join between loan_data and loan_approval_data
left_joined_data = spark.sql("""
    SELECT a.*, b.Approval_Status
    FROM loan_data a
    LEFT JOIN loan_approval_data b
    ON a.Customer_ID = b.Customer_ID
""")
left_joined_data.show()


+-----------+---+------+-------------------+--------------+-----------+------+-----------+-------------+------------------+-----------+-------+------------+----------------+------------------+---------------+
|Customer_ID|Age|Gender|         Occupation|Marital Status|Family Size|Income|Expenditure|Use Frequency|     Loan Category|Loan Amount|Overdue| Debt Record| Returned Cheque| Dishonour of Bill|Approval_Status|
+-----------+---+------+-------------------+--------------+-----------+------+-----------+-------------+------------------+-----------+-------+------------+----------------+------------------+---------------+
|    IB14001| 30|  MALE|       BANK MANAGER|        SINGLE|          4| 50000|      22199|            6|           HOUSING|  1000000.0|      5|      42,898|               6|                 9|       Approved|
|    IB14008| 44|  MALE|          PROFESSOR|       MARRIED|          6| 51000|      19999|            4|          SHOPPING|    50000.0|      3|      33,999|        

In [0]:
# Perform a right join between loan_data and loan_approval_data
right_joined_data = spark.sql("""
    SELECT a.*, b.Approval_Status
    FROM loan_data a
    RIGHT JOIN loan_approval_data b
    ON a.Customer_ID = b.Customer_ID
""")
right_joined_data.show()


+-----------+---+------+------------+--------------+-----------+------+-----------+-------------+-------------+-----------+-------+------------+----------------+------------------+---------------+
|Customer_ID|Age|Gender|  Occupation|Marital Status|Family Size|Income|Expenditure|Use Frequency|Loan Category|Loan Amount|Overdue| Debt Record| Returned Cheque| Dishonour of Bill|Approval_Status|
+-----------+---+------+------------+--------------+-----------+------+-----------+-------------+-------------+-----------+-------+------------+----------------+------------------+---------------+
|    IB14001| 30|  MALE|BANK MANAGER|        SINGLE|          4| 50000|      22199|            6|      HOUSING|  1000000.0|      5|      42,898|               6|                 9|       Approved|
|    IB14008| 44|  MALE|   PROFESSOR|       MARRIED|          6| 51000|      19999|            4|     SHOPPING|    50000.0|      3|      33,999|               1|                 5|         Denied|
|    IB14012| 3

In [0]:
# Perform a full outer join between loan_data and loan_approval_data
outer_joined_data = spark.sql("""
    SELECT a.*, b.Approval_Status
    FROM loan_data a
    FULL OUTER JOIN loan_approval_data b
    ON a.Customer_ID = b.Customer_ID
""")
outer_joined_data.show()


+-----------+---+------+-------------------+--------------+-----------+------+-----------+-------------+------------------+-----------+-------+------------+----------------+------------------+---------------+
|Customer_ID|Age|Gender|         Occupation|Marital Status|Family Size|Income|Expenditure|Use Frequency|     Loan Category|Loan Amount|Overdue| Debt Record| Returned Cheque| Dishonour of Bill|Approval_Status|
+-----------+---+------+-------------------+--------------+-----------+------+-----------+-------------+------------------+-----------+-------+------------+----------------+------------------+---------------+
|    1B14093| 21|FEMALE|            MANAGER|        SINGLE|          3| 42516|      24567|            7|        AUTOMOBILE|  2569874.0|      8|      89,652|               2|                 3|           NULL|
|    1B14094| 49|  MALE|ASSISTANT PROFESSOR|       MARRIED|          5| 65214|      42589|            5|           HOUSING|   985412.0|      5|      11,254|        

In [0]:
# Aggregation: Average Loan Amount per Occupation
avg_loan_per_occupation = spark.sql("""
    SELECT Occupation, AVG(`Loan Amount`) AS avg_loan
    FROM loan_data
    GROUP BY Occupation
""")
avg_loan_per_occupation.show()


+--------------------+------------------+
|          Occupation|          avg_loan|
+--------------------+------------------+
|      CIVIL ENGINEER| 819806.3333333334|
|     FIRE DEPARTMENT| 955125.1666666666|
|          ACCOUNTANT|1223623.2857142857|
|        BANK MANAGER| 629305.6071428572|
|      SYSTEM OFFICER|          290192.0|
|           NUTRITION|          456780.0|
|           DIETICIAN| 625974.4615384615|
|               CLERK| 633292.7307692308|
|   SOFTWARE ENGINEER|          755663.0|
|AGRICULTURAL ENGI...|          767338.0|
|   ASSISTANT MANAGER|          729638.5|
|             TEACHER| 681778.6349206349|
| ASSISTANT PROFESSOR| 577495.8888888889|
|     SYSTEM ENGINEER|          989510.0|
| CHARTERED APPRAISER|1023088.6363636364|
|                NAVY|          880523.0|
|              POLICE| 690967.7222222222|
|            BUSINESS|        952763.875|
|              FARMER|494617.14285714284|
|              DRIVER| 922293.3888888889|
+--------------------+------------

In [0]:
# GroupBy: Total Income by Marital Status
total_income_per_status = spark.sql("""
    SELECT `Marital Status`, SUM(Income) AS total_income
    FROM loan_data
    GROUP BY `Marital Status`
""")
total_income_per_status.show()


+--------------+------------+
|Marital Status|total_income|
+--------------+------------+
|        SINGLE|     8756569|
|       MARRIED|    23226313|
+--------------+------------+



In [0]:
# Filter and Aggregate: Total Loan Amount for SINGLE marital status
single_marital_status = spark.sql("""
    SELECT SUM(`Loan Amount`) AS total_loan
    FROM loan_data
    WHERE `Marital Status` = 'SINGLE'
""")
single_marital_status.show()


+------------+
|  total_loan|
+------------+
|1.12118685E8|
+------------+



In [0]:
#PySpark

In [0]:

# Filter: Loan Amount greater than 1,000,000
filtered_df = loan_df.filter(loan_df['Loan Amount'] > 1000000)
filtered_df.show()


+-----------+---+------+-----------------+--------------+-----------+------+-----------+-------------+------------------+-----------+-------+------------+----------------+------------------+
|Customer_ID|Age|Gender|       Occupation|Marital Status|Family Size|Income|Expenditure|Use Frequency|     Loan Category|Loan Amount|Overdue| Debt Record| Returned Cheque| Dishonour of Bill|
+-----------+---+------+-----------------+--------------+-----------+------+-----------+-------------+------------------+-----------+-------+------------+----------------+------------------+
|    IB14025| 39|FEMALE|          TEACHER|       MARRIED|          6| 46619|      18675|            4|           HOUSING|  1209867.0|      8|      29,999|               6|                 8|
|    IB14042| 25|FEMALE|           DOCTOR|        SINGLE|          4| 60111|      27111|            5|        TRAVELLING|  1290929.0|      4|      18,000|               1|                 0|
|    IB14050| 56|  MALE|   CIVIL ENGINEER|   

In [0]:
#inner join
inner_joined_data = loan_df.join(loan_approval_df, loan_df.Customer_ID == loan_approval_df.Customer_ID, "inner")

# Show the result
inner_joined_data.show()



+-----------+---+------+------------+--------------+-----------+------+-----------+-------------+-------------+-----------+-------+------------+----------------+------------------+-----------+---------------+
|Customer_ID|Age|Gender|  Occupation|Marital Status|Family Size|Income|Expenditure|Use Frequency|Loan Category|Loan Amount|Overdue| Debt Record| Returned Cheque| Dishonour of Bill|Customer_ID|Approval_Status|
+-----------+---+------+------------+--------------+-----------+------+-----------+-------------+-------------+-----------+-------+------------+----------------+------------------+-----------+---------------+
|    IB14001| 30|  MALE|BANK MANAGER|        SINGLE|          4| 50000|      22199|            6|      HOUSING|  1000000.0|      5|      42,898|               6|                 9|    IB14001|       Approved|
|    IB14008| 44|  MALE|   PROFESSOR|       MARRIED|          6| 51000|      19999|            4|     SHOPPING|    50000.0|      3|      33,999|               1|   

In [0]:
#left join
left_joined_data = loan_df.join(loan_approval_df, loan_df.Customer_ID == loan_approval_df.Customer_ID, "left")

# Show the result
left_joined_data.show()



+-----------+---+------+-------------------+--------------+-----------+------+-----------+-------------+------------------+-----------+-------+------------+----------------+------------------+-----------+---------------+
|Customer_ID|Age|Gender|         Occupation|Marital Status|Family Size|Income|Expenditure|Use Frequency|     Loan Category|Loan Amount|Overdue| Debt Record| Returned Cheque| Dishonour of Bill|Customer_ID|Approval_Status|
+-----------+---+------+-------------------+--------------+-----------+------+-----------+-------------+------------------+-----------+-------+------------+----------------+------------------+-----------+---------------+
|    IB14001| 30|  MALE|       BANK MANAGER|        SINGLE|          4| 50000|      22199|            6|           HOUSING|  1000000.0|      5|      42,898|               6|                 9|    IB14001|       Approved|
|    IB14008| 44|  MALE|          PROFESSOR|       MARRIED|          6| 51000|      19999|            4|          SH

In [0]:
#right join
right_joined_data = loan_df.join(loan_approval_df, loan_df.Customer_ID == loan_approval_df.Customer_ID, "right")

# Show the result
right_joined_data.show()


+-----------+---+------+------------+--------------+-----------+------+-----------+-------------+-------------+-----------+-------+------------+----------------+------------------+-----------+---------------+
|Customer_ID|Age|Gender|  Occupation|Marital Status|Family Size|Income|Expenditure|Use Frequency|Loan Category|Loan Amount|Overdue| Debt Record| Returned Cheque| Dishonour of Bill|Customer_ID|Approval_Status|
+-----------+---+------+------------+--------------+-----------+------+-----------+-------------+-------------+-----------+-------+------------+----------------+------------------+-----------+---------------+
|    IB14001| 30|  MALE|BANK MANAGER|        SINGLE|          4| 50000|      22199|            6|      HOUSING|  1000000.0|      5|      42,898|               6|                 9|    IB14001|       Approved|
|    IB14008| 44|  MALE|   PROFESSOR|       MARRIED|          6| 51000|      19999|            4|     SHOPPING|    50000.0|      3|      33,999|               1|   

In [0]:
#outer join
outer_joined_data = loan_df.join(loan_approval_df, loan_df.Customer_ID == loan_approval_df.Customer_ID, "outer")

# Show the result
outer_joined_data.show()


+-----------+---+------+-------------------+--------------+-----------+------+-----------+-------------+------------------+-----------+-------+------------+----------------+------------------+-----------+---------------+
|Customer_ID|Age|Gender|         Occupation|Marital Status|Family Size|Income|Expenditure|Use Frequency|     Loan Category|Loan Amount|Overdue| Debt Record| Returned Cheque| Dishonour of Bill|Customer_ID|Approval_Status|
+-----------+---+------+-------------------+--------------+-----------+------+-----------+-------------+------------------+-----------+-------+------------+----------------+------------------+-----------+---------------+
|    1B14093| 21|FEMALE|            MANAGER|        SINGLE|          3| 42516|      24567|            7|        AUTOMOBILE|  2569874.0|      8|      89,652|               2|                 3|       NULL|           NULL|
|    1B14094| 49|  MALE|ASSISTANT PROFESSOR|       MARRIED|          5| 65214|      42589|            5|           H

In [0]:
# Simple aggregations 
# Aggregating Total Loan Amount, Average Income, and Counting Customers
from pyspark.sql import functions as F

aggregated_data = loan_df.agg(
    F.sum("Loan Amount").alias("Total_Loan_Amount"),
    F.avg("Income").alias("Average_Income"),
    F.count("Customer_ID").alias("Customer_Count"),
    F.min("Income").alias("Min_Income"),
    F.max("Income").alias("Max_Income")
)

aggregated_data.show()


+-----------------+-----------------+--------------+----------+----------+
|Total_Loan_Amount|   Average_Income|Customer_Count|Min_Income|Max_Income|
+-----------------+-----------------+--------------+----------+----------+
|     3.98526449E8|68339.49145299145|           500|     28366|    930000|
+-----------------+-----------------+--------------+----------+----------+



In [0]:
# Grouping and aggregations
# Calculating Total Loan Amount and Average Income
grouped_data = loan_df.groupBy("Occupation").agg(
    F.sum("Loan Amount").alias("Total_Loan_Amount"),
    F.avg("Income").alias("Average_Income"),
    F.count("Customer_ID").alias("Customer_Count")
)

grouped_data.show()


+--------------------+-----------------+------------------+--------------+
|          Occupation|Total_Loan_Amount|    Average_Income|Customer_Count|
+--------------------+-----------------+------------------+--------------+
|      CIVIL ENGINEER|        4918838.0|60359.666666666664|             6|
|     FIRE DEPARTMENT|      1.1461502E7|55357.916666666664|            12|
|          ACCOUNTANT|        8565363.0| 56623.28571428572|             7|
|        BANK MANAGER|      1.7620557E7|           92191.0|            28|
|      SYSTEM OFFICER|        1160768.0|           56780.0|             4|
|           NUTRITION|         456780.0|           55650.0|             1|
|           DIETICIAN|        8137668.0| 72599.16666666667|            13|
|               CLERK|      1.6465611E7|         76871.125|            26|
|   SOFTWARE ENGINEER|      2.6448205E7|           61107.8|            35|
|AGRICULTURAL ENGI...|        6138704.0|         82060.625|             8|
|   ASSISTANT MANAGER|   

In [0]:
# Filtering and  aggregation
# Filtering Customers with Income > 50,000 and Calculating Total Loan Amount and Average Expenditure
filtered_aggregated_data = loan_df.filter(loan_df.Income > 50000).agg(
    F.sum("Loan Amount").alias("Total_Loan_Amount"),
    F.avg("Expenditure").alias("Average_Expenditure")
)

filtered_aggregated_data.show()


+-----------------+-------------------+
|Total_Loan_Amount|Average_Expenditure|
+-----------------+-------------------+
|     2.61067242E8| 30574.736263736264|
+-----------------+-------------------+

