In [0]:
# Spark initialising

from pyspark import SparkContext

from pyspark.sql import SparkSession

sc=SparkContext.getOrCreate()

spark=SparkSession.builder.appName('coding challenge').getOrCreate()

In [0]:
# Loading data
loans_df = spark.read.csv("/FileStore/tables/loan.csv", header=True, inferSchema=True)

# Display the schema
loans_df.printSchema()

# Shows the first few rows of the data
loans_df.show(5)


root
 |-- Customer_ID: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Occupation: string (nullable = true)
 |-- Marital Status: string (nullable = true)
 |-- Family Size: integer (nullable = true)
 |-- Income: integer (nullable = true)
 |-- Expenditure: integer (nullable = true)
 |-- Use Frequency: integer (nullable = true)
 |-- Loan Category: string (nullable = true)
 |-- Loan Amount: string (nullable = true)
 |-- Overdue: integer (nullable = true)
 |--  Debt Record: string (nullable = true)
 |--  Returned Cheque: integer (nullable = true)
 |--  Dishonour of Bill: integer (nullable = true)

+-----------+---+------+------------+--------------+-----------+------+-----------+-------------+-------------+-----------+-------+------------+----------------+------------------+
|Customer_ID|Age|Gender|  Occupation|Marital Status|Family Size|Income|Expenditure|Use Frequency|Loan Category|Loan Amount|Overdue| Debt Record| Returned Cheque| D

Filter

In [0]:
from pyspark.sql.functions import col, avg, sum, count

# Filter customers who have overdue loans
overdue_loans = loans_df.filter(col("Overdue") == "1")

# Display the overdue loans
print("Customers with overdue loans:")
overdue_loans.show()



Customers with overdue loans:
+-----------+---+------+-------------------+--------------+-----------+------+-----------+-------------+-------------+-----------+-------+------------+----------------+------------------+
|Customer_ID|Age|Gender|         Occupation|Marital Status|Family Size|Income|Expenditure|Use Frequency|Loan Category|Loan Amount|Overdue| Debt Record| Returned Cheque| Dishonour of Bill|
+-----------+---+------+-------------------+--------------+-----------+------+-----------+-------------+-------------+-----------+-------+------------+----------------+------------------+
|    IB14024| 55|FEMALE|              NURSE|       MARRIED|          6| 34999|      19888|            4|   AUTOMOBILE|     47,787|      1|      50,000|               0|                 3|
|    IB14037| 54|FEMALE|            TEACHER|       MARRIED|          5| 48099|      19999|            4|  RESTAURANTS|     30,999|      1|      12,000|               7|                 5|
|    IB14045| 31|  MALE|      

In [0]:
loans_df = spark.read.csv("/FileStore/tables/loan.csv", header=True, inferSchema=True)

# Register the DataFrame as a temporary SQL view
loans_df.createOrReplaceTempView("loans")


In [0]:
%sql
-- Filter customers who have overdue loans
SELECT *
FROM loans
WHERE Overdue = '1'


Customer_ID,Age,Gender,Occupation,Marital Status,Family Size,Income,Expenditure,Use Frequency,Loan Category,Loan Amount,Overdue,Debt Record,Returned Cheque,Dishonour of Bill
IB14024,55,FEMALE,NURSE,MARRIED,6,34999.0,19888.0,4,AUTOMOBILE,47787,1,50000,0,3
IB14037,54,FEMALE,TEACHER,MARRIED,5,48099.0,19999.0,4,RESTAURANTS,30999,1,12000,7,5
IB14045,31,MALE,STORE KEEPER,SINGLE,5,40999.0,11999.0,3,BOOK STORES,167654,1,4500,0,1
IB14078,45,FEMALE,FIRE DEPARTMENT,MARRIED,4,40000.0,18888.0,4,AUTOMOBILE,70000,1,0,2,1
IB14104,54,MALE,AIRPORT OFFICER,MARRIED,6,80000.0,32541.0,2,AUTOMOBILE,2045789,1,16599,2,3
IB14158,54,MALE,AIRPORT OFFICER,MARRIED,6,80000.0,62541.0,2,AUTOMOBILE,2045789,1,16599,2,3
IB14176,54,MALE,AIRPORT OFFICER,MARRIED,6,80000.0,62541.0,2,HOUSING,2045789,1,16599,2,3
IB14194,55,FEMALE,NURSE,MARRIED,6,34999.0,19888.0,4,TRAVELLING,47787,1,50000,0,3
IB14197,54,FEMALE,TEACHER,MARRIED,5,48099.0,19999.0,4,RESTAURANTS,300999,1,12000,7,5
IB14204,54,MALE,AIRPORT OFFICER,MARRIED,6,81000.0,62541.0,2,DINNING,2045789,1,16599,2,3


Filter with Multiple Conditions

In [0]:
# Filter for single customers with income > 5000
single_high_income_customers = loans_df.filter(
    (col("Marital Status") == "SINGLE") & (col("Income") > 5000)
)

# Display the result
print("Single customers with high income:")
single_high_income_customers.show()


Single customers with high income:
+-----------+---+------+-----------------+--------------+-----------+------+-----------+-------------+-------------+-----------+-------+------------+----------------+------------------+
|Customer_ID|Age|Gender|       Occupation|Marital Status|Family Size|Income|Expenditure|Use Frequency|Loan Category|Loan Amount|Overdue| Debt Record| Returned Cheque| Dishonour of Bill|
+-----------+---+------+-----------------+--------------+-----------+------+-----------+-------------+-------------+-----------+-------+------------+----------------+------------------+
|    IB14001| 30|  MALE|     BANK MANAGER|        SINGLE|          4| 50000|      22199|            6|      HOUSING| 10,00,000 |      5|      42,898|               6|                 9|
|    IB14012| 30|FEMALE|          DENTIST|        SINGLE|          3| 58450|      27675|            5|   TRAVELLING|     75,000|      6|      20,876|               3|                 1|
|    IB14022| 34|  MALE|           

In [0]:
%sql
SELECT *
FROM loans
WHERE `Marital Status` = 'SINGLE' AND Income > 5000


Customer_ID,Age,Gender,Occupation,Marital Status,Family Size,Income,Expenditure,Use Frequency,Loan Category,Loan Amount,Overdue,Debt Record,Returned Cheque,Dishonour of Bill
IB14001,30,MALE,BANK MANAGER,SINGLE,4,50000,22199.0,6,HOUSING,1000000,5,42898,6,9
IB14012,30,FEMALE,DENTIST,SINGLE,3,58450,27675.0,5,TRAVELLING,75000,6,20876,3,1
IB14022,34,MALE,POLICE,SINGLE,4,43521,11999.0,3,AUTOMOBILE,200000,2,43898,1,2
IB14029,24,FEMALE,TEACHER,SINGLE,3,45008,17454.0,4,AUTOMOBILE,399435,9,51987,4,7
IB14032,24,MALE,DATA ANALYST,SINGLE,4,60111,28999.0,6,AUTOMOBILE,35232,5,33333,1,2
IB14042,25,FEMALE,DOCTOR,SINGLE,4,60111,27111.0,5,TRAVELLING,1290929,4,18000,1,0
IB14045,31,MALE,STORE KEEPER,SINGLE,5,40999,11999.0,3,BOOK STORES,167654,1,4500,0,1
IB14057,25,MALE,AIRPORT OFFICER,SINGLE,4,40000,18888.0,3,RESTAURANTS,400000,8,11111,1,7
IB14079,25,MALE,CLERK,SINGLE,3,35000,9000.0,2,GOLD LOAN,100000,3,12584,1,2
IB14089,25,MALE,PROFESSOR,SINGLE,5,62145,31254.0,4,BOOK STORES,1245789,6,48596,6,5


GroupBy and Count

In [0]:
# Count customers by Occupation
customer_count_by_occupation = loans_df.groupBy("Occupation").count()

# Display the result
print("Number of customers by occupation:")
customer_count_by_occupation.show()


Number of customers by occupation:
+--------------------+-----+
|          Occupation|count|
+--------------------+-----+
|      CIVIL ENGINEER|    6|
|     FIRE DEPARTMENT|   12|
|          ACCOUNTANT|    7|
|        BANK MANAGER|   28|
|      SYSTEM OFFICER|    4|
|           NUTRITION|    1|
|           DIETICIAN|   13|
|               CLERK|   26|
|   SOFTWARE ENGINEER|   35|
|AGRICULTURAL ENGI...|    8|
|   ASSISTANT MANAGER|    6|
|             TEACHER|   63|
| ASSISTANT PROFESSOR|    9|
|     SYSTEM ENGINEER|    3|
| CHARTERED APPRAISER|   11|
|                NAVY|   16|
|              POLICE|   18|
|            BUSINESS|   16|
|              FARMER|    7|
|              DRIVER|   18|
+--------------------+-----+
only showing top 20 rows



In [0]:
%sql
SELECT Occupation, COUNT(*) AS Customer_Count
FROM loans
GROUP BY Occupation


Occupation,Customer_Count
CIVIL ENGINEER,6
FIRE DEPARTMENT,12
ACCOUNTANT,7
BANK MANAGER,28
SYSTEM OFFICER,4
NUTRITION,1
DIETICIAN,13
CLERK,26
SOFTWARE ENGINEER,35
AGRICULTURAL ENGINEER,8


Aggregation

In [0]:
# Total loan amount and average income grouped by marital status
loan_income_stats = loans_df.groupBy("Marital Status").agg(
    sum("Loan Amount").alias("Total_Loan_Amount"),
    avg("Income").alias("Average Income")
)

# Display the result
print("Loan and income statistics by marital status:")
loan_income_stats.show()


Loan and income statistics by marital status:
+--------------+-----------------+-----------------+
|Marital Status|Total_Loan_Amount|   Average Income|
+--------------+-----------------+-----------------+
|        SINGLE|             null|61234.74825174825|
|       MARRIED|             null|71465.57846153846|
+--------------+-----------------+-----------------+



In [0]:
%sql
SELECT 'Marital Status', 
       SUM('Loan Amount') AS Total_Loan_Amount, 
       AVG(Income) AS Average_Income
FROM loans
GROUP BY 'Marital Status'


Marital Status,Total_Loan_Amount,Average_Income
Marital Status,,68339.49145299145


join

In [0]:
# Load another dataset
credit_df = spark.read.csv("/FileStore/tables/credit_card.csv", header=True, inferSchema=True)

# Inner join on CustomerId and Customer_ID
inner_join_df = credit_df.join(loans_df, credit_df.CustomerId == loans_df.Customer_ID, how="inner")

# Display 10 rows from the inner join
print("Inner Join Result:")
inner_join_df.show(10)



Inner Join Result:
+---------+----------+-------+-----------+---------+------+---+------+-------+-------------+--------------+---------------+------+-----------+---+------+----------+--------------+-----------+------+-----------+-------------+-------------+-----------+-------+------------+----------------+------------------+
|RowNumber|CustomerId|Surname|CreditScore|Geography|Gender|Age|Tenure|Balance|NumOfProducts|IsActiveMember|EstimatedSalary|Exited|Customer_ID|Age|Gender|Occupation|Marital Status|Family Size|Income|Expenditure|Use Frequency|Loan Category|Loan Amount|Overdue| Debt Record| Returned Cheque| Dishonour of Bill|
+---------+----------+-------+-----------+---------+------+---+------+-------+-------------+--------------+---------------+------+-----------+---+------+----------+--------------+-----------+------+-----------+-------------+-------------+-----------+-------+------------+----------------+------------------+
+---------+----------+-------+-----------+---------+-----

In [0]:
# Register credit_df as a temporary view
credit_df.createOrReplaceTempView("credit")


In [0]:
%sql
SELECT *
FROM credit c
INNER JOIN loans l
ON c.CustomerId = l.Customer_ID
LIMIT 10


RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,IsActiveMember,EstimatedSalary,Exited,Customer_ID,Age.1,Gender.1,Occupation,Marital Status,Family Size,Income,Expenditure,Use Frequency,Loan Category,Loan Amount,Overdue,Debt Record,Returned Cheque,Dishonour of Bill


In [0]:
# Outer join on CustomerId and Customer_ID
outer_join_df = credit_df.join(loans_df, credit_df.CustomerId == loans_df.Customer_ID, how="outer")

# Display 10 rows from the outer join
print("Outer Join Result:")
outer_join_df.show(10)


Outer Join Result:
+---------+----------+-------+-----------+---------+------+----+------+-------+-------------+--------------+---------------+------+-----------+---+------+-----------------+--------------+-----------+------+-----------+-------------+-------------+-----------+-------+------------+----------------+------------------+
|RowNumber|CustomerId|Surname|CreditScore|Geography|Gender| Age|Tenure|Balance|NumOfProducts|IsActiveMember|EstimatedSalary|Exited|Customer_ID|Age|Gender|       Occupation|Marital Status|Family Size|Income|Expenditure|Use Frequency|Loan Category|Loan Amount|Overdue| Debt Record| Returned Cheque| Dishonour of Bill|
+---------+----------+-------+-----------+---------+------+----+------+-------+-------------+--------------+---------------+------+-----------+---+------+-----------------+--------------+-----------+------+-----------+-------------+-------------+-----------+-------+------------+----------------+------------------+
|     null|      null|   null|   

In [0]:
%sql
SELECT *
FROM credit c
FULL OUTER JOIN loans l
ON c.CustomerId = l.Customer_ID
LIMIT 10


RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,IsActiveMember,EstimatedSalary,Exited,Customer_ID,Age.1,Gender.1,Occupation,Marital Status,Family Size,Income,Expenditure,Use Frequency,Loan Category,Loan Amount,Overdue,Debt Record,Returned Cheque,Dishonour of Bill
,,,,,,,,,,,,,IB14001,30,MALE,BANK MANAGER,SINGLE,4,50000,22199,6,HOUSING,1000000,5,42898,6,9
,,,,,,,,,,,,,IB14008,44,MALE,PROFESSOR,MARRIED,6,51000,19999,4,SHOPPING,50000,3,33999,1,5
,,,,,,,,,,,,,IB14012,30,FEMALE,DENTIST,SINGLE,3,58450,27675,5,TRAVELLING,75000,6,20876,3,1
,,,,,,,,,,,,,IB14018,29,MALE,TEACHER,MARRIED,5,45767,12787,3,GOLD LOAN,600000,7,11000,0,4
,,,,,,,,,,,,,IB14022,34,MALE,POLICE,SINGLE,4,43521,11999,3,AUTOMOBILE,200000,2,43898,1,2
,,,,,,,,,,,,,IB14024,55,FEMALE,NURSE,MARRIED,6,34999,19888,4,AUTOMOBILE,47787,1,50000,0,3
,,,,,,,,,,,,,IB14025,39,FEMALE,TEACHER,MARRIED,6,46619,18675,4,HOUSING,1209867,8,29999,6,8
,,,,,,,,,,,,,IB14027,51,MALE,SYSTEM MANAGER,MARRIED,3,49999,19111,5,RESTAURANTS,60676,8,13000,2,5
,,,,,,,,,,,,,IB14029,24,FEMALE,TEACHER,SINGLE,3,45008,17454,4,AUTOMOBILE,399435,9,51987,4,7
,,,,,,,,,,,,,IB14031,37,FEMALE,SOFTWARE ENGINEER,MARRIED,5,55999,23999,5,AUTOMOBILE,60999,2,0,5,3


In [0]:
# Left join on CustomerId and Customer_ID
left_join_df = credit_df.join(loans_df, credit_df.CustomerId == loans_df.Customer_ID, how="left")

# Display 10 rows from the left join
print("Left Join Result:")
left_join_df.show(10)


Left Join Result:
+---------+----------+--------+-----------+---------+------+---+------+---------+-------------+--------------+---------------+------+-----------+----+------+----------+--------------+-----------+------+-----------+-------------+-------------+-----------+-------+------------+----------------+------------------+
|RowNumber|CustomerId| Surname|CreditScore|Geography|Gender|Age|Tenure|  Balance|NumOfProducts|IsActiveMember|EstimatedSalary|Exited|Customer_ID| Age|Gender|Occupation|Marital Status|Family Size|Income|Expenditure|Use Frequency|Loan Category|Loan Amount|Overdue| Debt Record| Returned Cheque| Dishonour of Bill|
+---------+----------+--------+-----------+---------+------+---+------+---------+-------------+--------------+---------------+------+-----------+----+------+----------+--------------+-----------+------+-----------+-------------+-------------+-----------+-------+------------+----------------+------------------+
|        1|  15634602|Hargrave|        619|   

In [0]:
%sql
SELECT *
FROM credit c
LEFT JOIN loans l
ON c.CustomerId = l.Customer_ID
LIMIT 10


RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,IsActiveMember,EstimatedSalary,Exited,Customer_ID,Age.1,Gender.1,Occupation,Marital Status,Family Size,Income,Expenditure,Use Frequency,Loan Category,Loan Amount,Overdue,Debt Record,Returned Cheque,Dishonour of Bill
1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,101348.88,1,,,,,,,,,,,,,,,
2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,1,112542.58,0,,,,,,,,,,,,,,,
3,15619304,Onio,502,France,Female,42,8,159660.8,3,0,113931.57,1,,,,,,,,,,,,,,,
4,15701354,Boni,699,France,Female,39,1,0.0,2,0,93826.63,0,,,,,,,,,,,,,,,
5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,79084.1,0,,,,,,,,,,,,,,,
6,15574012,Chu,645,Spain,Male,44,8,113755.78,2,0,149756.71,1,,,,,,,,,,,,,,,
7,15592531,Bartlett,822,France,Male,50,7,0.0,2,1,10062.8,0,,,,,,,,,,,,,,,
8,15656148,Obinna,376,Germany,Female,29,4,115046.74,4,0,119346.88,1,,,,,,,,,,,,,,,
9,15792365,He,501,France,Male,44,4,142051.07,2,1,74940.5,0,,,,,,,,,,,,,,,
10,15592389,H?,684,France,Male,27,2,134603.88,1,1,71725.73,0,,,,,,,,,,,,,,,


In [0]:
# Right join on CustomerId and Customer_ID
right_join_df = credit_df.join(loans_df, credit_df.CustomerId == loans_df.Customer_ID, how="right")

# Display 10 rows from the right join
print("Right Join Result:")
right_join_df.show(10)


Right Join Result:
+---------+----------+-------+-----------+---------+------+----+------+-------+-------------+--------------+---------------+------+-----------+---+------+-----------------+--------------+-----------+------+-----------+-------------+-------------+-----------+-------+------------+----------------+------------------+
|RowNumber|CustomerId|Surname|CreditScore|Geography|Gender| Age|Tenure|Balance|NumOfProducts|IsActiveMember|EstimatedSalary|Exited|Customer_ID|Age|Gender|       Occupation|Marital Status|Family Size|Income|Expenditure|Use Frequency|Loan Category|Loan Amount|Overdue| Debt Record| Returned Cheque| Dishonour of Bill|
+---------+----------+-------+-----------+---------+------+----+------+-------+-------------+--------------+---------------+------+-----------+---+------+-----------------+--------------+-----------+------+-----------+-------------+-------------+-----------+-------+------------+----------------+------------------+
|     null|      null|   null|   

In [0]:
%sql
SELECT *
FROM credit c
RIGHT JOIN loans l
ON c.CustomerId = l.Customer_ID
LIMIT 10


RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,IsActiveMember,EstimatedSalary,Exited,Customer_ID,Age.1,Gender.1,Occupation,Marital Status,Family Size,Income,Expenditure,Use Frequency,Loan Category,Loan Amount,Overdue,Debt Record,Returned Cheque,Dishonour of Bill
,,,,,,,,,,,,,IB14001,30,MALE,BANK MANAGER,SINGLE,4,50000,22199,6,HOUSING,1000000,5,42898,6,9
,,,,,,,,,,,,,IB14008,44,MALE,PROFESSOR,MARRIED,6,51000,19999,4,SHOPPING,50000,3,33999,1,5
,,,,,,,,,,,,,IB14012,30,FEMALE,DENTIST,SINGLE,3,58450,27675,5,TRAVELLING,75000,6,20876,3,1
,,,,,,,,,,,,,IB14018,29,MALE,TEACHER,MARRIED,5,45767,12787,3,GOLD LOAN,600000,7,11000,0,4
,,,,,,,,,,,,,IB14022,34,MALE,POLICE,SINGLE,4,43521,11999,3,AUTOMOBILE,200000,2,43898,1,2
,,,,,,,,,,,,,IB14024,55,FEMALE,NURSE,MARRIED,6,34999,19888,4,AUTOMOBILE,47787,1,50000,0,3
,,,,,,,,,,,,,IB14025,39,FEMALE,TEACHER,MARRIED,6,46619,18675,4,HOUSING,1209867,8,29999,6,8
,,,,,,,,,,,,,IB14027,51,MALE,SYSTEM MANAGER,MARRIED,3,49999,19111,5,RESTAURANTS,60676,8,13000,2,5
,,,,,,,,,,,,,IB14029,24,FEMALE,TEACHER,SINGLE,3,45008,17454,4,AUTOMOBILE,399435,9,51987,4,7
,,,,,,,,,,,,,IB14031,37,FEMALE,SOFTWARE ENGINEER,MARRIED,5,55999,23999,5,AUTOMOBILE,60999,2,0,5,3
