In [None]:
# ETL stands for

# Extract: extract the data from the different sources

# Transform: Transform the unstructured data into structured data. Transformations like cleaning, manipulation, etc.

# Load: Load the transformed data into a target location or data warehouse.


In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col,concat,lit,floor,rand
# Start (or reuse) the Spark session used by every cell in this notebook.
spark = SparkSession.builder.appName("ETLPractice").getOrCreate()

# Input CSV to extract from and output location for the transformed result.
source_path = "/content/LoanData.csv"
target_path = "/content/LoanDataResult.csv"

# Extract: read the source CSV, treating the first row as column names and
# letting Spark infer column types. The path comes from source_path so the
# location is configured in exactly one place.
load_data = spark.read.csv(source_path, header=True, inferSchema=True)


In [None]:
# Quick look at the extracted data: the column names, then the first 5 rows.
# print() is required here: a bare `load_data.columns` that is not the last
# expression of the cell is evaluated and discarded without being displayed.
print(load_data.columns)
load_data.show(5)


+--------+------+-------+----------+------------+-------------+---------------+-----------------+----------+----------------+--------------+-------------+-----------+
| Loan_ID|Gender|Married|Dependents|   Education|Self_Employed|ApplicantIncome|CoapplicantIncome|LoanAmount|Loan_Amount_Term|Credit_History|Property_Area|Loan_Status|
+--------+------+-------+----------+------------+-------------+---------------+-----------------+----------+----------------+--------------+-------------+-----------+
|LP001002|  Male|     No|         0|    Graduate|           No|           5849|              0.0|      NULL|             360|             1|        Urban|          Y|
|LP001003|  Male|    Yes|         1|    Graduate|           No|           4583|           1508.0|       128|             360|             1|        Rural|          N|
|LP001005|  Male|    Yes|         0|    Graduate|          Yes|           3000|              0.0|        66|             360|             1|        Urban|          Y

In [None]:
 # Transformation 1: build 'Loan_detail' as "<Loan_ID> <ApplicantIncome>" (space-separated)
load_data = load_data.withColumn(
    'Loan_detail',
    concat(col('Loan_ID'), lit(' '), col('ApplicantIncome')),
)
load_data.show(5)

+--------+------+-------+----------+------------+-------------+---------------+-----------------+----------+----------------+--------------+-------------+-----------+-------------+-------------+
| Loan_ID|Gender|Married|Dependents|   Education|Self_Employed|ApplicantIncome|CoapplicantIncome|LoanAmount|Loan_Amount_Term|Credit_History|Property_Area|Loan_Status|  full_detail|  Loan_detail|
+--------+------+-------+----------+------------+-------------+---------------+-----------------+----------+----------------+--------------+-------------+-----------+-------------+-------------+
|LP001002|  Male|     No|         0|    Graduate|           No|           5849|              0.0|      NULL|             360|             1|        Urban|          Y|LP001002 5849|LP001002 5849|
|LP001003|  Male|    Yes|         1|    Graduate|           No|           4583|           1508.0|       128|             360|             1|        Rural|          N|LP001003 4583|LP001003 4583|
|LP001005|  Male|    Yes|

In [None]:
# Transformation 2: add a synthetic 'net_salary' column.
# The value is a random whole number in the range 10000..10049
# (floor(10000 + u * 50) with u drawn from rand()); it is NOT derived from
# ApplicantIncome and no tax deduction is applied.
# rand() is given an explicit seed so that a fresh kernel + Run All produces
# the same values on the same input (NOTE: Spark's seeded rand is only
# repeatable when the data partitioning is also unchanged).
load_data = load_data.withColumn('net_salary', floor(lit(10000) + rand(seed=42) * lit(50)))
load_data.show(10)

+--------+------+-------+----------+------------+-------------+---------------+-----------------+----------+----------------+--------------+-------------+-----------+--------------+--------------+----------+
| Loan_ID|Gender|Married|Dependents|   Education|Self_Employed|ApplicantIncome|CoapplicantIncome|LoanAmount|Loan_Amount_Term|Credit_History|Property_Area|Loan_Status|   full_detail|   Loan_detail|net_salary|
+--------+------+-------+----------+------------+-------------+---------------+-----------------+----------+----------------+--------------+-------------+-----------+--------------+--------------+----------+
|LP001002|  Male|     No|         0|    Graduate|           No|           5849|              0.0|      NULL|             360|             1|        Urban|          Y| LP001002 5849| LP001002 5849|     10018|
|LP001003|  Male|    Yes|         1|    Graduate|           No|           4583|           1508.0|       128|             360|             1|        Rural|          N| L

In [None]:
# Add a synthetic 'age' column: a random whole number in the range 20..50
# (floor(20 + u * 31) with u drawn from rand()).
# The seed makes the later age filter, group-by and sort reproducible across
# kernel restarts; it deliberately differs from the seed used for any other
# random column so the generated columns are not driven by the same stream.
load_data = load_data.withColumn('age', floor(lit(20) + rand(seed=7) * lit(31)))
load_data.show(10)

+--------+------+-------+----------+------------+-------------+---------------+-----------------+----------+----------------+--------------+-------------+-----------+--------------+--------------+----------+---+
| Loan_ID|Gender|Married|Dependents|   Education|Self_Employed|ApplicantIncome|CoapplicantIncome|LoanAmount|Loan_Amount_Term|Credit_History|Property_Area|Loan_Status|   full_detail|   Loan_detail|net_salary|age|
+--------+------+-------+----------+------------+-------------+---------------+-----------------+----------+----------------+--------------+-------------+-----------+--------------+--------------+----------+---+
|LP001002|  Male|     No|         0|    Graduate|           No|           5849|              0.0|      NULL|             360|             1|        Urban|          Y| LP001002 5849| LP001002 5849|     10018| 43|
|LP001003|  Male|    Yes|         1|    Graduate|           No|           4583|           1508.0|       128|             360|             1|        Rura

In [None]:
# Transformation 3: keep only the rows whose age is 30 or above
load_data = load_data.where(col('age') >= 30)
load_data.show()


+--------+------+-------+----------+------------+-------------+---------------+-----------------+----------+----------------+--------------+-------------+-----------+--------------+--------------+----------+---+
| Loan_ID|Gender|Married|Dependents|   Education|Self_Employed|ApplicantIncome|CoapplicantIncome|LoanAmount|Loan_Amount_Term|Credit_History|Property_Area|Loan_Status|   full_detail|   Loan_detail|net_salary|age|
+--------+------+-------+----------+------------+-------------+---------------+-----------------+----------+----------------+--------------+-------------+-----------+--------------+--------------+----------+---+
|LP001002|  Male|     No|         0|    Graduate|           No|           5849|              0.0|      NULL|             360|             1|        Urban|          Y| LP001002 5849| LP001002 5849|     10018| 43|
|LP001006|  Male|    Yes|         0|Not Graduate|           No|           2583|           2358.0|       120|             360|             1|        Urba

In [None]:
# Transformation 4: group by age and calculate the average net salary.
# The dict-style agg() names its result column 'avg(net_salary)', so that is
# the column that must be renamed. withColumnRenamed() silently does nothing
# when the source column does not exist, which is why renaming 'avg(salary)'
# left the column header unchanged.
avg_salary_by_age = (
    load_data.groupBy('age')
    .agg({'net_salary': 'avg'})
    .withColumnRenamed('avg(net_salary)', 'avg_salary')
)
avg_salary_by_age.show()

+---+------------------+
|age|   avg(net_salary)|
+---+------------------+
| 34|10027.666666666666|
| 50|10030.842105263158|
| 43|           10022.3|
| 32|10024.185185185184|
| 31|10021.333333333334|
| 39|10021.583333333334|
| 41| 10028.78947368421|
| 33|10024.533333333333|
| 48|           10025.4|
| 44|10023.611111111111|
| 37| 10017.57142857143|
| 49|10023.181818181818|
| 35|10030.166666666666|
| 36|10023.695652173914|
| 38|           10031.0|
| 42| 10025.91304347826|
| 30|           10028.0|
| 46| 10028.42857142857|
| 40|           10021.0|
| 47|10021.793103448275|
+---+------------------+
only showing top 20 rows



In [None]:
# Sort the filtered rows by age in ascending order and preview the result.
load_data = load_data.sort("age")
load_data.show()

+--------+------+-------+----------+------------+-------------+---------------+-----------------+----------+----------------+--------------+-------------+-----------+--------------+--------------+----------+---+
| Loan_ID|Gender|Married|Dependents|   Education|Self_Employed|ApplicantIncome|CoapplicantIncome|LoanAmount|Loan_Amount_Term|Credit_History|Property_Area|Loan_Status|   full_detail|   Loan_detail|net_salary|age|
+--------+------+-------+----------+------------+-------------+---------------+-----------------+----------+----------------+--------------+-------------+-----------+--------------+--------------+----------+---+
|LP001964|  Male|    Yes|         0|Not Graduate|           No|           1800|           2934.0|        93|             360|             0|        Urban|          N| LP001964 1800| LP001964 1800|     10025| 30|
|LP001032|  Male|     No|         0|    Graduate|           No|           4950|              0.0|       125|             360|             1|        Urba

In [None]:
# Load step: write the transformed rows out as CSV at target_path, with a
# header row, replacing any output that already exists there.
# NOTE(review): Spark normally writes a directory of part files at this path
# rather than a single .csv file — confirm that is what consumers expect.
(
    load_data.write
    .mode('overwrite')
    .option('header', True)
    .csv(target_path)
)