In [0]:
from pyspark.sql.functions import pandas_udf
from pyspark.sql.types import StructType,StructField, StringType, IntegerType
# Sample employee records: (firstname, middlename, lastname, id, gender, salary).
data2 = [
    ("James", "", "Smith", "36636", "M", 1500),
    ("Michael", "Rose", "Schmidt", "40288", "M", 1950),
    ("Robert", "", "Williams", "42114", "M", 3800),
    ("Maria", "Anne", "Jones", "39192", "F", 1100),
    ("Jen", "Mary", "Brown", "24687", "F", 6000),
]

# Explicit schema: every column is nullable; only 'salary' is an integer,
# all other columns (including 'id') are strings.
schema = (
    StructType()
    .add("firstname", StringType(), True)
    .add("middlename", StringType(), True)
    .add("lastname", StringType(), True)
    .add("id", StringType(), True)
    .add("gender", StringType(), True)
    .add("salary", IntegerType(), True)
)

# Build the DataFrame from the in-memory rows and print it without truncating values.
df = spark.createDataFrame(data=data2, schema=schema)
df.show(truncate=False)

+---------+----------+--------+-----+------+------+
|firstname|middlename|lastname|id   |gender|salary|
+---------+----------+--------+-----+------+------+
|James    |          |Smith   |36636|M     |1500  |
|Michael  |Rose      |Schmidt |40288|M     |1950  |
|Robert   |          |Williams|42114|M     |3800  |
|Maria    |Anne      |Jones   |39192|F     |1100  |
|Jen      |Mary      |Brown   |24687|F     |6000  |
+---------+----------+--------+-----+------+------+



In [0]:
# Challenge_1 (PySpark): Create a new column called 'email_address'. The email address must use the format lastname_firstname@bigdata.com.
from pyspark.sql.functions import col, udf
from pyspark.sql.types import StringType

def generate_email(first, last):
    """Build a lowercase ``lastname_firstname@bigdata.com`` address.

    Args:
        first: Given name, or None.
        last: Family name, or None.

    Returns:
        The lowercased address string, or None when either name is None.
    """
    # Both name columns are declared nullable. Without this guard a missing
    # name would be formatted as the literal text "none" inside the address.
    if first is None or last is None:
        return None
    return f"{last}_{first}@bigdata.com".lower()

# Expose the plain-Python helper as a Spark UDF producing a string column.
emailUDF = udf(generate_email, returnType=StringType())

# Append the derived 'email_address' column and print the table untruncated.
(
    df
    .withColumn(
        "email_address",
        emailUDF(col("firstname"), col("lastname")),
    )
    .show(truncate=False)
)

+---------+----------+--------+-----+------+------+---------------------------+
|firstname|middlename|lastname|id   |gender|salary|email_address              |
+---------+----------+--------+-----+------+------+---------------------------+
|James    |          |Smith   |36636|M     |1500  |smith_james@bigdata.com    |
|Michael  |Rose      |Schmidt |40288|M     |1950  |schmidt_michael@bigdata.com|
|Robert   |          |Williams|42114|M     |3800  |williams_robert@bigdata.com|
|Maria    |Anne      |Jones   |39192|F     |1100  |jones_maria@bigdata.com    |
|Jen      |Mary      |Brown   |24687|F     |6000  |brown_jen@bigdata.com      |
+---------+----------+--------+-----+------+------+---------------------------+



In [0]:
# Challenge_2 (Pandas): Consider the following income tax bracket:
# +----------------+-------+
# | Monthly income | Tax % |
# +----------------+-------+
# | 0-1200         |   5   |
# | 1201-1700      |   8   |
# | 1701-2400      |   10  |
# | 2401-4000      |   12  |
# | 4001-7000      |   14  |
# +----------------+-------+
# Create a new column called 'calculated_taxes'. Calculate how much tax each person will pay based on their income.
import pandas as pd

def condition(income):
    """Return the tax owed on a monthly income.

    The rate of the bracket that contains ``income`` is applied to the whole
    amount: 5% below 1201, 8% below 1701, 10% below 2401, 12% below 4001,
    and 14% for everything else.
    """
    # (exclusive upper bound, rate) pairs, checked in ascending order.
    brackets = (
        (1201, 0.05),
        (1701, 0.08),
        (2401, 0.1),
        (4001, 0.12),
    )
    for upper_bound, rate in brackets:
        if income < upper_bound:
            return income * rate
    # No bracket matched: apply the top rate.
    return income * 0.14
    
@pandas_udf("long")
def calculate(salary: pd.Series) -> pd.Series:
    """Compute the tax owed for each salary in a batch.

    Args:
        salary: Salaries for one batch of rows.

    Returns:
        A Series holding one tax amount per input row, in input order.
    """
    # Series.apply already returns a Series with one entry per input row.
    # Wrapping it in a list would build a one-element Series instead, which
    # breaks the one-output-per-input-row contract of a Series-to-Series UDF.
    # NOTE(review): `condition` returns floats, while the declared "long"
    # return type cannot hold fractional amounts -- consider "double" if
    # fractions of a currency unit matter.
    return salary.apply(condition)

# Append the per-row tax amount and print the resulting table.
df.withColumn('calculated_taxes', calculate(col("salary"))).show()

+---------+----------+--------+-----+------+------+----------------+
|firstname|middlename|lastname|   id|gender|salary|calculated_taxes|
+---------+----------+--------+-----+------+------+----------------+
|    James|          |   Smith|36636|     M|  1500|             120|
|  Michael|      Rose| Schmidt|40288|     M|  1950|             195|
|   Robert|          |Williams|42114|     M|  3800|             456|
|    Maria|      Anne|   Jones|39192|     F|  1100|              55|
|      Jen|      Mary|   Brown|24687|     F|  6000|             840|
+---------+----------+--------+-----+------+------+----------------+

