## Basic Structured Operations


### Step 1: Initialize PySpark Session


In [209]:
import findspark
findspark.init()
from pyspark.sql import SparkSession


# Create a Spark sessionoccupation.createOrReplaceTempView("occupation_view_2")

spark = SparkSession.builder.appName("day2").getOrCreate()


### Step 2: Load the Dataset


In [210]:
# Load the Chipotle dataset into a Spark DataFrame
data_path = "../data/occupation.csv"  # Replace with the actual path
occupation = spark.read.csv(data_path, header=True, inferSchema=True)





In [211]:
occupation.printSchema()

root
 |-- user_id: integer (nullable = true)
 |-- age: integer (nullable = true)
 |-- gender: string (nullable = true)
 |-- occupation: string (nullable = true)
 |-- zip_code: string (nullable = true)



### Problem 1: Selecting Specific Columns
Problem: Select the "user_id," "age," and "occupation" columns from the occupation DataFrame.

In [212]:

print(occupation.columns)
result = occupation.select('user_id','age','occupation')
result.show()

['user_id', 'age', 'gender', 'occupation', 'zip_code']
+-------+---+-------------+
|user_id|age|   occupation|
+-------+---+-------------+
|      1| 24|   technician|
|      2| 53|        other|
|      3| 23|       writer|
|      4| 24|   technician|
|      5| 33|        other|
|      6| 42|    executive|
|      7| 57|administrator|
|      8| 36|administrator|
|      9| 29|      student|
|     10| 53|       lawyer|
|     11| 39|        other|
|     12| 28|        other|
|     13| 47|     educator|
|     14| 45|    scientist|
|     15| 49|     educator|
|     16| 21|entertainment|
|     17| 30|   programmer|
|     18| 35|        other|
|     19| 40|    librarian|
|     20| 42|    homemaker|
+-------+---+-------------+
only showing top 20 rows



### Problem 2: Filtering Rows based on Condition
Problem: Find the users who are older than 30 years from the occupation DataFrame.

In [213]:
from pyspark.sql.functions import col,when,lit
result = occupation.select('*').filter(col('age') > 30)
result.show()


+-------+---+------+-------------+--------+
|user_id|age|gender|   occupation|zip_code|
+-------+---+------+-------------+--------+
|      2| 53|     F|        other|   94043|
|      5| 33|     F|        other|   15213|
|      6| 42|     M|    executive|   98101|
|      7| 57|     M|administrator|   91344|
|      8| 36|     M|administrator|   05201|
|     10| 53|     M|       lawyer|   90703|
|     11| 39|     F|        other|   30329|
|     13| 47|     M|     educator|   29206|
|     14| 45|     M|    scientist|   55106|
|     15| 49|     F|     educator|   97301|
|     18| 35|     F|        other|   37212|
|     19| 40|     M|    librarian|   02138|
|     20| 42|     F|    homemaker|   95660|
|     25| 39|     M|     engineer|   55107|
|     26| 49|     M|     engineer|   21044|
|     27| 40|     F|    librarian|   30030|
|     28| 32|     M|       writer|   55369|
|     29| 41|     M|   programmer|   94043|
|     34| 38|     F|administrator|   42141|
|     39| 41|     M|entertainmen

### Problem 3: Counting and Grouping
Problem: Count the number of users in each occupation from the occupation DataFrame.

In [214]:
result = occupation.groupBy('occupation').count()

result.show()

+-------------+-----+
|   occupation|count|
+-------------+-----+
|    librarian|   51|
|      retired|   14|
|       lawyer|   12|
|         none|    9|
|       writer|   45|
|   programmer|   66|
|    marketing|   26|
|        other|  105|
|    executive|   32|
|    scientist|   31|
|      student|  196|
|     salesman|   12|
|       artist|   28|
|   technician|   27|
|administrator|   79|
|     engineer|   67|
|   healthcare|   16|
|     educator|   95|
|entertainment|   18|
|    homemaker|    7|
+-------------+-----+
only showing top 20 rows



### Problem 4: Adding a New Column
Problem: Add a new column "age_group" to the occupation DataFrame based on the age of the users. Divide users into age groups: "18-25", "26-35", "36-50", and "51+".

In [215]:
occupation = occupation.withColumn("age_group",
                                 when( (col('age')>18) & (col('age')<=25)  ,"18-25")
                                .when((col('age')>25) &  (col('age')<=35) ,"26-35" ) 
                                .when((col('age')>36) &  (col('age')<=50),"36-50" )
                                .when((col('age')>50),  "51+"))

In [216]:
#for future use
occupation.createGlobalTempView("occupation_glob")

In [217]:
occupation.show()

+-------+---+------+-------------+--------+---------+
|user_id|age|gender|   occupation|zip_code|age_group|
+-------+---+------+-------------+--------+---------+
|      1| 24|     M|   technician|   85711|    18-25|
|      2| 53|     F|        other|   94043|      51+|
|      3| 23|     M|       writer|   32067|    18-25|
|      4| 24|     M|   technician|   43537|    18-25|
|      5| 33|     F|        other|   15213|    26-35|
|      6| 42|     M|    executive|   98101|    36-50|
|      7| 57|     M|administrator|   91344|      51+|
|      8| 36|     M|administrator|   05201|     null|
|      9| 29|     M|      student|   01002|    26-35|
|     10| 53|     M|       lawyer|   90703|      51+|
|     11| 39|     F|        other|   30329|    36-50|
|     12| 28|     F|        other|   06405|    26-35|
|     13| 47|     M|     educator|   29206|    36-50|
|     14| 45|     M|    scientist|   55106|    36-50|
|     15| 49|     F|     educator|   97301|    36-50|
|     16| 21|     M|entertai

### Problem 5: Creating DataFrames and Converting to Spark Types
Problem: Given the provided code snippet, create a DataFrame df using the given data and schema. The schema includes columns for firstname, middlename, lastname, id, gender, and salary. After creating the DataFrame, print its schema and display its content without truncation.

In [218]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType


In [219]:
 

#Define the schema using StructType and StructField
schema = StructType([
    StructField("firstname", StringType(), True),
    StructField("middlename", StringType(), True),
    StructField("lastname", StringType(), True),
    StructField("id", StringType(),True),
    StructField("gender", StringType(),True),
    StructField("salary",IntegerType(),True)
])

# Create a list of tuples representing the data
data = [
    ("James", "", "Smith","36636","M",3000),
    ("Michael", "Rose", "","40028","M",4000),
    ("Robert", "", "Williams","42114","M",4000),
    ("Maria", "Anne", "Jones","39192","F",4000),
    ("Jen", "Mary", "Brown","","F",-1)
]

# Create a DataFrame from the data and the defined schema
df = spark.createDataFrame(data, schema)

# Show the DataFrame
df.printSchema()

root
 |-- firstname: string (nullable = true)
 |-- middlename: string (nullable = true)
 |-- lastname: string (nullable = true)
 |-- id: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: integer (nullable = true)



### Problem 6: Adding and Renaming Columns
Problem: Add a new column "gender" to the existing DataFrame and rename the "Age" column to "Years".

In [220]:
#this view is used in addditional problems
occupation.createOrReplaceTempView("occupation_view_2")

occupation = occupation.withColumn('gender',lit("Unknown"))
occupation = occupation.withColumnRenamed('Age',"Years" )


In [227]:
occupation.show()

+-------+-----+-------+-------------+--------+---------+
|user_id|Years| gender|   occupation|zip_code|age_group|
+-------+-----+-------+-------------+--------+---------+
|    481|   73|Unknown|      retired|   37771|      51+|
|    767|   70|Unknown|     engineer|   00000|      51+|
|    803|   70|Unknown|administrator|   78212|      51+|
|    860|   70|Unknown|      retired|   48322|      51+|
|    559|   69|Unknown|    executive|   10022|      51+|
|    585|   69|Unknown|    librarian|   98501|      51+|
|    349|   68|Unknown|      retired|   61455|      51+|
|    573|   68|Unknown|      retired|   48911|      51+|
|    211|   66|Unknown|     salesman|   32605|      51+|
|    318|   65|Unknown|      retired|   06518|      51+|
|    564|   65|Unknown|      retired|   94591|      51+|
|    651|   65|Unknown|      retired|   02903|      51+|
|    423|   64|Unknown|        other|   91606|      51+|
|    845|   64|Unknown|       doctor|   97405|      51+|
|    364|   63|Unknown|     eng

### Problem 7: Filtering Rows and Sorting
Problem: Filter out users who are younger than 30 years and sort the DataFrame by age in descending order.

In [228]:
occupation = occupation.filter(col('Years')>30).orderBy('Years',ascending = False)

In [229]:
occupation.show()

+-------+-----+-------+-------------+--------+---------+
|user_id|Years| gender|   occupation|zip_code|age_group|
+-------+-----+-------+-------------+--------+---------+
|    481|   73|Unknown|      retired|   37771|      51+|
|    767|   70|Unknown|     engineer|   00000|      51+|
|    803|   70|Unknown|administrator|   78212|      51+|
|    860|   70|Unknown|      retired|   48322|      51+|
|    559|   69|Unknown|    executive|   10022|      51+|
|    585|   69|Unknown|    librarian|   98501|      51+|
|    349|   68|Unknown|      retired|   61455|      51+|
|    573|   68|Unknown|      retired|   48911|      51+|
|    211|   66|Unknown|     salesman|   32605|      51+|
|    318|   65|Unknown|      retired|   06518|      51+|
|    564|   65|Unknown|      retired|   94591|      51+|
|    651|   65|Unknown|      retired|   02903|      51+|
|    423|   64|Unknown|        other|   91606|      51+|
|    845|   64|Unknown|       doctor|   97405|      51+|
|    364|   63|Unknown|     eng

### Problem 8: Repartitioning and Collecting Rows
Problem: Repartition the DataFrame into 2 partitions without shuffling the data, then collect and display all rows in the driver and print number of partitions

In [230]:
df_repart= df.repartition(2)
rows = df_repart.collect()

for row in rows:
    print(row)


Row(firstname='Michael', middlename='Rose', lastname='', id='40028', gender='M', salary=4000)
Row(firstname='Robert', middlename='', lastname='Williams', id='42114', gender='M', salary=4000)
Row(firstname='Maria', middlename='Anne', lastname='Jones', id='39192', gender='F', salary=4000)
Row(firstname='James', middlename='', lastname='Smith', id='36636', gender='M', salary=3000)
Row(firstname='Jen', middlename='Mary', lastname='Brown', id='', gender='F', salary=-1)


In [231]:
print ("Number of Partitions: ",df_repart.rdd.getNumPartitions())

Number of Partitions:  2


### Additional questions:

Use both spark SQL and Pyspark to obtain answer wherever relevant

#### Filter out rows where the age is greater than 30 and create a new DataFrame. Then, add a new column named "is_elderly" with a value of "True" for these rows and "False" otherwise.Rename the "gender" column to "sex".

In [232]:
# Spark SQL
occupation.createOrReplaceTempView("occupation_view")
filtered_occupation =spark.sql("select * from occupation_view where years>30 ") 
filtered_occupation = filtered_occupation.withColumnRenamed("gender","sex")
filtered_occupation.show()

+-------+-----+-------+-------------+--------+---------+
|user_id|Years|    sex|   occupation|zip_code|age_group|
+-------+-----+-------+-------------+--------+---------+
|    481|   73|Unknown|      retired|   37771|      51+|
|    767|   70|Unknown|     engineer|   00000|      51+|
|    803|   70|Unknown|administrator|   78212|      51+|
|    860|   70|Unknown|      retired|   48322|      51+|
|    559|   69|Unknown|    executive|   10022|      51+|
|    585|   69|Unknown|    librarian|   98501|      51+|
|    349|   68|Unknown|      retired|   61455|      51+|
|    573|   68|Unknown|      retired|   48911|      51+|
|    211|   66|Unknown|     salesman|   32605|      51+|
|    318|   65|Unknown|      retired|   06518|      51+|
|    564|   65|Unknown|      retired|   94591|      51+|
|    651|   65|Unknown|      retired|   02903|      51+|
|    423|   64|Unknown|        other|   91606|      51+|
|    845|   64|Unknown|       doctor|   97405|      51+|
|    364|   63|Unknown|     eng

In [233]:
filtered_occupation= filtered_occupation.withColumn("is_elderly",col('Years')>30)
filtered_occupation.show() 

+-------+-----+-------+-------------+--------+---------+----------+
|user_id|Years|    sex|   occupation|zip_code|age_group|is_elderly|
+-------+-----+-------+-------------+--------+---------+----------+
|    481|   73|Unknown|      retired|   37771|      51+|      true|
|    767|   70|Unknown|     engineer|   00000|      51+|      true|
|    803|   70|Unknown|administrator|   78212|      51+|      true|
|    860|   70|Unknown|      retired|   48322|      51+|      true|
|    559|   69|Unknown|    executive|   10022|      51+|      true|
|    585|   69|Unknown|    librarian|   98501|      51+|      true|
|    349|   68|Unknown|      retired|   61455|      51+|      true|
|    573|   68|Unknown|      retired|   48911|      51+|      true|
|    211|   66|Unknown|     salesman|   32605|      51+|      true|
|    318|   65|Unknown|      retired|   06518|      51+|      true|
|    564|   65|Unknown|      retired|   94591|      51+|      true|
|    651|   65|Unknown|      retired|   02903|  

In [234]:
# Pyspark
filtered_occ = occupation.filter(col("age") > 30)
filtered_occ = filtered_occ.withColumnRenamed("gender","sex")
filtered_occ = filtered_occ.withColumn('is_elderly',col('Years')>30)
filtered_occ.show()




+-------+-----+-------+-------------+--------+---------+----------+
|user_id|Years|    sex|   occupation|zip_code|age_group|is_elderly|
+-------+-----+-------+-------------+--------+---------+----------+
|    481|   73|Unknown|      retired|   37771|      51+|      true|
|    767|   70|Unknown|     engineer|   00000|      51+|      true|
|    803|   70|Unknown|administrator|   78212|      51+|      true|
|    860|   70|Unknown|      retired|   48322|      51+|      true|
|    559|   69|Unknown|    executive|   10022|      51+|      true|
|    585|   69|Unknown|    librarian|   98501|      51+|      true|
|    349|   68|Unknown|      retired|   61455|      51+|      true|
|    573|   68|Unknown|      retired|   48911|      51+|      true|
|    211|   66|Unknown|     salesman|   32605|      51+|      true|
|    318|   65|Unknown|      retired|   06518|      51+|      true|
|    564|   65|Unknown|      retired|   94591|      51+|      true|
|    651|   65|Unknown|      retired|   02903|  

#### Calculate the average age of male and female users separately. Present the result in a new DataFrame with columns "gender" and "avg_age".

In [235]:
# Spark SQL

query =""" 
            select gender,
                    avg(age) as avg_age 
            from 
                    occupation_view_2
            group by 
                    gender
                  
"""


updated_df = spark.sql(query)
updated_df.show()


query =""" 
            select gender,
                    avg(age) as avg_age 
            from 
                    occupation_view_2
            group by 
                    gender
                  
"""


updated_df = spark.sql(query)
updated_df.show()

+------+------------------+
|gender|           avg_age|
+------+------------------+
|     F| 33.81318681318681|
|     M|34.149253731343286|
+------+------------------+

+------+------------------+
|gender|           avg_age|
+------+------------------+
|     F| 33.81318681318681|
|     M|34.149253731343286|
+------+------------------+



In [208]:
# Pyspark
from pyspark.sql.functions import avg
avg_age_gender = occupation_2.groupBy("gender").agg(avg(col("age")).alias("avg_age"))

ImportError: cannot import name 'alias' from 'pyspark.sql.functions' (/opt/spark/python/pyspark/sql/functions.py)

#### Add a new column named "full_name" to the dataset by concatenating the "user_id" and "occupation" columns. Then, rename the "zip_code" column to "postal_code" in the same DataFrame.

In [236]:
# Spark SQL
occupation.createOrReplaceTempView("occupation_view")
query = """    
        select user_id,
               Years,
               gender,
               occupation,
               age_group, 
              concat(user_id,'-',occupation) as full_name,
              zip_code as postal_code
        from occupation_view
"""

new_df = spark.sql(query)
new_df.show()


+-------+-----+-------+-------------+---------+-----------------+-----------+
|user_id|Years| gender|   occupation|age_group|        full_name|postal_code|
+-------+-----+-------+-------------+---------+-----------------+-----------+
|    481|   73|Unknown|      retired|      51+|      481-retired|      37771|
|    767|   70|Unknown|     engineer|      51+|     767-engineer|      00000|
|    803|   70|Unknown|administrator|      51+|803-administrator|      78212|
|    860|   70|Unknown|      retired|      51+|      860-retired|      48322|
|    559|   69|Unknown|    executive|      51+|    559-executive|      10022|
|    585|   69|Unknown|    librarian|      51+|    585-librarian|      98501|
|    349|   68|Unknown|      retired|      51+|      349-retired|      61455|
|    573|   68|Unknown|      retired|      51+|      573-retired|      48911|
|    211|   66|Unknown|     salesman|      51+|     211-salesman|      32605|
|    318|   65|Unknown|      retired|      51+|      318-retired

In [143]:
# Pyspark
from pyspark.sql.functions import concat

updated_df = occupation.withColumn("full_name", concat(col("user_id"), col("occupation"))) \
                        .withColumnRenamed("zip_code","postal_code")

updated_df.show()



+-------+-----+-------+-------------+-----------+---------+----------------+
|user_id|Years| gender|   occupation|postal_code|age_group|       full_name|
+-------+-----+-------+-------------+-----------+---------+----------------+
|    481|   73|Unknown|      retired|      37771|      51+|      481retired|
|    767|   70|Unknown|     engineer|      00000|      51+|     767engineer|
|    803|   70|Unknown|administrator|      78212|      51+|803administrator|
|    860|   70|Unknown|      retired|      48322|      51+|      860retired|
|    559|   69|Unknown|    executive|      10022|      51+|    559executive|
|    585|   69|Unknown|    librarian|      98501|      51+|    585librarian|
|    349|   68|Unknown|      retired|      61455|      51+|      349retired|
|    573|   68|Unknown|      retired|      48911|      51+|      573retired|
|    211|   66|Unknown|     salesman|      32605|      51+|     211salesman|
|    318|   65|Unknown|      retired|      06518|      51+|      318retired|

#### Filter out rows where occupation is 'technician', select only the "user_id" and "age" columns, and then add a new column "age_diff" that calculates the difference between the user's age and the average age in the dataset.

In [237]:
# Spark SQL
from pyspark.sql.functions import avg
occupation.createOrReplaceTempView("occupation_view")
query = """  with cte as (  
            select *  ,
                 avg(Years)  over() as avg_years 
                   
            from 
                occupation_view
            where 
                occupation = 'technician'
                )
          select *,
                 round((cast(Years as float) - avg_years),2) as diff
          from 
                 cte
"""

new_df = spark.sql(query)
new_df.show()


+-------+-----+-------+----------+--------+---------+-----------------+-----+
|user_id|Years| gender|occupation|zip_code|age_group|        avg_years| diff|
+-------+-----+-------+----------+--------+---------+-----------------+-----+
|    197|   55|Unknown|technician|   75094|      51+|41.53846153846154|13.46|
|    441|   50|Unknown|technician|   55013|    36-50|41.53846153846154| 8.46|
|    325|   48|Unknown|technician|   02139|    36-50|41.53846153846154| 6.46|
|    488|   48|Unknown|technician|   21012|    36-50|41.53846153846154| 6.46|
|    458|   47|Unknown|technician|   Y1A6B|    36-50|41.53846153846154| 5.46|
|    143|   42|Unknown|technician|   08832|    36-50|41.53846153846154| 0.46|
|    718|   42|Unknown|technician|   64118|    36-50|41.53846153846154| 0.46|
|    938|   38|Unknown|technician|   55038|    36-50|41.53846153846154|-3.54|
|    738|   35|Unknown|technician|   95403|    26-35|41.53846153846154|-6.54|
|    739|   35|Unknown|technician|   73162|    26-35|41.53846153

23/08/31 00:25:32 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/08/31 00:25:32 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/08/31 00:25:32 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/08/31 00:25:32 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/08/31 00:25:32 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/08/31 00:25:32 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/08/31 0

In [200]:
# Pyspark
filtered_df = occupation.filter(col("occupation") == "technician")

avg_age = occupation.select(avg("Years")).first()[0]

filtered_df = filtered_df.withColumn("age_diff", col("Years")- avg_age) \
                        .select("user_id","Years","age_diff")

filtered_df.show()

43.36363636363637
+-------+-----+-------------------+
|user_id|Years|           age_diff|
+-------+-----+-------------------+
|    197|   55| 11.636363636363633|
|    441|   50|  6.636363636363633|
|    325|   48|  4.636363636363633|
|    488|   48|  4.636363636363633|
|    458|   47|  3.636363636363633|
|    143|   42|-1.3636363636363669|
|    718|   42|-1.3636363636363669|
|    938|   38| -5.363636363636367|
|    738|   35| -8.363636363636367|
|    739|   35| -8.363636363636367|
|    294|   34| -9.363636363636367|
|    850|   34| -9.363636363636367|
|    311|   32|-11.363636363636367|
+-------+-----+-------------------+



#### Divide the dataset into two DataFrames: one with male users and another with female users. Repartition both DataFrames to have 2 partitions each. Then, union these two DataFrames together and display the resulting DataFrame.

In [245]:
query = """   
         select *
         from 
            occupation_view_2
         where 
            gender= 'M'
"""

query2 = """   
         select *
         from 
            occupation_view_2
         where 
            gender= 'F'
"""

df_m= spark.sql(query)
df_f = spark.sql(query2)
df_m.show()


+-------+---+------+-------------+--------+---------+
|user_id|age|gender|   occupation|zip_code|age_group|
+-------+---+------+-------------+--------+---------+
|      1| 24|     M|   technician|   85711|    18-25|
|      3| 23|     M|       writer|   32067|    18-25|
|      4| 24|     M|   technician|   43537|    18-25|
|      6| 42|     M|    executive|   98101|    36-50|
|      7| 57|     M|administrator|   91344|      51+|
|      8| 36|     M|administrator|   05201|     null|
|      9| 29|     M|      student|   01002|    26-35|
|     10| 53|     M|       lawyer|   90703|      51+|
|     13| 47|     M|     educator|   29206|    36-50|
|     14| 45|     M|    scientist|   55106|    36-50|
|     16| 21|     M|entertainment|   10309|    18-25|
|     17| 30|     M|   programmer|   06355|    26-35|
|     19| 40|     M|    librarian|   02138|    36-50|
|     21| 26|     M|       writer|   30068|    26-35|
|     22| 25|     M|       writer|   40206|    18-25|
|     25| 39|     M|     eng

In [246]:
df_f.show()

+-------+---+------+-------------+--------+---------+
|user_id|age|gender|   occupation|zip_code|age_group|
+-------+---+------+-------------+--------+---------+
|      2| 53|     F|        other|   94043|      51+|
|      5| 33|     F|        other|   15213|    26-35|
|     11| 39|     F|        other|   30329|    36-50|
|     12| 28|     F|        other|   06405|    26-35|
|     15| 49|     F|     educator|   97301|    36-50|
|     18| 35|     F|        other|   37212|    26-35|
|     20| 42|     F|    homemaker|   95660|    36-50|
|     23| 30|     F|       artist|   48197|    26-35|
|     24| 21|     F|       artist|   94533|    18-25|
|     27| 40|     F|    librarian|   30030|    36-50|
|     32| 28|     F|      student|   78741|    26-35|
|     34| 38|     F|administrator|   42141|    36-50|
|     35| 20|     F|    homemaker|   42459|    18-25|
|     36| 19|     F|      student|   93117|    18-25|
|     38| 28|     F|        other|   54467|    26-35|
|     43| 29|     F|    libr

In [247]:
partitioned_df_m = df_m.repartition(2)
partitioned_df_f = df_f.repartition(2)


In [248]:
union_df = partitioned_df_m.union(partitioned_df_f)
union_df.show()

+-------+---+------+-------------+--------+---------+
|user_id|age|gender|   occupation|zip_code|age_group|
+-------+---+------+-------------+--------+---------+
|    800| 25|     M|   programmer|   55337|    18-25|
|    847| 29|     M|      student|   55417|    26-35|
|    440| 30|     M|        other|   48076|    26-35|
|    261| 28|     M|administrator|   85202|    26-35|
|    694| 60|     M|   programmer|   06365|      51+|
|    906| 45|     M|    librarian|   70124|    36-50|
|    806| 27|     M|    marketing|   11217|    26-35|
|    284| 40|     M|    executive|   92629|    36-50|
|    167| 37|     M|        other|   L9G2B|    36-50|
|     74| 39|     M|    scientist|   T8H1N|    36-50|
|    207| 39|     M|    marketing|   92037|    36-50|
|    790| 27|     M|   technician|   80913|    26-35|
|    802| 35|     M|administrator|   34105|    26-35|
|    519| 22|     M|        other|   55320|    18-25|
|    527| 33|     M|    librarian|   12180|    26-35|
|    622| 25|     M|   progr

#### Create and fill a new DataFrame named user_ratings with columns user_id and rating max 10 column. Both user_data and user_ratings share the user_id column. Combine these two DataFrames to create a new DataFrame that includes user information and their corresponding ratings. Make sure to keep only the users present in both DataFrames.

In [249]:
schema = StructType([
    StructField("user_id",  IntegerType(), True),
    StructField("user_ratings", IntegerType(), True)
])

# Create a list of tuples representing the data
data = [
   (2,5),
   (5,1),
   (11,6),
   (12,8),
   (15,3),
   (18,7),
   (800,8),
   (847,4),
   (440,9),
   (261,2)
]

# Create a DataFrame from the data and the defined schema
df_ratings = spark.createDataFrame(data, schema)

df_ratings.show()

+-------+------------+
|user_id|user_ratings|
+-------+------------+
|      2|           5|
|      5|           1|
|     11|           6|
|     12|           8|
|     15|           3|
|     18|           7|
|    800|           8|
|    847|           4|
|    440|           9|
|    261|           2|
+-------+------------+



In [251]:
df_ratings.createOrReplaceTempView("ratings")

query = """    
           select * 
           from 
                occupation_view_2 as o
            inner join
                ratings as r 
            on
                o.user_id = r.user_id
                
"""

combined_df = spark.sql(query)
combined_df.show()

+-------+---+------+-------------+--------+---------+-------+------------+
|user_id|age|gender|   occupation|zip_code|age_group|user_id|user_ratings|
+-------+---+------+-------------+--------+---------+-------+------------+
|      2| 53|     F|        other|   94043|      51+|      2|           5|
|      5| 33|     F|        other|   15213|    26-35|      5|           1|
|     11| 39|     F|        other|   30329|    36-50|     11|           6|
|     12| 28|     F|        other|   06405|    26-35|     12|           8|
|     15| 49|     F|     educator|   97301|    36-50|     15|           3|
|     18| 35|     F|        other|   37212|    26-35|     18|           7|
|    800| 25|     M|   programmer|   55337|    18-25|    800|           8|
|    847| 29|     M|      student|   55417|    26-35|    847|           4|
|    440| 30|     M|        other|   48076|    26-35|    440|           9|
|    261| 28|     M|administrator|   85202|    26-35|    261|           2|
+-------+---+------+-----