In [53]:
from pyspark.sql import SparkSession
import getpass
# OS-level login name; used to give each user their own warehouse directory.
username = getpass.getuser()

# Build (or reuse, via getOrCreate) a Hive-enabled session with YARN as master.
spark = (
    SparkSession.builder
    .appName("Sneha Spark Session")
    .config("spark.sql.warehouse.dir", f"/user/{username}/warehouse")
    .enableHiveSupport()
    .master('yarn')
    .getOrCreate()
)

In [54]:
# Bare expression: lets the notebook render the SparkSession object as the cell output.
spark

In [55]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, ArrayType

In [56]:
# Explicit schema for the users JSON records: five scalar columns, an optional
# array of phone numbers, and a nested address struct.
# NOTE(review): later queries filter on `user_state is not null` /
# `user_city is not null`, so address values evidently can be missing even
# though they are declared nullable=False here — confirm whether the reader
# enforces these flags.
user_schema = StructType([
    StructField("user_id", IntegerType(), nullable=False),
    StructField("user_first_name", StringType(), nullable=False),
    StructField("user_last_name", StringType(), nullable=False),
    StructField("user_email", StringType(), nullable=False),
    StructField("user_gender", StringType(), nullable=False),
    StructField("user_phone_numbers", ArrayType(StringType()), nullable=True),
    StructField(
        "user_address",
        StructType([
            StructField("street", StringType(), nullable=False),
            StructField("city", StringType(), nullable=False),
            StructField("state", StringType(), nullable=False),
            StructField("postal_code", StringType(), nullable=False),
        ]),
        nullable=False,
    ),
])

In [57]:
# Load the users JSON dataset, applying the explicit `user_schema`
# rather than letting Spark infer one.
sms_df = (
    spark.read
    .format("json")
    .schema(user_schema)
    .load("/public/sms/users")
)

In [58]:
# Number of partitions of the RDD underlying the loaded DataFrame.
sms_df.rdd.getNumPartitions()

3

In [59]:
# Total number of rows in the loaded users DataFrame.
sms_df.count()

1000000

In [60]:
# Preview: a DataFrame restricted to at most two rows, rendered as the cell output.
sms_df.limit(2)

user_id,user_first_name,user_last_name,user_email,user_gender,user_phone_numbers,user_address
300001,Abramo,Jaggi,ajaggi0@biglobe.n...,Male,"[7183860953, 2149...",{27777 Loftsgordo...
300002,Honey,Lacase,hlacase1@dedecms.com,Female,"[2039620506, 3129...",{86340 Sherman Al...


In [61]:
from pyspark.sql.functions import col, size

In [62]:
# Flatten the nested address struct into top-level columns, add the number of
# phone numbers per user, and expose the result to Spark SQL as `user_vw`.
# All original columns are kept ("*"); the derived ones are appended after them.
# NOTE(review): what size() returns for a NULL phone-number array depends on
# Spark settings (-1 vs NULL) — confirm before relying on num_phn_numbers
# for users without phone numbers.
(
    sms_df
    .select(
        "*",
        col("user_address.street").alias("user_street"),
        col("user_address.city").alias("user_city"),
        col("user_address.state").alias("user_state"),
        col("user_address.postal_code").alias("user_postal_code"),
        size(col("user_phone_numbers")).alias("num_phn_numbers"),
    )
    .createOrReplaceTempView("user_vw")
)

In [63]:
# Number of distinct users whose address state is New York.
# The column is spelled `user_id` exactly as in the schema, so the query does
# not depend on Spark's case-insensitive identifier resolution.
spark.sql(
    "select count(distinct user_id) as user_cnt "
    "from user_vw "
    "where user_state = 'New York'"
).show()

+--------+
|user_cnt|
+--------+
|   49576|
+--------+



In [64]:
# State with the highest number of distinct postal codes (top row only).
# Adjacent string literals are concatenated by Python into a single SQL string.
spark.sql(
    "select count(distinct(user_postal_code)) as cnt,user_state "
    "from user_vw "
    "group by user_state "
    "order by cnt desc limit(1)"
)

cnt,user_state
206,California


In [65]:
# City with the highest number of distinct users, ignoring rows without a city.
# The original text had a stray `limit` right after `user_vw`; it was only
# accepted because the parser treated it as a table alias, and it is a syntax
# error wherever LIMIT is a reserved keyword. It is removed here; the result is
# unchanged because the alias was never referenced.
spark.sql(
    "select count(distinct user_id) as cnt, user_city "
    "from user_vw "
    "where user_city is not null "
    "group by user_city "
    "order by cnt desc "
    "limit 1"
)

cnt,user_city
28504,Washington


In [69]:
# Distinct users whose email address ends with "bizjournals.com".
spark.sql(
    "select count(distinct user_id) as user_cnt "
    "from user_vw "
    "where user_email like '%bizjournals.com'"
)

user_cnt
2015


In [70]:
# Distinct users that have exactly four phone numbers on record.
spark.sql(
    "select count(distinct user_id) as user_cnt "
    "from user_vw "
    "where num_phn_numbers = 4"
)

user_cnt
179041


In [71]:
# Distinct users whose phone-number array is NULL (no phone numbers recorded).
spark.sql(
    "select count(distinct user_id) as user_cnt "
    "from user_vw "
    "where user_phone_numbers is null"
)

user_cnt
108981


In [72]:
# Persist the loaded users DataFrame as Parquet, replacing any previous output
# at the target location ("overwrite" mode).
# The target directory is derived from `username` (getpass.getuser(), set in the
# session-setup cell) instead of a hard-coded account name, matching how the
# warehouse directory is built.
(
    sms_df.write
    .format("parquet")
    .mode("overwrite")
    .option("path", f"/user/{username}/week9/assignment")
    .save()
)

In [75]:
# Male / female user counts per state, restricted to users that have a state
# and a non-NULL phone-number array.
# Uses conditional aggregation: count() skips NULLs, so each `case` expression
# counts only the rows of one gender. This needs a single `group by user_state`
# instead of grouping by (state, gender) in a subquery and summing again, and
# it reports 0 (not NULL) for a state that has no users of a given gender.
spark.sql("""
    select user_state,
           count(case when user_gender = 'Male' then user_id end) as male,
           count(case when user_gender = 'Female' then user_id end) as female
    from user_vw
    where user_state is not null
      and user_phone_numbers is not null
    group by user_state
    order by user_state
""")

user_state,male,female
Alabama,9307,9178
Alaska,1882,1938
Arizona,9406,9543
Arkansas,2420,2416
California,49120,48716
Colorado,10128,10125
Connecticut,5797,5917
Delaware,1651,1654
District of Columbia,14212,14292
Florida,36692,36688
