At first, create a spark application, read the json file into a dataframe, and explore the data

In [4]:
from pyspark.sql import SparkSession

In [5]:
spark = SparkSession.builder.appName("udemy").getOrCreate()

22/10/19 22:06:26 WARN Utils: Your hostname, pallavi-xps resolves to a loopback address: 127.0.1.1; using 192.168.1.79 instead (on interface wlp2s0)
22/10/19 22:06:26 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/10/19 22:06:27 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [10]:
# read the udemy_courses.json file
df = spark.read.format('json').load('udemy_courses.json')

In [11]:
#view the data in the dataframe
df.show()

+---------------+------------------+---------+--------------------+-------+------------------+------------+-----------+---------------+-----+--------------------+----------------+--------------------+
|_corrupt_record|  content_duration|course_id|        course_title|is_paid|             level|num_lectures|num_reviews|num_subscribers|price| published_timestamp|         subject|                 url|
+---------------+------------------+---------+--------------------+-------+------------------+------------+-----------+---------------+-----+--------------------+----------------+--------------------+
|           null|               1.5|  1070968|Ultimate Investme...|   True|        All Levels|          51|         23|           2147|  200|2017-01-18T20:58:58Z|Business Finance|https://www.udemy...|
|           null|              39.0|  1113822|Complete GST Cour...|   True|        All Levels|         274|        923|           2792|   75|2017-03-09T16:34:20Z|Business Finance|https://www.udemy

In [10]:
#view the schema of the dataframe
df.printSchema()

root
 |-- _corrupt_record: string (nullable = true)
 |-- content_duration: string (nullable = true)
 |-- course_id: string (nullable = true)
 |-- course_title: string (nullable = true)
 |-- is_paid: string (nullable = true)
 |-- level: string (nullable = true)
 |-- num_lectures: string (nullable = true)
 |-- num_reviews: string (nullable = true)
 |-- num_subscribers: string (nullable = true)
 |-- price: string (nullable = true)
 |-- published_timestamp: string (nullable = true)
 |-- subject: string (nullable = true)
 |-- url: string (nullable = true)



In [29]:
#all columns have data type string. So, we need to convert the datatypes using cast method

from pyspark.sql.types import StringType, FloatType, IntegerType, BooleanType, TimestampType
from pyspark.sql.functions import col


df = df\
            .withColumn('_corrupt_record', col('_corrupt_record').cast(StringType()))\
            .withColumn('content_duration', col('content_duration').cast(FloatType()))\
            .withColumn('course_id', col('course_id').cast(IntegerType()))\
            .withColumn('course_title', col('course_title').cast(StringType()))\
            .withColumn('is_paid', col('is_paid').cast(BooleanType()))\
            .withColumn('level', col('level').cast(StringType()))\
            .withColumn('num_lectures', col('num_lectures').cast(IntegerType()))\
            .withColumn('num_reviews', col('num_reviews').cast(IntegerType()))\
            .withColumn('num_subscribers', col('num_subscribers').cast(IntegerType()))\
            .withColumn('price', col('price').cast(IntegerType()))\
            .withColumn('published_timestamp', col('published_timestamp').cast(TimestampType()))\
            .withColumn('subject', col('subject').cast(StringType()))\
            .withColumn('url', col('url').cast(StringType()))


# df = df.na.drop(subset=['Region'])
df.printSchema()

root
 |-- _corrupt_record: string (nullable = true)
 |-- content_duration: float (nullable = true)
 |-- course_id: integer (nullable = true)
 |-- course_title: string (nullable = true)
 |-- is_paid: boolean (nullable = true)
 |-- level: string (nullable = true)
 |-- num_lectures: integer (nullable = true)
 |-- num_reviews: integer (nullable = true)
 |-- num_subscribers: integer (nullable = true)
 |-- price: integer (nullable = true)
 |-- published_timestamp: timestamp (nullable = true)
 |-- subject: string (nullable = true)
 |-- url: string (nullable = true)



In [30]:
df.show()

+---------------+----------------+---------+--------------------+-------+------------------+------------+-----------+---------------+-----+-------------------+----------------+--------------------+
|_corrupt_record|content_duration|course_id|        course_title|is_paid|             level|num_lectures|num_reviews|num_subscribers|price|published_timestamp|         subject|                 url|
+---------------+----------------+---------+--------------------+-------+------------------+------------+-----------+---------------+-----+-------------------+----------------+--------------------+
|           null|             1.5|  1070968|Ultimate Investme...|   true|        All Levels|          51|         23|           2147|  200|2017-01-19 02:43:58|Business Finance|https://www.udemy...|
|           null|            39.0|  1113822|Complete GST Cour...|   true|        All Levels|         274|        923|           2792|   75|2017-03-09 22:19:20|Business Finance|https://www.udemy...|
|         

In [32]:
#create SQL table from dataframe

df.createOrReplaceTempView("dftable")

# Questions and Solutions

###   1. What are the best free courses by subject?

In [19]:
best_free_courses = spark.sql("""
                                SELECT subject, course_title from dftable
                                
                                 """)

AnalysisException: expression 'dftable.course_id' is neither present in the group by, nor is it an aggregate function. Add to group by or wrap in first() (or first_value) if you don't care which value you get.;
Aggregate [subject#19], [subject#19, course_id#10, course_title#11, num_reviews#15]
+- SubqueryAlias dftable
   +- View (`dftable`, [_corrupt_record#8,content_duration#9,course_id#10,course_title#11,is_paid#12,level#13,num_lectures#14,num_reviews#15,num_subscribers#16,price#17,published_timestamp#18,subject#19,url#20])
      +- Relation [_corrupt_record#8,content_duration#9,course_id#10,course_title#11,is_paid#12,level#13,num_lectures#14,num_reviews#15,num_subscribers#16,price#17,published_timestamp#18,subject#19,url#20] json


###     2. What are the most popular courses?

In [None]:
select course_from dftable where 

In [24]:
popular_courses = spark.sql("""
SELECT a.course_id, a.course_title, a.num_subscribers
FROM dftable a
INNER JOIN (
    SELECT b.course_id, MAX(b.num_subscribers) as No_of_Subscribers
    FROM dftable b
    GROUP BY b.course_id
) b ON a.course_id = b.course_id""")

In [25]:
popular_courses.show()

+---------+--------------------+---------------+
|course_id|        course_title|num_subscribers|
+---------+--------------------+---------------+
|  1070968|Ultimate Investme...|           2147|
|  1113822|Complete GST Cour...|           2792|
|  1006314|Financial Modelin...|           2174|
|  1210588|Beginner to Pro -...|           2451|
|  1011058|How To Maximize Y...|           1276|
|   192870|Trading Penny Sto...|           9221|
|   739964|Investing And Tra...|           1540|
|   403100|Trading Stock Cha...|           2917|
|   476268|Options Trading 3...|           5172|
|  1167710|The Only Investme...|            827|
|   592338|Forex Trading Sec...|           4284|
|   975046|Trading Options W...|           1380|
|   742602|Financial Managem...|           3607|
|   794151|Forex Trading Cou...|           4061|
|  1196544|Python Algo Tradi...|            294|
|   504036|Short Selling: Le...|           2276|
|   719698|Basic Technical A...|           4919|
|   564966|The Compl

###   3. List the courses that are specialized to “Business Finance” and find the average number of subscribers, reviews, price and lectures on the subject.

In [49]:
business_finance_courses = spark.sql("""
                                SELECT course_title courses,
                                avg(num_subscribers) avg_no_of_subscribers,
                                avg(num_reviews) avg_no_of_reviews,
                                avg(price) avg_price,
                                avg(num_lectures) avg_no_of_lectures
                                from dftable                        
                                where subject = "Business Finance"
                                group by course_title
                                """)

business_finance_courses.show()

+--------------------+---------------------+-----------------+---------+------------------+
|             courses|avg_no_of_subscribers|avg_no_of_reviews|avg_price|avg_no_of_lectures|
+--------------------+---------------------+-----------------+---------+------------------+
|Forex SOS Course:...|               1315.0|             48.0|     50.0|              59.0|
|Accounting Standa...|               2222.0|              9.0|     20.0|              12.0|
|Collection of Adv...|               2025.0|             34.0|     40.0|               9.0|
|Buying Call and P...|                266.0|             20.0|     60.0|              14.0|
|MQL4 Tutorial Boo...|               1278.0|             33.0|     30.0|              23.0|
|FMT: Forex Market...|                  1.0|              0.0|     20.0|              27.0|
|        Equity Swaps|                394.0|             25.0|     50.0|              18.0|
|Trade Stocks, For...|                116.0|              6.0|     20.0|        

In [40]:
average_data = spark.sql("""
select avg(num_subscribers) avg_num_of_subscribers, 
avg(num_reviews) avg_num_of_reviews,
avg(price) avg_price,
avg(num_lectures) avg_num_of_lectures
from dftable
where subject = "Business Finance"
""")

average_data.show()

+----------------------+------------------+-----------------+-------------------+
|avg_num_of_subscribers|avg_num_of_reviews|        avg_price|avg_num_of_lectures|
+----------------------+------------------+-----------------+-------------------+
|    1563.7748953974894|  63.5163179916318|68.55230125523012|  32.43096234309623|
+----------------------+------------------+-----------------+-------------------+



###   4. How are courses related?

###  5. Which courses offer the best cost benefit?

###     6. Find the courses which have more than 15 lectures.

In [54]:
courses = spark.sql("""
                    SELECT course_title, num_lectures number_of_lectures
                    FROM dftable
                    WHERE num_lectures > 15
                    ORDER BY num_lectures
                    """)

courses.show()

+--------------------+------------------+
|        course_title|number_of_lectures|
+--------------------+------------------+
|JavaScript Fundam...|               100|
|Master Flute Play...|               100|
|Curso Avanzado de...|               100|
|   Learning Drupal 8|               101|
|Learning CSS3 - A...|               101|
|Financial Account...|               101|
|Acoustic Blues Gu...|               101|
|Projects in Expre...|               102|
|Hedge and Mutual ...|               102|
|CFA® Level 1 2014...|               102|
|Master the Bassoo...|               102|
|Professional Pyth...|               102|
|Learn Fun Dreamy ...|               102|
|Python for Financ...|               103|
|Corel Draw X7 Eği...|               103|
|BDD with Ruby on ...|               103|
|Master EmberJS : ...|               103|
|Easy Piano for Ki...|               104|
|Guitar Super Syst...|               104|
|The Complete Guid...|               104|
+--------------------+------------

###     7. List courses on the basis of level.

In [37]:
from pyspark.sql import Window as W
from pyspark.sql.functions import collect_list, collect_set, explode

window_spec = W.partitionBy('level')
course_list_df = df\
                .withColumn('Course List', collect_set('course_title').over(window_spec))\
                .select('level', 'Course List')\
                .distinct()

course_list_df.show()

+------------------+--------------------+
|             level|         Course List|
+------------------+--------------------+
|              null|                  []|
|        All Levels|[Quick Piano Less...|
|    Beginner Level|[First Web Applic...|
|      Expert Level|[Trading Robot Fo...|
|Intermediate Level|[Learn Corel x7 L...|
+------------------+--------------------+



In [None]:
# assuming item type list
from pyspark.sql import Window as W
from pyspark.sql.functions import collect_list, collect_set, explode

window_spec = W.partitionBy('Country')
country_item_list_df = sales_df\
                        .withColumn('Item List', collect_set('Item Type').over(window_spec))\
                        .select('Country', 'Item List')\
                        .distinct()

country_item_list_df.show()

###     8. Find the courses which have duration greater than 2 hours.

In [68]:
courses_duration = spark.sql("""
                                SELECT course_id, course_title, content_duration duration_hours
                                FROM dftable
                                WHERE content_duration > "2.0"
                                ORDER BY 3
                                """)

courses_duration.show()

+---------+--------------------+--------------+
|course_id|        course_title|duration_hours|
+---------+--------------------+--------------+
|   364810|Double your Forex...|           2.5|
|   743914|Value Investing F...|           2.5|
|   637922|Venture Capital. ...|           2.5|
|   403100|Trading Stock Cha...|           2.5|
|   876646|Direction-Indepen...|           2.5|
|  1170894|Python Algo Stock...|           2.5|
|   244000|Forex Trading: Th...|           2.5|
|   401784|Options Trading I...|           2.5|
|   831940|Fundamentals of A...|           2.5|
|   421054|Auditing Basics (...|           2.5|
|   358062|Financial Modelin...|           2.5|
|   538540|Forex Trading vs ...|           2.5|
|  1209694|Become A Forex Tr...|           2.5|
|   252396|How to Pick The R...|           2.5|
|   891490|Accounting Superp...|           2.5|
|  1103590|Trading con retro...|           2.5|
|  1006314|Financial Modelin...|           2.5|
|   507486|Technical Analysi...|        