At first, create a spark application, read the json file into a dataframe, and explore the data

In [1]:
from pyspark.sql import SparkSession

In [2]:
spark = SparkSession.builder.appName("udemy").getOrCreate()

22/10/20 12:53:10 WARN Utils: Your hostname, pallavi-xps resolves to a loopback address: 127.0.1.1; using 192.168.1.79 instead (on interface wlp2s0)
22/10/20 12:53:10 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/10/20 12:53:11 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/10/20 12:53:11 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [3]:
# read the udemy_courses.json file
udemy_df = spark.read.format('json').load('udemy_courses.json')

In [5]:
#view the data in the dataframe
# udemy_df.show()

In [6]:
#removing corrupt record column

udemy_df = udemy_df.drop('_corrupt_record')

In [7]:
#view the schema of the dataframe
udemy_df.printSchema()

root
 |-- content_duration: string (nullable = true)
 |-- course_id: string (nullable = true)
 |-- course_title: string (nullable = true)
 |-- is_paid: string (nullable = true)
 |-- level: string (nullable = true)
 |-- num_lectures: string (nullable = true)
 |-- num_reviews: string (nullable = true)
 |-- num_subscribers: string (nullable = true)
 |-- price: string (nullable = true)
 |-- published_timestamp: string (nullable = true)
 |-- subject: string (nullable = true)
 |-- url: string (nullable = true)



In [14]:
#all columns have data type string. So, we need to convert the datatypes using cast method

from pyspark.sql.types import StringType, FloatType, IntegerType, BooleanType, TimestampType
from pyspark.sql.functions import col


udemy_df = udemy_df\
            .withColumn('content_duration', col('content_duration').cast(FloatType()))\
            .withColumn('course_id', col('course_id').cast(IntegerType()))\
            .withColumn('course_title', col('course_title').cast(StringType()))\
            .withColumn('is_paid', col('is_paid').cast(BooleanType()))\
            .withColumn('level', col('level').cast(StringType()))\
            .withColumn('num_lectures', col('num_lectures').cast(IntegerType()))\
            .withColumn('num_reviews', col('num_reviews').cast(IntegerType()))\
            .withColumn('num_subscribers', col('num_subscribers').cast(IntegerType()))\
            .withColumn('price', col('price').cast(IntegerType()))\
            .withColumn('published_timestamp', col('published_timestamp').cast(TimestampType()))\
            .withColumn('subject', col('subject').cast(StringType()))\
            .withColumn('url', col('url').cast(StringType()))


# df = df.na.drop(subset=['Region'])
udemy_df.printSchema()

root
 |-- content_duration: float (nullable = true)
 |-- course_id: integer (nullable = true)
 |-- course_title: string (nullable = true)
 |-- is_paid: boolean (nullable = true)
 |-- level: string (nullable = true)
 |-- num_lectures: integer (nullable = true)
 |-- num_reviews: integer (nullable = true)
 |-- num_subscribers: integer (nullable = true)
 |-- price: integer (nullable = true)
 |-- published_timestamp: timestamp (nullable = true)
 |-- subject: string (nullable = true)
 |-- url: string (nullable = true)



In [9]:
# udemy_df.show()

In [13]:
# create a separate dataframe for course (since most of the questions are related to course only)

course_df=udemy_df.select(["course_id","course_title","url","is_paid", "price", "num_subscribers","num_reviews",
                           "num_lectures", "subject","level","content_duration", "subject"
                                  ])

course_df.printSchema()
course_df.show()

root
 |-- course_id: string (nullable = true)
 |-- course_title: string (nullable = true)
 |-- url: string (nullable = true)
 |-- is_paid: string (nullable = true)
 |-- price: string (nullable = true)
 |-- num_subscribers: string (nullable = true)
 |-- num_reviews: string (nullable = true)
 |-- num_lectures: string (nullable = true)
 |-- subject: string (nullable = true)
 |-- level: string (nullable = true)
 |-- content_duration: string (nullable = true)
 |-- subject: string (nullable = true)

+---------+--------------------+--------------------+-------+-----+---------------+-----------+------------+----------------+------------------+------------------+----------------+
|course_id|        course_title|                 url|is_paid|price|num_subscribers|num_reviews|num_lectures|         subject|             level|  content_duration|         subject|
+---------+--------------------+--------------------+-------+-----+---------------+-----------+------------+----------------+--------------

In [11]:
#create SQL table from dataframe

udemy_df.createOrReplaceTempView("udemy_table")

# Questions and Solutions

###   1. What are the best free courses by subject?

In [47]:
from pyspark.sql.functions import row_number, rank, sum as sum_


best_free_courses_df = udemy_df\
                    .groupBy('subject', 'course_title')\
                    .agg(sum_('num_subscribers').alias('Total Number of Subscribers'))\
                    .orderBy('subject')

window_spec = W.partitionBy('subject')\
                .orderBy(col('Total Number of Subscribers').desc())

top_3_best_courses = best_free_courses_df\
                            .withColumn('rn', rank().over(window_spec))\
                            .where('rn <= 1')\
                            .drop('rn')

# regions and their five best item types
top_3_best_courses.select('subject', 'course_title').show()

# final result
top_3_best_courses\
    .withColumn('Best Free Courses', collect_list('course_title').over(W.partitionBy('subject')))\
    .select('subject', 'Best Free Courses')\
    .distinct()\
    .show()

+-------------------+--------------------+
|            subject|        course_title|
+-------------------+--------------------+
|               null|                null|
|   Business Finance|Bitcoin or How I ...|
|     Graphic Design|Photoshop In-Dept...|
|Musical Instruments|Free Beginner Ele...|
|    Web Development|Learn HTML5 Progr...|
+-------------------+--------------------+

+-------------------+--------------------+
|            subject|   Best Free Courses|
+-------------------+--------------------+
|               null|                  []|
|   Business Finance|[Bitcoin or How I...|
|     Graphic Design|[Photoshop In-Dep...|
|Musical Instruments|[Free Beginner El...|
|    Web Development|[Learn HTML5 Prog...|
+-------------------+--------------------+



###     2. What are the most popular courses?

In [None]:
from pyspark.sql.functions import row_number, rank

popular_courses_df = df\
                    .groupBy('Region', 'Item Type')\
                    .agg(sum_('Units Sold').alias('Total Units Sold'))\
                    .orderBy('Region')

window_spec = W.partitionBy('Region')\
                .orderBy(col('Total Units Sold').desc())

region_sales_ranked_df = region_sales_df\
                            .withColumn('rn', rank().over(window_spec))\
                            .where('rn <= 5')\
                            .drop('rn')

# regions and their five best item types
region_sales_ranked_df.select('Region', 'Item Type').show()

# final result
region_sales_ranked_df\
    .withColumn('Item List', collect_list('Item Type').over(W.partitionBy('Region')))\
    .select('Region', 'Item List')\
    .distinct()\
    .show()

In [24]:
popular_courses = spark.sql("""
SELECT a.course_id, a.course_title, a.num_subscribers
FROM dftable a
INNER JOIN (
    SELECT b.course_id, MAX(b.num_subscribers) as No_of_Subscribers
    FROM dftable b
    GROUP BY b.course_id
) b ON a.course_id = b.course_id""")

In [25]:
popular_courses.show()

+---------+--------------------+---------------+
|course_id|        course_title|num_subscribers|
+---------+--------------------+---------------+
|  1070968|Ultimate Investme...|           2147|
|  1113822|Complete GST Cour...|           2792|
|  1006314|Financial Modelin...|           2174|
|  1210588|Beginner to Pro -...|           2451|
|  1011058|How To Maximize Y...|           1276|
|   192870|Trading Penny Sto...|           9221|
|   739964|Investing And Tra...|           1540|
|   403100|Trading Stock Cha...|           2917|
|   476268|Options Trading 3...|           5172|
|  1167710|The Only Investme...|            827|
|   592338|Forex Trading Sec...|           4284|
|   975046|Trading Options W...|           1380|
|   742602|Financial Managem...|           3607|
|   794151|Forex Trading Cou...|           4061|
|  1196544|Python Algo Tradi...|            294|
|   504036|Short Selling: Le...|           2276|
|   719698|Basic Technical A...|           4919|
|   564966|The Compl

###   3. List the courses that are specialized to “Business Finance” and find the average number of subscribers, reviews, price and lectures on the subject.

In [49]:
business_finance_courses = spark.sql("""
                                SELECT course_title courses,
                                avg(num_subscribers) avg_no_of_subscribers,
                                avg(num_reviews) avg_no_of_reviews,
                                avg(price) avg_price,
                                avg(num_lectures) avg_no_of_lectures
                                from dftable                        
                                where subject = "Business Finance"
                                group by course_title
                                """)

business_finance_courses.show()

+--------------------+---------------------+-----------------+---------+------------------+
|             courses|avg_no_of_subscribers|avg_no_of_reviews|avg_price|avg_no_of_lectures|
+--------------------+---------------------+-----------------+---------+------------------+
|Forex SOS Course:...|               1315.0|             48.0|     50.0|              59.0|
|Accounting Standa...|               2222.0|              9.0|     20.0|              12.0|
|Collection of Adv...|               2025.0|             34.0|     40.0|               9.0|
|Buying Call and P...|                266.0|             20.0|     60.0|              14.0|
|MQL4 Tutorial Boo...|               1278.0|             33.0|     30.0|              23.0|
|FMT: Forex Market...|                  1.0|              0.0|     20.0|              27.0|
|        Equity Swaps|                394.0|             25.0|     50.0|              18.0|
|Trade Stocks, For...|                116.0|              6.0|     20.0|        

In [40]:
average_data = spark.sql("""
select avg(num_subscribers) avg_num_of_subscribers, 
avg(num_reviews) avg_num_of_reviews,
avg(price) avg_price,
avg(num_lectures) avg_num_of_lectures
from dftable
where subject = "Business Finance"
""")

average_data.show()

+----------------------+------------------+-----------------+-------------------+
|avg_num_of_subscribers|avg_num_of_reviews|        avg_price|avg_num_of_lectures|
+----------------------+------------------+-----------------+-------------------+
|    1563.7748953974894|  63.5163179916318|68.55230125523012|  32.43096234309623|
+----------------------+------------------+-----------------+-------------------+



###   4. How are courses related?

###  5. Which courses offer the best cost benefit?

###     6. Find the courses which have more than 15 lectures.

In [59]:
courses = spark.sql("""
                    SELECT course_title, num_lectures number_of_lectures
                    FROM dftable
                    WHERE num_lectures > 15
                    ORDER BY num_lectures
                    """)

courses.show()

+-------------------------------------+------------------+
|                         course_title|number_of_lectures|
+-------------------------------------+------------------+
|                 The Big Volatilit...|                16|
|                 Leaps Options Tra...|                16|
|                 Forex Trading Cou...|                16|
|                 Forex Trading For...|                16|
|                 Create A Business...|                16|
|                 Forex how traders...|                16|
|                 Excel functions t...|                16|
|                 Trading for Begin...|                16|
|                 CREDIT SPREAD SUR...|                16|
|                 Accounting Bank R...|                16|
|                 Technical Analysi...|                16|
|                 Trading Biotech S...|                16|
|                 Situational Tradi...|                16|
|                 Win 90% of Trades...|                1

In [58]:
df.filter(col('num_lectures') > 15).select('course_title', 'num_lectures').distinct().show()

+--------------------------------+------------+
|                    course_title|num_lectures|
+--------------------------------+------------+
|            How To Maximize Y...|          26|
|            Learn Accounting....|          33|
|                    Equity Swaps|          18|
|            The Advanced Fore...|          25|
|            Primeros Pasos co...|          31|
|            YOU can Draw, Sha...|          19|
|            Typografie - Die ...|          16|
|            Pic Monkey for Be...|          17|
|            Bluegrass Guitar ...|          16|
|            Escalas Pentatóni...|          23|
|            Learn the Violin ...|          39|
|            Code your first W...|          16|
|            Vue JS - Masterin...|          69|
|            Voiceover Audio S...|          39|
|            Forex For Beginne...|          23|
|            Advanced Technica...|          19|
|Excelを使ってビジネスシミュレ...|          47|
|            Mastering Photosh...|          79|
|   

###     7. List courses on the basis of level.

In [37]:
from pyspark.sql import Window as W
from pyspark.sql.functions import collect_list, collect_set, explode

window_spec = W.partitionBy('level')
course_list_df = df\
                .withColumn('Course List', collect_set('course_title').over(window_spec))\
                .select('level', 'Course List')\
                .distinct()

course_list_df.show()

+------------------+--------------------+
|             level|         Course List|
+------------------+--------------------+
|              null|                  []|
|        All Levels|[Quick Piano Less...|
|    Beginner Level|[First Web Applic...|
|      Expert Level|[Trading Robot Fo...|
|Intermediate Level|[Learn Corel x7 L...|
+------------------+--------------------+



In [None]:
# assuming item type list
from pyspark.sql import Window as W
from pyspark.sql.functions import collect_list, collect_set, explode

window_spec = W.partitionBy('Country')
country_item_list_df = sales_df\
                        .withColumn('Item List', collect_set('Item Type').over(window_spec))\
                        .select('Country', 'Item List')\
                        .distinct()

country_item_list_df.show()

###     8. Find the courses which have duration greater than 2 hours.

In [57]:
courses_duration = spark.sql("""
                                SELECT course_id, course_title, content_duration duration_hours
                                FROM dftable
                                WHERE content_duration > 2.0
                                ORDER BY 3,1
                                """)

courses_duration.show()

+---------+--------------------+--------------+
|course_id|        course_title|duration_hours|
+---------+--------------------+--------------+
|    17349|Figure Drawing Fr...|           2.5|
|    19422|Drupal 7 for Begi...|           2.5|
|    20387|Guitar Insanity W...|           2.5|
|    20461|How to Design a L...|           2.5|
|    26956|Learn Guitar in 2...|           2.5|
|    42197|Options Spreads B...|           2.5|
|    50528|The Complete Word...|           2.5|
|    53157|Beginning Flute L...|           2.5|
|    59535|  HTML for Beginners|           2.5|
|    71912|Have Fun with Beg...|           2.5|
|    98140|Introduction to B...|           2.5|
|    99826|Become A Web Deve...|           2.5|
|   100916|Blues Guitar Less...|           2.5|
|   101498|Blues Guitar Less...|           2.5|
|   101864|Programming for E...|           2.5|
|   109622|Finance - Ratios ...|           2.5|
|   128950|Learn to Build We...|           2.5|
|   131562|    Piano With A Pro|        

In [54]:
df.filter(col('content_duration') > 2.0).select('course_title', 'content_duration').distinct().show()

+---------------------------------------+----------------+
|                           course_title|content_duration|
+---------------------------------------+----------------+
|                   Navigate the 10-K...|             2.5|
|                   Portfolio Managem...|             5.0|
|                    Angular Masterclass|             6.5|
|                   Create a Responsi...|             3.0|
|                   Accounting 2 Simp...|             5.0|
|                   Trader sur le For...|             2.5|
|                   Learn Logo Design...|             3.0|
|                   Piano From Zero T...|             3.5|
|                   Super-Curso de Gu...|            10.5|
|こどもギターレッスン　リトルギタリストⅡ|             3.5|
|                    JavaScript Tutorial|             4.5|
|                   Complete RESPONSI...|             9.0|
|                   Devtools 2017: Be...|             2.5|
|                   WordPress: Create...|             2.5|
|               