#### data: udemy_courses.json
#### questions: udemy_questions.docx
#### notebook: udemy_task_solutions.ipynb
#### python file: udemy_task_solutions.py

At first, create a spark application, read the json file into a dataframe, and explore the data

In [2]:
from pyspark.sql import SparkSession

In [3]:
spark = SparkSession.builder.appName("udemy").getOrCreate()

22/10/20 16:39:46 WARN Utils: Your hostname, pallavi-xps resolves to a loopback address: 127.0.1.1; using 192.168.1.79 instead (on interface wlp2s0)
22/10/20 16:39:46 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/10/20 16:39:46 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/10/20 16:39:47 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
22/10/20 16:39:47 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


In [4]:
# read the udemy_courses.json file
udemy_df = spark.read.format('json').load('udemy_courses.json')

In [5]:
#view the data in the dataframe
# udemy_df.show()

In [6]:
#removing corrupt record column

udemy_df = udemy_df.drop('_corrupt_record')

In [7]:
#view the schema of the dataframe
udemy_df.printSchema()

root
 |-- content_duration: string (nullable = true)
 |-- course_id: string (nullable = true)
 |-- course_title: string (nullable = true)
 |-- is_paid: string (nullable = true)
 |-- level: string (nullable = true)
 |-- num_lectures: string (nullable = true)
 |-- num_reviews: string (nullable = true)
 |-- num_subscribers: string (nullable = true)
 |-- price: string (nullable = true)
 |-- published_timestamp: string (nullable = true)
 |-- subject: string (nullable = true)
 |-- url: string (nullable = true)



In [8]:
#all columns have data type string. So, we need to convert the datatypes using cast method

from pyspark.sql.types import StringType, FloatType, IntegerType, BooleanType, TimestampType
from pyspark.sql.functions import col


udemy_df = udemy_df\
            .withColumn('content_duration', col('content_duration').cast(FloatType()))\
            .withColumn('course_id', col('course_id').cast(IntegerType()))\
            .withColumn('course_title', col('course_title').cast(StringType()))\
            .withColumn('is_paid', col('is_paid').cast(BooleanType()))\
            .withColumn('level', col('level').cast(StringType()))\
            .withColumn('num_lectures', col('num_lectures').cast(IntegerType()))\
            .withColumn('num_reviews', col('num_reviews').cast(IntegerType()))\
            .withColumn('num_subscribers', col('num_subscribers').cast(IntegerType()))\
            .withColumn('price', col('price').cast(IntegerType()))\
            .withColumn('published_timestamp', col('published_timestamp').cast(TimestampType()))\
            .withColumn('subject', col('subject').cast(StringType()))\
            .withColumn('url', col('url').cast(StringType()))


udemy_df.printSchema()

root
 |-- content_duration: float (nullable = true)
 |-- course_id: integer (nullable = true)
 |-- course_title: string (nullable = true)
 |-- is_paid: boolean (nullable = true)
 |-- level: string (nullable = true)
 |-- num_lectures: integer (nullable = true)
 |-- num_reviews: integer (nullable = true)
 |-- num_subscribers: integer (nullable = true)
 |-- price: integer (nullable = true)
 |-- published_timestamp: timestamp (nullable = true)
 |-- subject: string (nullable = true)
 |-- url: string (nullable = true)



In [9]:
# udemy_df.show()

In [10]:
# create a separate dataframe for course (since most of the questions are related to course only)

course_df=udemy_df.select(["course_id","course_title","url","is_paid", "price", "num_subscribers","num_reviews",
                           "num_lectures", "subject","level","content_duration", "subject"
                         ])

course_df.printSchema()
course_df = course_df.drop_duplicates()
# course_df.show()

root
 |-- course_id: integer (nullable = true)
 |-- course_title: string (nullable = true)
 |-- url: string (nullable = true)
 |-- is_paid: boolean (nullable = true)
 |-- price: integer (nullable = true)
 |-- num_subscribers: integer (nullable = true)
 |-- num_reviews: integer (nullable = true)
 |-- num_lectures: integer (nullable = true)
 |-- subject: string (nullable = true)
 |-- level: string (nullable = true)
 |-- content_duration: float (nullable = true)
 |-- subject: string (nullable = true)



In [11]:
#create SQL table from dataframe

udemy_df.createOrReplaceTempView("udemy_table")
course_df.createOrReplaceTempView("course_table")

# Questions and Solutions

###   1. What are the best free courses by subject?

In [13]:
#assumption: best course = course having maximum reviews

free_courses_df= course_df.filter(course_df.is_paid == "False")

# free_courses_df.show()
# free_courses_df.printSchema()

In [17]:
from pyspark.sql.functions import rank, max, desc, collect_list
from pyspark.sql import Window as W 


best_free_courses_df = free_courses_df\
                    .groupBy('subject', 'course_title')\
                    .agg(max('num_reviews').alias('Number of Reviews'))\
                    .orderBy(desc('Number of Reviews'))


#creating partition by subject in descending order of num_reviews
window_spec = W.partitionBy('subject')\
                .orderBy(desc(col('Number of Reviews')))

#get courses with top 3 rank in each partition
top_3_best_courses = best_free_courses_df\
                            .withColumn('rn', rank().over(window_spec))\
                            .where('rn <= 3')\
                            .drop('rn')

# subjects and their three best courses along with Number of Reviews
top_3_best_courses\
    .withColumn('Best Free Courses', collect_list('course_title').over(W.partitionBy('subject')))\
    .select('subject', 'Best Free Courses', 'Number of Reviews')\
    .distinct()\
    .dropna()\
    .show()

+-------------------+--------------------+-----------------+
|            subject|   Best Free Courses|Number of Reviews|
+-------------------+--------------------+-----------------+
|   Business Finance|[Accounting in 60...|             4397|
|   Business Finance|[Accounting in 60...|             2698|
|   Business Finance|[Accounting in 60...|             1463|
|     Graphic Design|[Graphic Design -...|             2215|
|     Graphic Design|[Graphic Design -...|             1779|
|     Graphic Design|[Graphic Design -...|             1563|
|Musical Instruments|[Getting Started ...|             1141|
|Musical Instruments|[Getting Started ...|             1042|
|Musical Instruments|[Getting Started ...|              772|
|    Web Development|[Learn HTML5 Prog...|             8629|
|    Web Development|[Learn HTML5 Prog...|             6512|
|    Web Development|[Learn HTML5 Prog...|             5924|
+-------------------+--------------------+-----------------+



###     2. What are the most popular courses?

In [18]:
#assumption: num_subscribers indicates popularity


#sql way
# getting top 10 most popular courses

popular_courses = spark.sql("""
                            SELECT course_title AS Course_Titles, num_subscribers AS Number_of_Subscribers
                            FROM course_table
                            ORDER BY 2 DESC
                            LIMIT 10                            
                            """)

popular_courses.show()

+--------------------+---------------------+
|       Course_Titles|Number_of_Subscribers|
+--------------------+---------------------+
|Learn HTML5 Progr...|               268923|
|Coding for Entrep...|               161029|
|The Web Developer...|               121584|
|Build Your First ...|               120291|
|The Complete Web ...|               114512|
|Free Beginner Ele...|               101154|
|Web Design for We...|                98867|
|Learn Javascript ...|                84897|
|Practical PHP: Ma...|                83737|
|JavaScript: Under...|                79612|
+--------------------+---------------------+



In [19]:
#dfway

from pyspark.sql.functions import desc

popular_courses_df = course_df.select(col('course_title').alias("Course_Titles"),\
                                      col('num_subscribers').alias("Number_of_Subscribers"))\
                                      .orderBy(desc(col("num_subscribers")))

popular_courses_df.show(10)

+--------------------+---------------------+
|       Course_Titles|Number_of_Subscribers|
+--------------------+---------------------+
|Learn HTML5 Progr...|               268923|
|Coding for Entrep...|               161029|
|The Web Developer...|               121584|
|Build Your First ...|               120291|
|The Complete Web ...|               114512|
|Free Beginner Ele...|               101154|
|Web Design for We...|                98867|
|Learn Javascript ...|                84897|
|Practical PHP: Ma...|                83737|
|JavaScript: Under...|                79612|
+--------------------+---------------------+
only showing top 10 rows



###   3. List the courses that are specialized to “Business Finance” and find the average number of subscribers, reviews, price and lectures on the subject.

In [20]:
#sql way

#1. create a df business_finance_courses consisting of courses that are specialized to “Business Finance”

business_finance_courses = spark.sql("""
                                        SELECT * FROM course_table
                                        WHERE subject == "Business Finance"                         
                                    """)

# business_finance_courses.show()

In [21]:
#2. calculate average no. of subscribers, reviews, price, and lectures on subject “Business Finance”
#using business_finance_courses df

business_finance_courses.createOrReplaceTempView("business_finance_courses_table")

average_data = spark.sql("""
                            SELECT 
                                avg(num_lectures) `Average Lectures`,
                                avg(num_subscribers) `Average Subscribers`,
                                avg(num_reviews) `Average Reviews`,
                                avg(price) `Average Price`                                
                            FROM business_finance_courses_table
                        """)

average_data.show()

+-----------------+-------------------+------------------+----------------+
| Average Lectures|Average Subscribers|   Average Reviews|   Average Price|
+-----------------+-------------------+------------------+----------------+
|32.46263643996642| 1569.0268681780017|63.729638958858104|68.6943744752309|
+-----------------+-------------------+------------------+----------------+



In [22]:
#dfway

#1. create dataframe business_finance_df containing info of only those courses that are in "Business Finance" subject
business_finance_df = course_df.filter(col('subject')=='Business Finance')
business_finance_df.show(5)

+---------+--------------------+--------------------+-------+-----+---------------+-----------+------------+----------------+--------------+----------------+----------------+
|course_id|        course_title|                 url|is_paid|price|num_subscribers|num_reviews|num_lectures|         subject|         level|content_duration|         subject|
+---------+--------------------+--------------------+-------+-----+---------------+-----------+------------+----------------+--------------+----------------+----------------+
|    24877|Introductory Fina...|https://www.udemy...|   true|   80|           1793|        265|          54|Business Finance|Beginner Level|            10.0|Business Finance|
|   332690|Introduction to T...|https://www.udemy...|   true|   20|             45|          4|           6|Business Finance|Beginner Level|             1.0|Business Finance|
|   521342|Succeed in Forex ...|https://www.udemy...|   true|  200|           1806|         38|         211|Business Finance|

In [23]:
#2. get list of courses in the subject "Business Finance"

courses_business_finance = business_finance_df.select('course_title', 'subject')
courses_business_finance.show(10)

+--------------------+----------------+
|        course_title|         subject|
+--------------------+----------------+
|Introductory Fina...|Business Finance|
|Introduction to T...|Business Finance|
|Succeed in Forex ...|Business Finance|
|2 Easy Steps To I...|Business Finance|
|Level 1 2014 CFA®...|Business Finance|
|The Complete Fore...|Business Finance|
|Beginner to Pro -...|Business Finance|
|Winning Forex Tra...|Business Finance|
|What You Should K...|Business Finance|
|Forex Trading - L...|Business Finance|
+--------------------+----------------+
only showing top 10 rows



In [24]:
#3. calculating average values for the subject

business_finance_df.agg({'num_lectures': 'mean', 'num_subscribers' : 'mean', 'num_reviews': 'mean',\
                         'price': 'mean'}).show()

+----------------+--------------------+------------------+-----------------+
|      avg(price)|avg(num_subscribers)|  avg(num_reviews)|avg(num_lectures)|
+----------------+--------------------+------------------+-----------------+
|68.6943744752309|  1569.0268681780017|63.729638958858104|32.46263643996642|
+----------------+--------------------+------------------+-----------------+



###   4. How are courses related?

In [53]:
#grouping titles by subject

#sqlway
titles_subjects = spark.sql("""
                                SELECT subject, course_title 
                                FROM course_table
                                ORDER BY subject
                            """)

titles_subjects.show()

+----------------+--------------------+
|         subject|        course_title|
+----------------+--------------------+
|            null|                null|
|Business Finance|Stock Options Tra...|
|Business Finance|MBA Accounting an...|
|Business Finance|Value Investing, ...|
|Business Finance|How to Get Growin...|
|Business Finance|Introductory Fina...|
|Business Finance|How to Start Inve...|
|Business Finance|Introduction to T...|
|Business Finance|How To Invest Wit...|
|Business Finance|2 Easy Steps To I...|
|Business Finance|Aprende a inverti...|
|Business Finance|The Complete Fore...|
|Business Finance|SEASON 2 MQL4 Tut...|
|Business Finance|Winning Forex Tra...|
|Business Finance|CFA Level I Works...|
|Business Finance|Forex Trading - L...|
|Business Finance|Succeed in Forex ...|
|Business Finance|Aprende a inverti...|
|Business Finance|Beginner to Pro -...|
|Business Finance|Mutual Funds for ...|
+----------------+--------------------+
only showing top 20 rows



In [52]:
#dfway

titles_subjects_df = course_df.select(col('subject'), col('course_title'))\
                              .dropna()\
                              .distinct()\
                              .orderBy(col('subject'))\
                              .show()

+----------------+--------------------+
|         subject|        course_title|
+----------------+--------------------+
|Business Finance|Improve Your Fina...|
|Business Finance|Take Control By L...|
|Business Finance|The Complete Guid...|
|Business Finance|Black Scholes Opt...|
|Business Finance|Succeed in Lotto ...|
|Business Finance|Learn to Trade Fo...|
|Business Finance|   Business finances|
|Business Finance|Accounting - The ...|
|Business Finance|      TRADING TRUTHS|
|Business Finance|Curso de Day Trad...|
|Business Finance|Forex Trading Sys...|
|Business Finance|Volatility Tradin...|
|Business Finance|Trading Forex  - ...|
|Business Finance|Bitcoin or How I ...|
|Business Finance|The Power Modelin...|
|Business Finance|Naked Put Options...|
|Business Finance|* An Integrated A...|
|Business Finance|Free Options 101 ...|
|Business Finance|Finance - Ratios ...|
|Business Finance|الدروس التعليمية ...|
+----------------+--------------------+
only showing top 20 rows



In [75]:
#dfway 2

from pyspark.sql.functions import collect_set

window_spec = W.partitionBy('subject')
course_relation_df = course_df\
                .withColumn('Course List', collect_set('course_title').over(window_spec))\
                .select('subject', 'Course List')\
                .distinct().dropna()

course_relation_df.show()

+-------------------+--------------------+
|            subject|         Course List|
+-------------------+--------------------+
|   Business Finance|[Learn to Trade F...|
|     Graphic Design|[Sketch 3 - New G...|
|Musical Instruments|[Quick Piano Less...|
|    Web Development|[First Web Applic...|
+-------------------+--------------------+



###  5. Which courses offer the best cost benefit?

In [50]:
import pyspark.sql.functions as f

In [51]:
#Assumption 1: course having price less than 100, reviews more than 5000, and subscribers more than 50000 
#offer the best cost benefit

best_cost_benefit_courses=course_df.filter(f.col("num_subscribers")>50000)\
                                   .filter(f.col("num_reviews")>5000)\
                                   .filter(f.col("price")<500)

best_cost_benefit_courses.show()

+---------+--------------------+--------------------+-------+-----+---------------+-----------+------------+-------------------+--------------+----------------+-------------------+
|course_id|        course_title|                 url|is_paid|price|num_subscribers|num_reviews|num_lectures|            subject|         level|content_duration|            subject|
+---------+--------------------+--------------------+-------+-----+---------------+-----------+------------+-------------------+--------------+----------------+-------------------+
|   473160|Web Design for We...|https://www.udemy...|  false|    0|          98867|       6512|          20|    Web Development|    All Levels|             3.0|    Web Development|
|   756150|Angular 4 (former...|https://www.udemy...|   true|  190|          73783|      19649|         329|    Web Development|    All Levels|            22.0|    Web Development|
|   461160|Learn and Underst...|https://www.udemy...|   true|  195|          58208|      11123|

In [None]:
#Assumption 2: longest yet cheapest course = best cost benefit 
# find courses having longest content_duration with lowest price

In [69]:
#sqlway

more_duration_less_price_courses = spark.sql("""
                                                select course_title, content_duration, price from course_table
                                                order by content_duration desc, price asc                                            
                                            """)

more_duration_less_price_courses.show(10)

+--------------------+----------------+-----+
|        course_title|content_duration|price|
+--------------------+----------------+-----+
|The Complete Figu...|            78.5|   50|
|The Complete Web ...|            76.5|  200|
|Financial Managem...|            71.5|  190|
|TRADER BOT: Intro...|            70.0|   20|
|Anatomy for Figur...|            68.5|   95|
|Modern E-Commerce...|            66.5|   50|
|Discover How to D...|            62.0|   65|
|Advanced Accounti...|            62.0|  150|
|Become a Professi...|            60.0|  200|
|Code & Grow Rich:...|            57.0|  200|
+--------------------+----------------+-----+
only showing top 10 rows



In [70]:
#dfway
from pyspark.sql.functions import asc

course_df.orderBy(desc('content_duration'), asc('price')).select('course_title', 'content_duration', 'price')\
         .show()

+--------------------+----------------+-----+
|        course_title|content_duration|price|
+--------------------+----------------+-----+
|The Complete Figu...|            78.5|   50|
|The Complete Web ...|            76.5|  200|
|Financial Managem...|            71.5|  190|
|TRADER BOT: Intro...|            70.0|   20|
|Anatomy for Figur...|            68.5|   95|
|Modern E-Commerce...|            66.5|   50|
|Discover How to D...|            62.0|   65|
|Advanced Accounti...|            62.0|  150|
|Become a Professi...|            60.0|  200|
|Code & Grow Rich:...|            57.0|  200|
|The Complete Web ...|            51.0|  195|
|Become a Kick-Ass...|            48.5|  200|
|Financial Risk Ma...|            47.0|   50|
|Learn to Trade Th...|            46.5|  200|
|Cost Accounting a...|            45.0|  150|
|Coding for Entrep...|            45.0|  195|
|CSSCasts; CSS lib...|            44.5|  200|
|Back to School We...|            44.5|  200|
|MCA Accountancy a...|            

###     6. Find the courses which have more than 15 lectures.

In [49]:
#sqlway

courses_morethan15 = spark.sql("""
                                SELECT course_title Course, num_lectures `Number of Lectures`
                                FROM course_table
                                WHERE num_lectures > 15
                                ORDER BY num_lectures
                            """)

courses_morethan15.count()
courses_morethan15.show(10)

+--------------------+------------------+
|              Course|Number of Lectures|
+--------------------+------------------+
|Whiteboard Animat...|                16|
|Coding Made Easy:...|                16|
|Technical Analysi...|                16|
|JavaScript Tricks...|                16|
|Blues Guitar Less...|                16|
|Forex Trading - L...|                16|
|Accounting in 60 ...|                16|
|Expert Guide : Ma...|                16|
|Simple Options Tr...|                16|
|CREDIT SPREAD SUR...|                16|
+--------------------+------------------+
only showing top 10 rows



In [48]:
#dfway

course_df.filter(col('num_lectures') > 15).select(col('course_title').alias('Course'),\
                                                  col('num_lectures').alias('Number of Lectures'))\
         .orderBy(col('num_lectures'))\
         .show(10)

+--------------------+------------------+
|              Course|Number of Lectures|
+--------------------+------------------+
|Whiteboard Animat...|                16|
|Coding Made Easy:...|                16|
|Technical Analysi...|                16|
|JavaScript Tricks...|                16|
|Blues Guitar Less...|                16|
|Forex Trading - L...|                16|
|Accounting in 60 ...|                16|
|Expert Guide : Ma...|                16|
|Simple Options Tr...|                16|
|CREDIT SPREAD SUR...|                16|
+--------------------+------------------+
only showing top 10 rows



###     7. List courses on the basis of level.

In [77]:
#dfway

from pyspark.sql.functions import collect_set

window_spec = W.partitionBy('level')
course_list_df = course_df\
                .withColumn('Course List', collect_set('course_title').over(window_spec))\
                .select('level', 'Course List')\
                .distinct().dropna()

course_list_df.show()

+------------------+--------------------+
|             level|         Course List|
+------------------+--------------------+
|        All Levels|[Quick Piano Less...|
|    Beginner Level|[First Web Applic...|
|      Expert Level|[Trading Robot Fo...|
|Intermediate Level|[JavaScript For A...|
+------------------+--------------------+



###     8. Find the courses which have duration greater than 2 hours.

In [46]:
#sqlway

courses_duration = spark.sql("""
                                SELECT course_id, course_title, content_duration `Duration(Hours)`
                                FROM course_table
                                WHERE content_duration > 2.0
                                ORDER BY 3,1
                            """)

courses_duration.show()

+---------+--------------------+---------------+
|course_id|        course_title|Duration(Hours)|
+---------+--------------------+---------------+
|    17349|Figure Drawing Fr...|            2.5|
|    19422|Drupal 7 for Begi...|            2.5|
|    20387|Guitar Insanity W...|            2.5|
|    20461|How to Design a L...|            2.5|
|    26956|Learn Guitar in 2...|            2.5|
|    42197|Options Spreads B...|            2.5|
|    50528|The Complete Word...|            2.5|
|    53157|Beginning Flute L...|            2.5|
|    59535|  HTML for Beginners|            2.5|
|    71912|Have Fun with Beg...|            2.5|
|    98140|Introduction to B...|            2.5|
|    99826|Become A Web Deve...|            2.5|
|   100916|Blues Guitar Less...|            2.5|
|   101498|Blues Guitar Less...|            2.5|
|   101864|Programming for E...|            2.5|
|   109622|Finance - Ratios ...|            2.5|
|   128950|Learn to Build We...|            2.5|
|   131562|    Piano

In [47]:
#dfway

course_df.filter(col('content_duration') > 2.0).select('course_id', 'course_title', 'content_duration')\
         .distinct().orderBy(col('content_duration')).show()

+---------+--------------------+----------------+
|course_id|        course_title|content_duration|
+---------+--------------------+----------------+
|    59535|  HTML for Beginners|             2.5|
|  1266468|Indesign - Para q...|             2.5|
|  1095488|Adobe photoshop T...|             2.5|
|  1075366|Adobe InDesign CC...|             2.5|
|   922600|CSS Web Developme...|             2.5|
|   422546|Piano - Circle of...|             2.5|
|   826912|Six Stage Negotia...|             2.5|
|   521952|Ultimate Guide To...|             2.5|
|   131562|    Piano With A Pro|             2.5|
|  1140532|Explore JavaScrip...|             2.5|
|   774910|WordPress: Create...|             2.5|
|   638040|Adobe Photoshop E...|             2.5|
|  1006314|Financial Modelin...|             2.5|
|   584522|Voiceover Audio S...|             2.5|
|  1258402|Canva Complete Co...|             2.5|
|  1193056|Trader sur le For...|             2.5|
|   463834|Cost Accounting O...|             2.5|
