In [81]:
import requests
import lxml.html as lx
import time

import pandas as pd

## Generic Function to perform HTTP Reuqest

In [77]:
# make a get request on session object and return response
def get_http_response( url, params = {} ):

  time.sleep(0.1)

  try:
    # get request
    response = requests.get( url, params )
    response.raise_for_status()

  # handle exceptions
  except requests.exceptions.HTTPError as e:
    print("Too many requests!!")
    print(e)
    return None

  except requests.exceptions.URLRequired as e:
    print("Invalid URL: Server could not be found for the given url")
    return None

  else:
    # return the response if no exceptions
    return response

## Function to fetch the details from PluralSight using API

In [78]:
API_URL = 'https://api-us1.cludo.com/api/v3/10000847/10001278/search'
auth_key = "MTAwMDA4NDc6MTAwMDEyNzg6U2VhcmNoS2V5"

In [79]:
all_courses_responses = []

def fetch_pluralsight_course_data(url, page_index):
    global all_courses_responses

    headers = {
        'authorization': f'SiteKey {auth_key}'
    }
    
    data = {
        "ResponseType": "json",
        "query": "web development",
        "enableFacetFiltering": "true",
        "facets": {
            "categories": ["course", "labs"]
        },
        "page": page_index,
        "perPage": 18,
        "operator": "and"
    }
    time.sleep(0.5)
    response = requests.post(url, headers=headers, json=data)
    
    pluralsight_response = response.json()['TypedDocuments'] # or do something else with the response
    all_courses_responses.extend(pluralsight_response)

    if page_index<=10:
        fetch_pluralsight_course_data(url, page_index+1)

In [80]:
fetch_pluralsight_course_data(API_URL, 1)

In [125]:
columns_all = ['course_id', 'course_title', 'course_url', 'course_instructor', 'course_rating', 
               'course_duration', 'course_details','course_level', 'course_no_of_reviews','course_no_of_enrolled']
complete_course_details_df_all = pd.DataFrame(columns=columns_all)

In [126]:
complete_course_details_df_all

Unnamed: 0,course_id,course_title,course_url,course_instructor,course_rating,course_duration,course_details,course_level,course_no_of_reviews,course_no_of_enrolled


- course_id (ud:udemy, ce:coursera, ps: PluralSight)
- course_title
- course_url
- course_instructor
- course_rating (out of 5)
- course_duration (In hrs)
- course_details (description + learning objectives)
- course_level (All: 0, Beginner: 1, Intermediate: 2, Advanced: 3)
- course_no_of_reviews
- course_no_of_enrolled

In [127]:
index  = 0
for plural_sight_response_item in all_courses_responses: 

    course_id = 'ud_'+str(index)
    course_title = ""
    course_instructor = ""
    course_url = ""
    course_rating = ""
    course_no_of_reviews = ""
    course_duration = ""
    course_details = ""
    course_level = ""
    course_no_of_enrolled = ""

    if 'Fields' in plural_sight_response_item:
        if 'Title' in plural_sight_response_item['Fields']:
            course_title = plural_sight_response_item['Fields']['Title']['Value']

        if 'authors' in plural_sight_response_item['Fields']:
            course_instructor = plural_sight_response_item['Fields']['authors']['Value']

        if 'rating-count' in plural_sight_response_item['Fields']:
            course_no_of_reviews = plural_sight_response_item['Fields']['rating-count']['Value']

        if 'rating' in plural_sight_response_item['Fields']:
            course_rating = plural_sight_response_item['Fields']['rating']['Value']
            
        if 'duration' in plural_sight_response_item['Fields']:
            course_duration = plural_sight_response_item['Fields']['duration']['Value']

        if 'Skill Levels' in plural_sight_response_item['Fields']:
            course_level = plural_sight_response_item['Fields']['Skill Levels']['Value']

        if 'Id' in plural_sight_response_item['Fields']:
            course_url = plural_sight_response_item['Fields']['Id']['Value']
            

        current_course_values = [course_id, course_title, course_url, course_instructor, course_rating, course_duration, course_details, 
                                 course_level, course_no_of_reviews, course_no_of_enrolled]
        
        complete_course_details_df_all.loc[index] = current_course_values
        index = index+1

In [128]:
complete_course_details_df_all.head()

Unnamed: 0,course_id,course_title,course_url,course_instructor,course_rating,course_duration,course_details,course_level,course_no_of_reviews,course_no_of_enrolled
0,ud_0,Web Development: Executive Briefing,https://www.pluralsight.com/courses/web-develo...,Brice Wilson,4.6,PT30M2S,,Beginner,158,
1,ud_1,Front End Web Development: Get Started,https://www.pluralsight.com/courses/front-end-...,Joe Eames,4.6,PT3H47M57S,,Beginner,1251,
2,ud_2,Beyond ASP.NET MVC: Modern Web Development Dem...,https://www.pluralsight.com/courses/beyond-asp...,Chris Jones,4.6,PT1H50M33S,,Beginner,75,
3,ud_3,Tactics and Tools for Troubleshooting Front-en...,https://www.pluralsight.com/courses/tactics-to...,Shelley Benhoff,3.7,PT1H37M33S,,Intermediate,54,
4,ud_4,Tracking Real World Web Performance,https://www.pluralsight.com/courses/web-perfor...,Nik Molnar,4.6,PT1H37M33S,,Beginner,145,


In [129]:
level_mapping = {'Beginner': 1, 'Intermediate': 2, 'Advanced': 3}
# Assuming complete_course_details_df_all is your DataFram
complete_course_details_df_all['course_level'] = complete_course_details_df_all['course_level'].map(level_mapping)

In [130]:
# Define a function to convert course_duration to hours
def duration_to_hours(duration):
    if pd.isnull(duration):
        return None
    parts = duration[2:].split('H')
    hours = int(parts[0]) if len(parts) > 1 else 0
    minutes = int(parts[-1].split('M')[0]) if 'M' in duration else 0
    seconds = int(parts[-1].split('S')[0].split('M')[-1]) if 'S' in duration else 0
    return hours + minutes / 60 + seconds / 3600

complete_course_details_df_all['course_duration'] = complete_course_details_df_all['course_duration'].apply(duration_to_hours)

In [131]:
complete_course_details_df_all.head()

Unnamed: 0,course_id,course_title,course_url,course_instructor,course_rating,course_duration,course_details,course_level,course_no_of_reviews,course_no_of_enrolled
0,ud_0,Web Development: Executive Briefing,https://www.pluralsight.com/courses/web-develo...,Brice Wilson,4.6,0.500556,,1,158,
1,ud_1,Front End Web Development: Get Started,https://www.pluralsight.com/courses/front-end-...,Joe Eames,4.6,3.799167,,1,1251,
2,ud_2,Beyond ASP.NET MVC: Modern Web Development Dem...,https://www.pluralsight.com/courses/beyond-asp...,Chris Jones,4.6,1.8425,,1,75,
3,ud_3,Tactics and Tools for Troubleshooting Front-en...,https://www.pluralsight.com/courses/tactics-to...,Shelley Benhoff,3.7,1.625833,,2,54,
4,ud_4,Tracking Real World Web Performance,https://www.pluralsight.com/courses/web-perfor...,Nik Molnar,4.6,1.625833,,1,145,


In [132]:
complete_course_details_df_all = complete_course_details_df_all.reset_index(drop=True)
complete_course_details_df_all['course_id'] = 'ps_' + (complete_course_details_df_all.index + 1).astype(str)

In [133]:
complete_course_details_df_all.head()

Unnamed: 0,course_id,course_title,course_url,course_instructor,course_rating,course_duration,course_details,course_level,course_no_of_reviews,course_no_of_enrolled
0,ps_1,Web Development: Executive Briefing,https://www.pluralsight.com/courses/web-develo...,Brice Wilson,4.6,0.500556,,1,158,
1,ps_2,Front End Web Development: Get Started,https://www.pluralsight.com/courses/front-end-...,Joe Eames,4.6,3.799167,,1,1251,
2,ps_3,Beyond ASP.NET MVC: Modern Web Development Dem...,https://www.pluralsight.com/courses/beyond-asp...,Chris Jones,4.6,1.8425,,1,75,
3,ps_4,Tactics and Tools for Troubleshooting Front-en...,https://www.pluralsight.com/courses/tactics-to...,Shelley Benhoff,3.7,1.625833,,2,54,
4,ps_5,Tracking Real World Web Performance,https://www.pluralsight.com/courses/web-perfor...,Nik Molnar,4.6,1.625833,,1,145,


## Function to preprocess the description of the courses

In [134]:
from lxml import etree

def get_pluralsight_course_details(url):
    
    course_description = ''
    
    response = get_http_response(url)
    time.sleep(0.5)
    
    try:
        html = lx.fromstring(response.text)
        #print(response.text)
        
        course_description_item = html.xpath("//div[@class='course-page-section']/h2[contains(text(), 'What you') and contains(text(), 'learn')]/following-sibling::*")
        if course_description_item:

            if len(course_description_item)>=1:
                course_description = course_description + course_description_item[0].text_content().strip()
                
            if len(course_description_item)>=2:
                course_description = course_description + course_description_item[1].text_content().strip()

        else:
            print("No matching <p> tag found.")
            return ""

    except:
        print("No learning objectives found: " + course_description )
        return ""

    return course_description

In [135]:
get_pluralsight_course_details('https://www.pluralsight.com/courses/web-development-executive-briefing')

"Tech leaders need a fundamental understanding of the tools and technologies their teams use to build solutions. In this course, Web Development: Executive Briefing, you'll get an overview of all the technologies typically used to build modern web applications and the skills required on the teams that build them. First, you'll learn how to structure a web development team and make sure you have the skills required for development and deployment. Next, you'll discover the primary technologies used when building client-side browser applications. Finally, you'll explore the role of the server, server-side developers, and cloud services when building and hosting web applications. When you're finished with this course, you will have a foundational understanding of the technologies used in modern web development that will help you communicate better with your technical teams and understand the skills required on every web development project."

In [136]:
get_pluralsight_course_details('https://www.pluralsight.com/courses/beyond-aspdotnet-web-development-demystified')

"The web development landscape is constantly changing, and it can be hard to keep up. In this course, Beyond ASP.NET MVC: Modern Web Development Demystified, you'll learn the the recent front-ends trends along with what they are, why they exist, and you'll be doing demos to give you a practical insight into how the tool work. First, you'll explore package managers and transpilers. Next, you'll discover JavaScript modules and front-end frameworks. Finally, you'll wrap up this course with learning task runners and module bundlers. By the end of this course, you'll have a high-level understanding of what these tools are and how they can help you as a web developer."

In [137]:
complete_course_details_df_all['course_details'] = complete_course_details_df_all['course_url'].apply(get_pluralsight_course_details)

No matching <p> tag found.
No matching <p> tag found.
Too many requests!!
404 Client Error: Not Found for url: https://www.pluralsight.com/courses/designing-restful-web-apis
No learning objectives found: 
Too many requests!!
404 Client Error: Not Found for url: https://www.pluralsight.com/courses/designing-restful-web-apis
No learning objectives found: 
Too many requests!!
500 Server Error: Internal Server Error for url: https://www.pluralsight.com/courses/go-building-web-services-applications
No learning objectives found: 
No matching <p> tag found.
No matching <p> tag found.
No matching <p> tag found.
No matching <p> tag found.
No matching <p> tag found.


In [138]:
complete_course_details_df_all.head()

Unnamed: 0,course_id,course_title,course_url,course_instructor,course_rating,course_duration,course_details,course_level,course_no_of_reviews,course_no_of_enrolled
0,ps_1,Web Development: Executive Briefing,https://www.pluralsight.com/courses/web-develo...,Brice Wilson,4.6,0.500556,Tech leaders need a fundamental understanding ...,1,158,
1,ps_2,Front End Web Development: Get Started,https://www.pluralsight.com/courses/front-end-...,Joe Eames,4.6,3.799167,Front end web development involves many differ...,1,1251,
2,ps_3,Beyond ASP.NET MVC: Modern Web Development Dem...,https://www.pluralsight.com/courses/beyond-asp...,Chris Jones,4.6,1.8425,The web development landscape is constantly ch...,1,75,
3,ps_4,Tactics and Tools for Troubleshooting Front-en...,https://www.pluralsight.com/courses/tactics-to...,Shelley Benhoff,3.7,1.625833,At the core of any fully responsive website is...,2,54,
4,ps_5,Tracking Real World Web Performance,https://www.pluralsight.com/courses/web-perfor...,Nik Molnar,4.6,1.625833,Study upon study confirms that web performance...,1,145,


In [139]:
complete_course_details_df_all['course_details'].isna().sum()

0

In [140]:
empty_details_indices = complete_course_details_df_all[complete_course_details_df_all['course_details']==''].index.tolist()

In [141]:
empty_details_indices

[23, 34, 66, 75, 123, 131, 134, 150, 185, 186]

In [142]:
complete_course_details_df_all.drop(empty_details_indices, inplace=True)

In [144]:
complete_course_details_df_all = complete_course_details_df_all.reset_index().drop(columns=['index'])

In [148]:
empty_details_indices = complete_course_details_df_all[complete_course_details_df_all['course_details']==""].index.tolist()

In [149]:
empty_details_indices

[]

In [232]:
complete_course_details_df_all.to_csv("pluralsight_course_all_info.csv", index=False)