## Extract course features 

In [None]:
# Handle imports and set SSL

from urllib.request import Request, urlopen
from bs4 import BeautifulSoup
from datetime import datetime
import pickle
import ssl
import requests
import re
import json
import csv

# SSL context used for the urlopen() calls that fetch course pages.
# Hostname checking and certificate verification are both switched off, so the
# identity of the remote server is NOT validated.
# NOTE: check_hostname has to be disabled before verify_mode is set to
# CERT_NONE (the ssl module rejects the opposite order), so keep the
# statements in this sequence.
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

## Load the course URL list from the pickle file

In [None]:
# Read the pickled collection of course URLs and materialise it as a list so
# it can be indexed and enumerated later on.
# NOTE: unpickling can execute arbitrary code -- only load files you trust.
with open("course_list_20181029.pkl", mode='rb') as pickleFile:
    pickled_course_urls = pickle.load(pickleFile)
all_course_urls_set = list(pickled_course_urls)

# Notebook display: number of course URLs loaded
len(all_course_urls_set)

## In case you have already extracted the data and need to extract only the additional course data

In [None]:
# with open("course_list_older_version.pkl", 'rb') as pickleFile:
#     all_course_urls_set_older_v = list(pickle.load(pickleFile))

# with open("course_list_newer_version.pkl", 'rb') as pickleFile:
#     all_course_urls_set_newer_v = list(pickle.load(pickleFile))

# additional_courses_urls = list(set(all_course_urls_set_newer_v) - set(all_course_urls_set_older_v))

# Extract course features 

In [None]:
# Keys of the dict returned by get_basic_course_details_ML, in the order in
# which the CSV columns are written. Every key is pre-seeded with None so that
# a page which can only be parsed partially still produces a full-width row.
_COURSE_FIELDS = (
    'course_name', 'course_summary', 'course_description', 'num_of_staff',
    'course_duration', 'workload', 'course_start_date', 'is_course_active',
    'subject_area', 'course_level', 'course_platform', 'university',
    'num_of_univs', 'country', 'is_graded', 'assessment_type', 'language',
    'num_of_langs', 'certificate', 'certificate_fee', 'discussion_forum',
    'collaborative_project', 'additional_material', 'peer_assessment',
    'video_lectures', 'audio_lectures', 'average_rating_10', 'num_of_votes',
    'course_url',
)

_MONTH_ABBREVIATIONS = ('Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                        'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec')

# Placeholder link text the site uses instead of individual instructor names.
_VARIOUS_INSTRUCTORS = 'Various Instructors'

# (result key, CSS class of the <i> icon that signals the feature)
_ICON_FEATURES = (
    ('discussion_forum', 'fa fa-comment'),
    ('collaborative_project', 'fa fa-cogs'),
    ('additional_material', 'fa fa-book'),
    ('peer_assessment', 'fa fa-group'),
    ('video_lectures', 'fa fa-film'),
    ('audio_lectures', 'fa fa-headphones'),
)


def _text_of(tag):
    """Return ``tag.get_text()``, or None when the tag was not found."""
    return tag.get_text() if tag is not None else None


def _find_in(parent, *args, **kwargs):
    """Run ``parent.find(...)``; return None when ``parent`` itself is None."""
    return parent.find(*args, **kwargs) if parent is not None else None


def _joined_link_texts(container, href_pattern):
    """Collect the texts of the links under ``container`` matching ``href_pattern``.

    Returns a ``(joined_text, count)`` pair: ``(None, None)`` when the
    container is missing, ``(None, 0)`` when it holds no matching link, and
    otherwise every link text followed by "; " plus the number of links.
    """
    if container is None:
        return None, None
    links = container.findAll("a", {"href": href_pattern})
    if not links:
        return None, 0
    return "".join(link.get_text() + "; " for link in links), len(links)


def _is_course_active(start_text, duration_text):
    """Return 'Yes'/'No' for a dated, week-based course, otherwise None.

    None is returned when the start text does not begin with a month
    abbreviation, when the duration is missing or not expressed in weeks, or
    when either value cannot be parsed.
    """
    if start_text[:3] not in _MONTH_ABBREVIATIONS:
        return None
    if duration_text is None or 'Week' not in duration_text:
        return None
    try:
        # Presumably the date looks like "Oct 29th 2018": the last two
        # characters of the day token (the ordinal suffix) are spliced into
        # the format string as literal text.
        ordinal_suffix = start_text.split(" ")[1][-2:]
        started_on = datetime.strptime(start_text, '%b %d' + ordinal_suffix + ' %Y')
        duration_days = int(duration_text.split(" ")[0]) * 7
    except (IndexError, ValueError):
        # Unparseable date or duration: report "unknown" instead of failing
        # the whole course.
        return None
    elapsed_days = (datetime.now() - started_on).days
    # A negative value means the course has not started yet.
    return 'Yes' if 0 <= elapsed_days < duration_days else 'No'


def _final_location(url, max_redirects=10):
    """Follow HTTP redirects by hand and return the last URL reached.

    Any ``requests`` failure returns the URL that was being requested at that
    point. At most ``max_redirects`` hops are followed, so a redirect loop
    cannot recurse or spin forever.
    """
    from urllib.parse import urljoin

    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',}
    for _ in range(max_redirects):
        try:
            response = requests.get(url, timeout=3, allow_redirects=False, headers=headers)
        except requests.exceptions.RequestException:
            # Base class of HTTPError, ConnectionError and Timeout.
            return url
        location = response.headers.get("Location")
        if not location:
            return response.url
        # Location may be relative; resolve it against the URL just requested.
        url = urljoin(response.url, location)
    return url


def get_basic_course_details_ML(url):
    """Download one course page and extract its features.

    Args:
        url: Address of the course page to scrape.

    Returns:
        A dict holding every key of ``_COURSE_FIELDS`` in that order. A value
        is None when the corresponding element is absent from the page (the
        rating and vote count fall back to 0, and the icon-based features to
        'No'). If parsing stops on an unexpected AttributeError, the error is
        printed and the keys not reached yet keep their None value.

    Raises:
        urllib.error.URLError: The page could not be fetched (this includes
            HTTPError). The error is printed, then re-raised so the caller can
            record the URL.
        IndexError: The page has no ``<h1 class="page-title">`` element.
    """
    from urllib.error import URLError

    basic_course_info = dict.fromkeys(_COURSE_FIELDS)

    try:
        html_req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
        # The context manager closes the HTTP response once the body is read.
        with urlopen(html_req, context=ctx) as response:
            html = response.read()
    except URLError as e:
        print(e)
        raise

    try:
        soup = BeautifulSoup(html, 'html.parser')

        # Course Name - Constant
        basic_course_info['course_name'] = soup.findAll("h1", {"class": "page-title"})[0].get_text()

        # Course Summary
        summary_div = soup.find("div", {"class": "intro-curso"})
        basic_course_info['course_summary'] = _text_of(_find_in(summary_div, "p"))

        # Course Description: every paragraph of the course body, one per line
        course_body = soup.find("div", {"id": "corpoCurso"})
        if course_body is not None:
            basic_course_info['course_description'] = "".join(
                para.get_text() + "\n" for para in course_body.findAll("p"))

        # Number of Staff - {Various, Etc}
        staff_div = soup.find("div", {"class": "field-instructors campo-info"})
        staff_names = []
        if staff_div is not None:
            staff_links = staff_div.findAll("a", {"href": re.compile(r"/instructor/+")})
            staff_names = [link.get_text() for link in staff_links]
        if not staff_names or staff_names[0] == _VARIOUS_INSTRUCTORS:
            basic_course_info['num_of_staff'] = None
        else:
            # Compare link texts (not Tag objects) so the placeholder entry is
            # really excluded from the head count.
            basic_course_info['num_of_staff'] = sum(
                1 for name in staff_names if name != _VARIOUS_INSTRUCTORS)

        # Course duration {5 weeks, 5 Sessions, Course Not Available, Self-paced}
        basic_course_info['course_duration'] = _text_of(
            soup.find("a", {"href": re.compile(r"/length/+")}))

        # Course workload in hours per week {Range, Self-Study, Number, No Info}
        basic_course_info['workload'] = _text_of(
            soup.find("a", {"href": re.compile(r"/estimated-effort/+")}))

        # Course Start Date - Date, Self Paced
        start_date_div = soup.find("div", {"class": "field field-name-field-start-date-text field-type-text field-label-hidden"})
        start_date_text = _text_of(_find_in(start_date_div, "div", {"class": "field-item even"}))
        basic_course_info['course_start_date'] = start_date_text
        if start_date_text is not None:
            basic_course_info['is_course_active'] = _is_course_active(
                start_date_text, basic_course_info['course_duration'])
        else:
            basic_course_info['is_course_active'] = None

        # Subject area
        basic_course_info['subject_area'], _ = _joined_link_texts(
            soup.find("div", {"class": "field-categories campo-info"}),
            re.compile(r"/categories/+"))

        # Course level
        basic_course_info['course_level'] = _text_of(
            soup.find("a", {"href": re.compile(r"/course-level/+")}))

        # Course base learning platform
        platform_div = soup.find("div", {"class": "field-initiative campo-info"})
        basic_course_info['course_platform'] = _text_of(
            _find_in(platform_div, "a", {"href": re.compile(r"/initiative/+")}))

        # University
        basic_course_info['university'], basic_course_info['num_of_univs'] = _joined_link_texts(
            soup.find("div", {"class": "field-university-entity campo-info"}),
            re.compile(r"/university-entity/+"))

        # Country
        country_div = soup.find("div", {"class": "field-country campo-info"})
        basic_course_info['country'] = _text_of(
            _find_in(country_div, "a", {"href": re.compile(r"/countries/+")}))

        # Is the course graded?
        assessment_text = _text_of(soup.find("a", {"href": re.compile(r"/exam/+")}))
        if assessment_text is not None:
            basic_course_info['is_graded'] = assessment_text[0:3].strip()
        else:
            basic_course_info['is_graded'] = None
        basic_course_info['assessment_type'] = assessment_text

        # Course Language
        basic_course_info['language'], basic_course_info['num_of_langs'] = _joined_link_texts(
            soup.find("div", {"class": "field-language campo-info"}),
            re.compile(r"/language/+"))

        # Course Certificate
        certificate_text = _text_of(soup.find("a", {"href": re.compile(r"/certificate/+")}))
        if certificate_text is None:
            basic_course_info['certificate'] = None
        elif 'Paid' in certificate_text:
            basic_course_info['certificate'] = 'Paid'
        elif 'Free' in certificate_text:
            basic_course_info['certificate'] = 'Free'
        elif 'No' in certificate_text and certificate_text != 'No certificate information':
            basic_course_info['certificate'] = 'No'
        else:
            basic_course_info['certificate'] = None

        # Course Certificate Fee ("<price> <currency>")
        fee_div = soup.find("div", {"class": "field-certificate-price campo-info"})
        fee_amount = _text_of(_find_in(
            _find_in(fee_div, "div", {"class": "field field-name-field-certificate-price field-type-number-decimal field-label-hidden"}),
            "div", {"class": "field-item even"}))
        fee_currency = _text_of(_find_in(
            _find_in(fee_div, "div", {"class": "field field-name-field-certificate-price-currency field-type-taxonomy-term-reference field-label-hidden"}),
            "div", {"class": "field-item even"}))
        if fee_amount is not None and fee_currency is not None:
            basic_course_info['certificate_fee'] = fee_amount + " " + fee_currency
        else:
            basic_course_info['certificate_fee'] = None

        # Icon-based features (forum, project, material, peer assessment,
        # video and audio lectures).
        # NOTE(review): a feature counts as present only when its icon occurs
        # exactly once on the page; repeated icons yield 'No' -- confirm that
        # this is intended.
        for feature_key, icon_class in _ICON_FEATURES:
            icon_count = len(soup.findAll("i", {"class": icon_class}))
            basic_course_info[feature_key] = 'Yes' if icon_count == 1 else 'No'

        # Average rating (Rating, Number of Votes); 0 when not shown
        rating_text = _text_of(_find_in(
            _find_in(course_body, "span", {"class": "average-rating"}), "span"))
        basic_course_info['average_rating_10'] = rating_text if rating_text is not None else 0

        votes_text = _text_of(_find_in(
            _find_in(course_body, "span", {"class": "total-votes"}), "span"))
        basic_course_info['num_of_votes'] = votes_text if votes_text is not None else 0

        # Course URL - the address reached after following redirects
        course_link = _find_in(soup.find("div", {"class": "row gotoCurso"}), "a")
        course_href = course_link.get('href') if course_link is not None else None
        if course_href is not None:
            basic_course_info['course_url'] = _final_location(course_href)
        else:
            basic_course_info['course_url'] = None

    except AttributeError as e:
        # Safety net for page layouts not anticipated above: keep what has
        # been extracted so far; the remaining keys stay None.
        print(e)

    return basic_course_info

## For each course URL, extract the features and append it to CSV file

In [None]:
# RENAME the output file for the date you are extracting -- mooc_list_data_YEARMONTHDAY.csv
# NOTE: This takes a long time because every course page is fetched in turn.
# Please make sure you have an uninterrupted internet connection.

# (CSV column header, key of the dict returned by get_basic_course_details_ML).
# Rows are built by looking up each key explicitly, so a dict that is missing
# some keys still yields a row whose values sit under the right headers.
# NOTE(review): the header 'Unievrsities' is misspelled; it is kept unchanged
# because existing consumers of the CSV may rely on it.
csv_columns = [
    ('Course Name', 'course_name'),
    ('Course Summary', 'course_summary'),
    ('Course Description', 'course_description'),
    ('Number of Staff', 'num_of_staff'),
    ('Course Duration', 'course_duration'),
    ('Workload (Hours per Week)', 'workload'),
    ('Course Start Date', 'course_start_date'),
    ('Is_Course_Active', 'is_course_active'),
    ('Subject Area', 'subject_area'),
    ('Course Level', 'course_level'),
    ('Course Platform', 'course_platform'),
    ('Unievrsities', 'university'),
    ('Number of Universities', 'num_of_univs'),
    ('Country', 'country'),
    ('Is_Graded', 'is_graded'),
    ('Assessment Type', 'assessment_type'),
    ('Language', 'language'),
    ('Number of Languages', 'num_of_langs'),
    ('Certificate', 'certificate'),
    ('Certificate Fee', 'certificate_fee'),
    ('Discussion Forum', 'discussion_forum'),
    ('Collaborative Project', 'collaborative_project'),
    ('Additional Material', 'additional_material'),
    ('Peer Assessment', 'peer_assessment'),
    ('Video Lectures', 'video_lectures'),
    ('Audio Lectures', 'audio_lectures'),
    ('Average Rating (10)', 'average_rating_10'),
    ('Number of Votes', 'num_of_votes'),
    ('Course URL', 'course_url'),
]

# newline="" is required by the csv module (otherwise extra blank lines appear
# on Windows); UTF-8 lets non-ASCII course text be written on every platform.
with open("mooc_list_data_20181029.csv", "w", newline="", encoding="utf-8") as file_2:
    writer = csv.writer(file_2, dialect='excel')
    writer.writerow([header for header, _ in csv_columns])

    # In case a course page is inconsistent, the course URL is added to this
    # list, which can be inspected later
    had_issues = []

    # START EXTRACT

    for i, course_url in enumerate(all_course_urls_set):
        print(i)
        try:
            course_details = get_basic_course_details_ML(course_url)
            writer.writerow([course_details.get(key) for _, key in csv_columns])
        except Exception:
            # `Exception` rather than a bare `except`, so Ctrl-C
            # (KeyboardInterrupt) can still stop this long-running loop.
            print(course_url)
            had_issues.append(course_url)

    # END EXTRACT

# NOTE : Uncomment the following code if you have to extract only additional courses
# ALSO NOTE : Comment the above code starting from label "START EXTRACT"  to "END EXTRACT"

#     for i, course_url in enumerate(additional_courses_urls) :
#         try :
#             print(i)
#             course_details = get_basic_course_details_ML(course_url)
#             writer.writerow(list(course_details.values()))
#         except :
#             print(course_url)
#             had_issues.append(course_url)