In [51]:
#import required libraries
import requests as rq
from bs4 import BeautifulSoup
import pandas as pd

In [52]:
#defining function for getting HTML content of provided URL
def getHTMLPage(page_url, timeout=30):
    """Download a page and return it parsed with BeautifulSoup.

    Parameters
    ----------
    page_url : str or None
        Address of the page to download. ``None`` and ``''`` are treated as
        "nothing to fetch".
    timeout : float or tuple, optional
        Forwarded to ``requests.get`` so that an unresponsive server cannot
        block the notebook indefinitely. Defaults to 30 seconds.

    Returns
    -------
    BeautifulSoup or None
        The parsed document when the server answers with status 200;
        ``None`` for a missing URL, a non-200 status, or any request/parsing
        error (a message is printed in the last two cases).
    """
    #nothing to download for a missing or empty URL
    if page_url is None or page_url == '':
        return None

    try:
        #retrieving HTML content of URL using requests package
        rs = rq.get(page_url, timeout=timeout)

        #only a 200 response is parsed
        if rs.status_code == 200:
            return BeautifulSoup(rs.text, 'html.parser')

        #print message for failed URL
        print(f'Received failed from server for URL: {page_url}')

    #Exception (not BaseException) so that Ctrl-C / kernel interrupt still
    #stops a long scraping run instead of being reported and ignored
    except Exception as err:
        #print runtime error
        print(f'Unexpected {err=}, {type(err)=}')

    return None

In [53]:
#defining function for extracting all programming languages from retrieved content
def extractProgrammingLanguages(html_content):
    """Collect language names and their article URLs from the list page.

    Every ``<div class="div-col">`` is inspected; for the direct ``<li>``
    children of its first ``<ul>``, the first ``<a>`` supplies the language
    name (its text) and the article address (its ``href``).

    Parameters
    ----------
    html_content : BeautifulSoup or None
        Parsed list page.

    Returns
    -------
    dict or None
        ``{language name: absolute URL}``. A name that occurs more than once
        keeps the URL seen last. ``None`` when ``html_content`` is ``None`` or
        an unexpected error occurs (the error is printed).
    """
    #verify if parameter is NONE
    if html_content is None:
        return None

    #function-scope import keeps this cell independent of the import cell
    from urllib.parse import urljoin

    #prefix URL for wikipedia
    fixed_url = 'https://en.wikipedia.org'

    #dictionary for storing language name and information URL
    p_dict = {}

    try:
        #iterate over all the div's (a to z) list
        for div in html_content.find_all("div", attrs={"class": "div-col"}):

            #a column without a list is skipped instead of aborting everything
            ul = div.ul
            if ul is None:
                continue

            #only real <li> children; whitespace/text nodes are not visited
            for li in ul.find_all('li', recursive=False):

                #find a tag within li
                a = li.find('a')
                if a is None:
                    continue

                #an anchor without a target cannot be followed later
                href = a.get('href')
                if not href:
                    continue

                #urljoin resolves relative links and leaves absolute ones intact
                p_dict[a.text] = urljoin(fixed_url, href)

    except Exception as err:
        #print runtime error
        print(f'Unexpected {err=}, {type(err)=}')
        return None

    return p_dict

In [54]:
#defining function for extracting infobox of programming language
def extractInfobox(html_content):
    """Return the infobox table of a language article, if there is one.

    Parameters
    ----------
    html_content : BeautifulSoup or None
        Parsed article page.

    Returns
    -------
    Tag or None
        The first ``<table>`` whose class attribute is ``"infobox vevent"``;
        ``None`` when ``html_content`` is ``None``, no such table exists, or
        the lookup raises (the error is printed).
    """
    #verify if parameter is NONE
    if html_content is None:
        return None

    try:
        #find() already yields None when nothing matches, so its result can
        #be handed back directly
        return html_content.find("table", attrs={"class": "infobox vevent"})

    #Exception (not BaseException) so a kernel interrupt is not swallowed
    except Exception as err:
        #print runtime error
        print(f'Unexpected {err=}, {type(err)=}')

    return None

In [55]:
#defining function for extracting infobox details
def extractInfoboxDetail(table, keys):
    """Read the requested rows of an infobox table into a dictionary.

    For each ``<tr>`` the heading is the text of the first ``<a>`` inside the
    row's ``<th>`` or, when there is no link, the text of the ``<th>`` itself
    (non-breaking spaces become ordinary spaces). Rows whose heading is listed
    in ``keys`` contribute one entry: the stripped, non-empty ``<td>`` texts
    joined together, with every ``,`` replaced by `` & ``.

    Parameters
    ----------
    table : Tag or None
        Infobox table element.
    keys : container of str
        Row headings to keep.

    Returns
    -------
    dict or None
        ``{heading: value}`` for the matching rows; ``None`` when ``table`` is
        ``None`` or no row matched. If an error occurs part-way, it is printed
        and whatever was collected before it is returned.
    """
    #verify if parameter is NONE
    if table is None:
        return None

    #dictionary for storing table information
    data = {}

    try:
        #markup without an explicit <tbody> has its rows directly under the
        #table, so fall back to the table instead of failing on None
        table_body = table.find('tbody')
        if table_body is None:
            table_body = table

        #iterate over all the rows
        for row in table_body.find_all('tr'):

            #rows without a heading cell can never match a key
            thead = row.find('th')
            if thead is None:
                continue

            #prefer the linked label when the heading contains an anchor
            a_head = thead.find('a')
            label = thead if a_head is None else a_head

            #replace hexacode of space with a regular space
            text = label.text.replace(u'\xa0', u' ')

            #check if row heading is one of the required keys
            if text not in keys:
                continue

            #strip column texts, drop the empty ones, normalise spaces
            cols = [ele.text.strip() for ele in row.find_all('td')]
            e_list = [ele.replace(u'\xa0', u' ') for ele in cols if ele]

            #assign row value to dictionary
            data[text] = ''.join(e_list).replace(u',', u' & ')

    #Exception (not BaseException) so a kernel interrupt is not swallowed
    except Exception as err:
        #print runtime error
        print(f'Unexpected {err=}, {type(err)=}')

    #an empty result is reported as None, matching the "nothing found" contract
    return data or None

In [56]:
#defining function for generating CSV
def processAndGenerateCSV(p_list, keys, MAX = 50, output_path = 'Languages.csv'):
    """Scrape the infobox of each listed language and write the result as CSV.

    Parameters
    ----------
    p_list : BeautifulSoup or None
        Parsed "list of programming languages" page.
    keys : container of str
        Infobox row headings to export as columns.
    MAX : int or None, optional
        Maximum number of languages to process (default 50). ``None`` removes
        the limit.
    output_path : str or path-like, optional
        Destination of the CSV file (default ``'Languages.csv'``).

    Returns
    -------
    str or None
        ``'Process completed'`` after the file was written,
        ``'No programming languages available for process'`` when ``p_list``
        is ``None``, ``'No Language information available'`` when no language
        could be extracted, and ``None`` when an unexpected error occurred
        (the error is printed).
    """
    response = None

    #wrapping code in try except for error catching
    try:
        #use the argument that was passed in, not a notebook-level variable
        if p_list is None:
            return 'No programming languages available for process'

        #extracting all the languages from HTML content
        lang_dict = extractProgrammingLanguages(p_list)

        #verify if language details are extracted
        if not lang_dict:
            return "No Language information available"

        #rows variable for storing language details
        rows = []

        #iterate over all the languages information
        for itr, (language, url) in enumerate(lang_dict.items()):

            #stop once exactly MAX languages have been handled
            if MAX is not None and itr >= MAX:
                break

            #get HTML content of language using its URL and pick its info table
            table = extractInfobox(getHTMLPage(url))

            #extract table information only when a table is available
            details = extractInfoboxDetail(table, keys) if table is not None else None

            #languages without usable details still get a row with their name
            info_dict = dict(details) if details else {}
            info_dict['Language'] = language

            #append information to rows list
            rows.append(info_dict)

        #generate dataframe of information and write csv using pandas
        pd.DataFrame(rows).to_csv(output_path, index=False)

        #set response
        response = "Process completed"

    #Exception (not BaseException) so a kernel interrupt stops the run
    except Exception as err:
        #print runtime error
        print(f'Unexpected {err=}, {type(err)=}')

    #return response
    return response

In [57]:
#all required fields of language information
keys = ['First appeared', 'Developer', 'Designed by', 'Platform', 'OS', 'Website']

#MAX iteration defined for limited processing
MAX = 50

#defining MAIN URL for programming language list
MAIN_URL = 'https://en.wikipedia.org/wiki/List_of_programming_languages'

#getting HTML content of language list page
pl_list = getHTMLPage(MAIN_URL)

#pass MAX explicitly so that changing the setting above actually takes effect
print(processAndGenerateCSV(pl_list, keys, MAX=MAX))

Process completed
