In [1]:
# Importing necessary libraries

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# Assigning the webpage address to a variable.
# This is the listing page for work-from-home internships that the notebook scrapes.

url = "https://internshala.com/internships/work-from-home-jobs"

In [3]:
# Requesting the url.
# A timeout is passed because requests waits indefinitely by default, which
# would leave the notebook hanging if the connection stalls.

r = requests.get(url, timeout=30)

In [4]:
# Checking if the request was successful (the HTTP status code of the response is displayed)

r.status_code

200

In [5]:
# Parsing the downloaded page with the built-in html parser so that its
# html elements can be searched.

soup = BeautifulSoup(markup=r.content, features="html.parser")

In [6]:
# Collecting every <div class="internship_meta"> element of the page;
# the cells below read the listing details from these elements.

results = soup.find_all('div', class_='internship_meta')

In [7]:
# Position/domain of the first listing, with surrounding whitespace removed

results[0].find('div', class_='heading_4_5 profile').text.strip()

'Business Development (Sales)'

In [8]:
# Extracting internship duration of the first listing.
# The lookup is scoped to the listing's own element rather than the whole page,
# so index 1 always refers to this listing's second "item_body" field. This is
# also how the scraping loop reads the duration of every listing.

results[0].find_all('div', {'class':'item_body'})[1].get_text().strip()

'6 Weeks'

In [9]:
# Stipend text of the first listing, with surrounding whitespace removed

results[0].find('span', class_='stipend').text.strip()

'3000-7000 /month'

In [10]:
# Root address of the site; it is prepended to the relative posting links
# further below to turn them into full urls.

root_url = 'https://internshala.com'

In [11]:
# Relative url of the first internship posting, read from the href of the
# anchor inside the first title block on the page.

rel_url = soup.find('div', class_='heading_4_5 profile').a.get('href')

In [12]:
# Displaying the extracted relative url
rel_url

'/internship/detail/business-development-sales-work-from-home-job-internship-at-ostello-india-private-limited1628754664'

In [13]:
# Number of result pages to scrape, read from the text of the page's
# <span id="total_pages"> element and converted to an integer.

total_pages = int(soup.find('span', id='total_pages').text)
total_pages

209

In [14]:
# Creating empty lists for the different columns we need.
# These lists are filled with the data extracted from each results page and are
# kept the same length: a listing is appended to all four lists or to none.

name = []
duration = []
link = []
stipend = []

# Looping from the first to the last page. range() excludes its stop value, so
# the stop has to be total_pages + 1 for the final page to be scraped as well.

for page_number in range(1, total_pages + 1):

    # Requesting each url and creating a beautiful soup object for each of them.
    # Page-local names are used so that the `r`, `soup` and `results` objects
    # created by the exploratory cells are not overwritten by this loop.
    # The timeout prevents a stalled connection from hanging the whole scrape.

    page_url = 'https://internshala.com/internships/work-from-home-jobs/page-' + str(page_number)
    page_response = requests.get(page_url, timeout=30)
    page_soup = BeautifulSoup(page_response.content, "html.parser")
    listings = page_soup.find_all('div', {'class':'internship_meta'})

    for listing in listings:

        # Reading every field before appending anything, so that an error in
        # one field cannot leave the four lists with different lengths.

        heading = listing.find('div', {'class':'heading_4_5 profile'})
        listing_name = heading.get_text().strip()
        listing_duration = listing.find_all('div', {'class':'item_body'})[1].get_text().strip()
        listing_link = heading.find('a').get('href')

        # If a listing has no stipend element, find() returns None and calling
        # get_text() on it raises AttributeError. Only that case is mapped to
        # 'n/a'; any other error is allowed to surface instead of being hidden.

        try:
            listing_stipend = listing.find('span', {'class':'stipend'}).get_text().strip()
        except AttributeError:
            listing_stipend = 'n/a'

        # Adding the scraped results to the respective lists

        name.append(listing_name)
        duration.append(listing_duration)
        link.append(listing_link)
        stipend.append(listing_stipend)

In [15]:
# Removing the column-width limit so that the whole url/link is shown when a df is displayed

pd.options.display.max_colwidth = None

In [16]:
# Building a dataframe from the four lists of details scraped from the webpage;
# each list becomes one column.

df = pd.DataFrame(dict(name=name, duration=duration, stipend=stipend, link=link))

In [17]:
# Since the loop only extracted the relative url, we need to add the root url to it,
# so that we can directly access the webpage.
# Links that already start with the root url are kept as they are, so running
# this cell more than once does not prepend the root url a second time.

df['link'] = df['link'].where(df['link'].str.startswith(root_url, na=False), root_url + df['link'])

In [18]:
# Previewing the first rows of the scraped data
df.head()

Unnamed: 0,name,duration,stipend,link
0,Business Development (Sales),6 Weeks,3000-7000 /month,https://internshala.com/internship/detail/business-development-sales-work-from-home-job-internship-at-ostello-india-private-limited1628754664
1,Internshala Student Partner,3 Months,,https://internshala.com/internship/detail/internship-at-internshala1628496329
2,Upcycling Ambassador - Social Work Movement (Openings:3000),1 Week,Unpaid,https://internshala.com/internship/details/upcycling-ambassador-social-work-movement-openings3000-work-from-home-job-internship-at-team-everest1628185266
3,Product Delivery Management,6 Months,15000 /month,https://internshala.com/internship/detail/product-delivery-management-work-from-home-job-internship-at-times-internet1628847617
4,YouTube Operations,3 Months,5000 /month,https://internshala.com/internship/detail/youtube-operations-work-from-home-job-internship-at-9x-media-private-limited1628774006


In [19]:
# Filtering the df according to our preferences: 6-month internships whose
# name contains 'Analy' and whose stipend is not 'Unpaid'.
# The filtered df is stored as a new df so that skills data can be extracted from it in the next step.

df_sorted = df.loc[
    df['duration'].eq('6 Months')
    & df['name'].str.contains('Analy')
    & df['stipend'].ne('Unpaid')
]
df_sorted.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 26 entries, 226 to 8213
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   name      26 non-null     object
 1   duration  26 non-null     object
 2   stipend   26 non-null     object
 3   link      26 non-null     object
dtypes: object(4)
memory usage: 1.0+ KB


In [20]:
# Skills data is only extracted from the filtered data as extracting it from all the job postings will take a very long time.
# Exactly one entry is appended per posting, so that `skills` stays aligned with the rows of df_sorted.

skills = []

# The loop variable is deliberately not called `link`: that name holds the list
# of links the main dataframe is built from and must not be overwritten here.

for posting_url in df_sorted['link']:
    # The timeout prevents a stalled connection from hanging the loop.
    posting_response = requests.get(posting_url, timeout=30)
    posting_soup = BeautifulSoup(posting_response.content, "html.parser")

    # Looking up the posting's detail section first, then the skills container inside it.
    detail_view = posting_soup.find('div', {'class':'detail_view'})
    skills_container = None
    if detail_view is not None:
        skills_container = detail_view.find('div', {'class':'round_tabs_container'})

    # Skills are not mentioned in all the listings; such postings get 'n/a'
    # instead of raising an error or being left out of the list.
    if skills_container is None:
        skills.append('n/a')
    else:
        skills.append(skills_container.get_text().strip().replace('\n', ' - '))

In [21]:
# Combining the skills column with the previously created df_sorted so that all the details can be viewed at a time.
# An existing 'skills' column is dropped first: insert() raises a ValueError when the
# column is already present, which would make this cell fail if it is run a second time.

df_sorted = df_sorted.drop(columns='skills', errors='ignore')
df_sorted.insert(2, 'skills', skills)

In [22]:
# Filtering df_sorted even further to match our preferences: only the links of
# postings whose skills mention 'sql' are kept.
# case=False makes the match ignore upper/lower case.

df_sorted.loc[df_sorted['skills'].str.contains('sql', case=False), 'link']

510               https://internshala.com/internship/detail/data-analytics-work-from-home-job-internship-at-angel-broking1628839339
5165               https://internshala.com/internship/detail/business-analysis-work-from-home-job-internship-at-workxmate1628166201
5492              https://internshala.com/internship/detail/business-analytics-work-from-home-job-internship-at-kofluence1628144493
5559             https://internshala.com/internship/detail/product-qa-analysis-work-from-home-job-internship-at-kofluence1628141497
5734    https://internshala.com/internship/detail/business-analytics-work-from-home-job-internship-at-fragma-data-systems1628087083
Name: link, dtype: object