### Web Scraping using Beautiful Soup

Objective: DonorsChoose.org is an organization that collects donations to help public schools in America. Below, we scrape https://www.niche.com/ for additional information about each school (rank, district, grade level, rating, review count, overall Niche grade, enrollment, and student-teacher ratio) so that the DonorsChoose datasets can be enriched and yield more insights.

#### Import libraries

In [0]:
# import libraries
import urllib.request
from bs4 import BeautifulSoup
import requests
import numpy as np
import pandas as pd
import csv
from datetime import datetime
import re

#### Get Data from the URL using BeautifulSoup
@Author : Pragya & Sithara

In [0]:
# Patterns used to pick text nodes out of a result's tagline. Compiled once
# here instead of once per scraped school.
_DISTRICT_RE = re.compile(r"[A-Z]$")
_GRADE_LEVEL_RE = re.compile(r"[A-Z0-9][-]{1}[0-9]")

# A list item containing any element with one of these classes is not an
# ordinary school result and is skipped.
_UNWANTED_CLASSES = ["search-result--featured","illustrated-cta","ad-spot", "ad-spot--mobile", "ad-spot--inactive", "search-related-lists"]

# Number of result pages fetched when the caller does not say otherwise.
_DEFAULT_PAGE_COUNT = 386


def _text_or_nan(node):
  """Return the text of a BeautifulSoup node, or NaN when the node is missing."""
  return node.text if node is not None else np.nan


def _parse_rating(rating_span):
  """Return the star rating encoded in the span's second CSS class, or NaN.

  The last two characters of the second class are read as an integer and
  divided by 10 (presumably a class ending in `45` means 4.5 stars -- verify
  against the live markup).
  """
  if rating_span is None:
    return np.nan
  try:
    return int(rating_span['class'][1][-2:]) / 10
  except (KeyError, IndexError, ValueError):
    # Markup did not have the expected shape; treat the rating as unknown
    # rather than aborting the whole scrape.
    return np.nan


def _fact_value(facts, index):
  """Return the value text of the fact at `index`, or NaN if it is absent."""
  if index >= len(facts):
    return np.nan
  return _text_or_nan(facts[index].find("span", {"class": "search-result-fact__value"}))


def _parse_school(li):
  """Turn one search-result list item into a 10-field tuple.

  Returns None for items that contain any of the unwanted (ad / featured /
  call-to-action) classes. Every field that cannot be found is NaN.
  """
  if li.find_all(True, {"class": _UNWANTED_CLASSES}):
    return None

  schoolName = _text_or_nan(li.find("h2", {"class": "search-result__title"}))
  if isinstance(schoolName, str):
    # Periods are stripped from school names; a missing name stays NaN.
    schoolName = schoolName.replace('.', '')
  schoolRank = _text_or_nan(li.find("div", {"class": "search-result-badge"}))

  # Everything below lives inside the tagline; default to NaN so that a
  # result without a tagline never reuses values from the previous school.
  rating = reviewCount = schoolDistrict = schoolState = schoolGradeLevel = np.nan
  tagline = li.find("ul", {"class": "search-result-tagline"})
  if tagline is not None:
    rating = _parse_rating(tagline.find("span", {"class": "review__stars__icon"}))
    reviewCount = _text_or_nan(tagline.find("span", {"class": "review__stars__number__reviews"}))

    # The district text node is expected to read "<district>, <state>".
    district = tagline.find(string=_DISTRICT_RE)
    if district is not None:
      parts = [part.strip() for part in district.split(',')]
      schoolDistrict = parts[0].replace('Independent', 'Ind')
      if len(parts) > 1:
        schoolState = parts[1]

    level = tagline.find(string=_GRADE_LEVEL_RE)
    if level is not None:
      # Copy into a plain str: a NavigableString keeps a reference to the
      # whole parsed page, which would keep every page tree in memory.
      schoolGradeLevel = str(level)

  fact_list = li.find("ul", {"class": "search-result-fact-list"})
  grade = fact_list.find("div", {"class": "niche__grade"}) if fact_list is not None else None
  schoolGrade = _text_or_nan(grade)

  # The second and third facts are read as student count and
  # student-teacher ratio respectively.
  facts = li.find_all("li", {"class": "search-result-fact-list__item"})
  schoolStrength = _fact_value(facts, 1)
  studentTeacherRatio = _fact_value(facts, 2)

  return (schoolName, schoolRank, schoolDistrict, schoolState, schoolGradeLevel, rating, reviewCount, schoolGrade, schoolStrength, studentTeacherRatio)


def getData(url, num_pages=_DEFAULT_PAGE_COUNT):
  """Scrape school search results from a paginated listing.

  Args:
    url: URL template containing a `{}` placeholder that is filled with the
      page number via `str.format`. The template passed in is the one that
      is fetched.
    num_pages: Number of pages to fetch; pages 1 through `num_pages`
      inclusive are requested. Defaults to 386.

  Returns:
    A list of tuples, one per school, in the order
    (schoolName, schoolRank, schoolDistrict, schoolState, schoolGradeLevel,
    rating, reviewCount, schoolGrade, schoolStrength, studentTeacherRatio).
    Fields missing from the page are `np.nan`. Ad, featured and
    call-to-action items are skipped.

  Raises:
    Whatever `urllib.request.urlopen` raises for a failed request; network
    errors are not caught here.
  """
  textContent = []

  for page_number in range(1, num_pages + 1):
    with urllib.request.urlopen(url.format(page_number)) as response:
      page = response.read()

    # Parse the HTML with the stdlib-backed parser.
    soup = BeautifulSoup(page, 'html.parser')

    for li in soup.find_all("li", {"class": "search-results__list__item"}):
      record = _parse_school(li)
      if record is not None:
        textContent.append(record)

  return textContent


In [0]:
# Listing-URL template for the state to scrape; `{}` is the page-number slot.
# California is used here because that is the state whose schools appear in
# this notebook's recorded output and in the CSV written below.
url = "https://www.niche.com/k12/search/best-schools/s/california/?gradeLevel=pk&gradeLevel=elementary&gradeLevel=middle&gradeLevel=high&type=traditional&type=charter&type=magnet&page={}"
textContent = getData(url)

In [0]:
# Show every scraped record on its own line.
print(*textContent, sep='\n')

('Canyon Crest Academy', '#16 Best Public High Schools in America', 'San Dieguito Union High School District', 'CA', '9-12', 4.5, '495', 'A+', '2,255', '26:1')
('Henry M Gunn High School', '#21 Best Public High Schools in America', 'Palo Alto Unified School District', 'CA', '9-12', 4.0, '260', 'A+', '1,918', '19:1')
('Troy High School', '#42 Best Public High Schools in America', 'Fullerton Joint Union High School District', 'CA', '9-12', 4.0, '794', 'A+', '2,755', '27:1')
('Palo Alto High School', '#51 Best Public High Schools in America', 'Palo Alto Unified School District', 'CA', '9-12', 4.0, '280', 'A+', '1,993', '19:1')
('Miramonte High School', '#57 Best Public High Schools in America', 'Acalanes Union High School District', 'CA', '9-12', 4.0, '190', 'A+', '1,232', '20:1')
('Northwood High School', '#67 Best Public High Schools in America', 'Irvine Unified School District', 'CA', '9-12', 4.0, '457', 'A+', '2,240', '28:1')
('Saratoga High School', '#77 Best Public High Schools in A

#### Create csv and write the data into it.
@Author : Vidhi & Reetika

In [0]:
# Write the scraped records to schoolRating.csv. Mode 'w' REPLACES any
# existing file (it does not append). newline='' is what the csv module
# requires so that no blank rows appear on Windows, and the explicit UTF-8
# encoding matches what pd.read_csv assumes by default when reading it back.
with open('schoolRating.csv', 'w', newline='', encoding='utf-8') as csv_file:
  writer = csv.writer(csv_file)
  writer.writerow(['School Name', 'Rank in America','District','State', 'School Grade Level','Rating','User Reviews','Overall Niche Grade','School Strength','Student Teacher Ratio', 'Extracted Date'])
  # One row per school; the tuple unpacking doubles as a check that every
  # record has exactly the ten expected fields. The last column is the time
  # at which the row was written.
  for schoolName, schoolRank, schoolDistrict, schoolState, schoolGradeLevel, rating, reviewCount, schoolGrade, schoolStrength, studentTeacherRatio in textContent:
    writer.writerow([schoolName, schoolRank, schoolDistrict, schoolState, schoolGradeLevel, rating, reviewCount, schoolGrade, schoolStrength, studentTeacherRatio, datetime.now()])

#### Read the created dataset

In [0]:
# Load the CSV written above into a DataFrame and display its
# (rows, columns) shape.
ratings_csv = 'schoolRating.csv'
df_itemlist = pd.read_csv(ratings_csv)
df_itemlist.shape

(9647, 11)

In [0]:
# Preview the first 45 rows of the loaded dataset.
df_itemlist.head(n=45)

Unnamed: 0,School Name,Rank in America,District,State,School Grade Level,Rating,User Reviews,Overall Niche Grade,School Strength,Student Teacher Ratio,Extracted Date
0,Canyon Crest Academy,#16 Best Public High Schools in America,San Dieguito Union High School District,CA,9-12,4.5,495.0,A+,2255,26:1,2019-03-19 00:43:04.494717
1,Henry M Gunn High School,#21 Best Public High Schools in America,Palo Alto Unified School District,CA,9-12,4.0,260.0,A+,1918,19:1,2019-03-19 00:43:04.494741
2,Troy High School,#42 Best Public High Schools in America,Fullerton Joint Union High School District,CA,9-12,4.0,794.0,A+,2755,27:1,2019-03-19 00:43:04.494751
3,Palo Alto High School,#51 Best Public High Schools in America,Palo Alto Unified School District,CA,9-12,4.0,280.0,A+,1993,19:1,2019-03-19 00:43:04.494760
4,Miramonte High School,#57 Best Public High Schools in America,Acalanes Union High School District,CA,9-12,4.0,190.0,A+,1232,20:1,2019-03-19 00:43:04.494768
5,Northwood High School,#67 Best Public High Schools in America,Irvine Unified School District,CA,9-12,4.0,457.0,A+,2240,28:1,2019-03-19 00:43:04.494776
6,Saratoga High School,#77 Best Public High Schools in America,Los Gatos-Saratoga Joint Union High School Dis...,CA,9-12,4.0,157.0,A+,1305,18:1,2019-03-19 00:43:04.494783
7,Torrey Pines High School,#80 Best Public High Schools in America,San Dieguito Union High School District,CA,9-12,4.0,401.0,A+,2601,27:1,2019-03-19 00:43:04.494791
8,Palos Verdes Peninsula High School,#94 Best Public High Schools in America,Palos Verdes Peninsula Unified School District,CA,9-12,4.0,576.0,A+,2519,26:1,2019-03-19 00:43:04.494821
9,Los Altos High School,#101 Best Public High Schools in America,Mountain View-Los Altos Union High School Dist...,CA,9-12,4.0,263.0,A+,2039,21:1,2019-03-19 00:43:04.494830
