In [1]:
# General packages
import requests
import json
import time
import datetime
import random
import os
import sys

# data / numerical handling
import pandas as pd
import numpy as np

# data visualization
import seaborn as sb
import matplotlib as mp

# Scraping libraries
from bs4 import BeautifulSoup as bs4

# Scraping functions
from evtolnews_scrapefuncs import *


# Overview

#### About
This notebook contains code for webscraping data on eVTOL / UAM aircraft from https://evtol.news/

#### Packages
* [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/bs4/doc/) 
* Standard libraries included with the Python 3 Anaconda distribution.

#### Files
* Jupyter Notebook for analysis: evtolnews_scrape.ipynb
* Python files for custom functions
    * evtolnews_scrapefuncs.py
    
* .csv files for reading and writing dataframes
    * evtolnews_directory_{date}.csv --> dataframe of aircraft links (to scrape)
    * evtolnews_results_df_{date}.csv --> dataframe of scraped and cleaned aircraft data
* .jpeg flowchart images
    * evtolnews_overview_flowchart.jpeg
    * evtolnews_append_logic_flowchart.jpeg

# Program Flowchart

![](evtolnews_overview_flowchart.jpeg)

# Part I -- Scrape eVTOL News Directory

## Build dataframe

* The unique evtol.news/... link is used as the index / identifier for each aircraft
* Example: https://evtol.news/embraer-pulse-concept

In [2]:
# Build the aircraft directory dataframe (the preview below shows 'links' and 'category' columns).
# get_acdirectory is presumably provided by the evtolnews_scrapefuncs star import -- confirm.
evtolnews_directory = get_acdirectory()

In [3]:
# Preview the first rows of the directory
evtolnews_directory.head()

Unnamed: 0,links,category
0,https://evtol.news/a2-cal-aptos-blue,Vectored Thrust
1,http://evtol.news/a3-by-airbus/,Vectored Thrust
2,https://evtol.news/acs-aviation-z-300/,Vectored Thrust
3,https://evtol.news/advanced-research-foundatio...,Vectored Thrust
4,https://evtol.news/advanced-system-engineering...,Vectored Thrust


## Inspect dataframe 

* number of duplicate links, if any
* total number of aircraft (links)
* number of aircraft categories
* category counts

In [4]:
# Number of repeated links (0 means every aircraft link is unique)
dup_count = evtolnews_directory['links'].duplicated().sum()
print("There are:", dup_count, "duplicate aircraft links")

print("There are:", len(evtolnews_directory), "aircraft")

# Aircraft per category, most common first; computed once and reused below
category_counts = evtolnews_directory['category'].value_counts()
print("There are:", len(category_counts), "aircraft categories")

# show aircraft category counts
category_counts

There are: 0 duplicate aircraft links
There are: 482 aircraft
There are: 5 aircraft categories


Vectored Thrust                        157
Wingless (Multicopter)                 132
Hover Bikes/Personal Flying Devices     82
Lift + Cruise                           78
Electric Rotorcraft                     33
Name: category, dtype: int64

## Optional -- send to .csv

* Sends updated current aircraft directory dataframe to .csv with today() date stamp

In [5]:
# Write the directory to a .csv stamped with today's date (month.day.year)
today = datetime.date.today()
directory_csv = 'evtolnews_directory_{}.{}.{}.csv'.format(today.month, today.day, today.year)
evtolnews_directory.to_csv(directory_csv)

# Part II - Scrape eVTOL.news Aircraft Pages

The code below loops through the links in the 'evtolnews_directory' dataframe and scrapes additional data for each aircraft, building a new dataframe, 'new_acdf' which is updated at a specified increment by the 'pandas.DataFrame.append()' method.

# Get aircraft data

Loop through aircraft links to append results dataframe with additional aircraft data

### Dataframe append logic flowchart

![](evtolnews_append_logic_flowchart.jpeg)

## Instantiate empty dataframe

To be populated with data in the scrape functions.

If you are updating an existing dataframe, skip this step.

In [7]:
# Column schema of the results dataframe
results_df_cols = [
    'link', 'category', 'name', 'oem', 'model', 'status',
    'aircraft_website', 'address', 'about', 'resources', 'specs',
]

# Empty frame that carries only the column schema (no rows yet)
results_df = pd.DataFrame(columns=results_df_cols)
results_df.head()

Unnamed: 0,link,category,name,oem,model,status,aircraft_website,address,about,resources,specs


## Run the cell below to scrape aircraft data

In [9]:
# PARAMETERS of scrape_appendnew(start_ind, stop_ind, df, results_df)
#
# start_ind  --> position in the 'links' column at which to start checking
# stop_ind   --> position in the 'links' column at which to stop checking
# df         --> source dataframe holding the 'links' and 'category' columns
# results_df --> dataframe to be updated and returned

# WARNING -- scraping takes roughly 10 seconds per link.
# For a large directory, work through it in small chunks.

# This runs a small batch: start_ind = 0, stop_ind = 2.
new_acdf = scrape_appendnew(0, 2, evtolnews_directory, results_df)

start time:  Thu Jul  8 18:23:45 2021
end time:  Thu Jul  8 18:24:04 2021

total runtime:  19.362569332122803


## How many left to scrape

In [74]:
# Links already present in the results dataframe; a set gives O(1) membership
# tests instead of rebuilding a list for every directory link.
# NOTE(review): if scrape_appendnew returns a new frame rather than mutating
# results_df, compare against its return value (new_acdf) instead -- confirm.
scraped_links = set(results_df['link'])

# Directory links that have not been scraped yet. The directory frame is
# evtolnews_directory; no frame named `df` is defined in the cells above.
remaining_links = [link for link in evtolnews_directory['links'] if link not in scraped_links]

for link in remaining_links:
    print(link)

count = len(remaining_links)
print('there are: ', count, 'links left to scrape')

http://evtol.news/aeroxo-lv-era-aviabike/
https://evtol.news/daymak-avvenire-skyrider
there are:  2 links left to scrape


## Send to .csv

In [77]:
# builds .csv with today's date
# today = datetime.date.today()
# results_df.to_csv('results_df_{}.{}.{}.csv'.format(today.month, today.day, today.year))

# Part III - Optional -- update existing df

### Read in existing dataframes

In [7]:
# read in stored .csv on local machine
# verify file name date, change as needed
#
# 'Unnamed: 0' is the auto-index column written by an earlier to_csv call;
# errors='ignore' turns the drop into a no-op when the file does not have it.
current_df = pd.read_csv('results_df_7.9.2021.csv').drop(columns=['Unnamed: 0'], errors='ignore')
evtolnews_directory = pd.read_csv('evtolnews_directory_7.12.2021.csv').drop(columns=['Unnamed: 0'], errors='ignore')

In [10]:
# Compare the number of rows (aircraft) in the two frames.
# len(df.shape) is the number of dimensions (always 2 for a DataFrame), so
# comparing those would be True regardless of the data.
len(current_df) == len(evtolnews_directory)

True

### Example

#### Criteria:

* We have a dataframe named current_df.

* We want to update current_df with any new aircraft data if it exists.

* Run the functions below.

### Check for updates

In [11]:
# Report whether current_df needs updating relative to evtolnews_directory;
# the returned status message is shown as the cell output.
check_updates(evtolnews_directory, current_df)

'Your df is up to date. No update needed.'

### Update your dataframe

* Use the scrape_appendnew method on your dataframe if it needs to be updated

* adjust start_ind and stop_ind as needed

In [12]:
# WARNING -- scraping takes roughly 10 seconds per link.
# For a large directory, work through it in small chunks.

# Check every directory link (positions 0 .. len) against current_df
updated_df = scrape_appendnew(
    0,
    len(evtolnews_directory),
    evtolnews_directory,
    current_df,
)

start time:  Mon Jul 12 12:09:50 2021
end time:  Mon Jul 12 12:09:59 2021

total runtime:  8.53209400177002


### inspect updated dataframe

In [13]:
# Show the last rows of the updated dataframe
updated_df.tail()

Unnamed: 0,link,category,name,oem,model,status,aircraft_website,address,about,resources,specs
478,http://evtol.news/zuri/,Lift + Cruise,Zuri,Zuri SE,Zuri,active,https://zuri.com/,"Prague, Czech Republic","In 2017, Czech entrepreneur Michal Illich bega...","['https://zuri.com/', 'https://www.facebook.co...",
479,https://evtol.news/aerofex-aero-x,Hover Bikes/Personal Flying Devices,Aerofex Aero-X,Aerofex,Aero-X,active,https://aerofex.com/,"Manhattan Beach, California, USA","Based on Los Angeles, California, USA, Aerofex...","['https://aerofex.com/', 'https://twitter.com/...","['Aircraft type: eVTOL, hybrid-electric VTOL, ..."
480,https://evtol.news/aerofex-aero-x-nautical,Hover Bikes/Personal Flying Devices,Aerofex Aero-X Nautical,Aerofex,Aero-X,active,https://aerofex.com/,"Manhattan Beach, California, USA","Based on Los Angeles, California, USA, Aerofex...","['https://aerofex.com/', 'https://twitter.com/...","['Aircraft type: eVTOL, hybrid-electric VTOL, ..."
481,http://evtol.news/aeroxo-lv-era-aviabike/,Hover Bikes/Personal Flying Devices,Aeroxo LV ERA Aviabike,Aeroxo LV,ERA Aviabike,active,http://aeroxo.ru/,"Riga, Latvia and Moscow, Russia",Aeroxo’s LV ERA Aviabike is a battery powered ...,"['http://aeroxo.ru/', 'http://aviabike.aeroxo....",[]
482,https://evtol.news/next-ifly,Electric Rotorcraft,NeXt iFLY,NeXt UAS,iFLY,active,https://nextuas.com/,,NeXt UAS was founded in 2016 to produce eVTOL ...,"[https://nextuas.com/, https://www.youtube.com...",[Aircraft type: Single passenger eVTOL designe...


### Save updated dataframe to .csv

In [14]:
# Write updated_df to a .csv stamped with today's date (month.day.year)
today = datetime.date.today()
results_csv = 'evtolnews_results_df_{}.{}.{}.csv'.format(today.month, today.day, today.year)
updated_df.to_csv(results_csv)

## Clean Up nulls and NAs

### On Nulls and NA response


##### Pages on evtol.news sometimes deviate from the site's standard template. When that happens, the scrape functions return NAs.

##### Identify the columns with the most null vals and update manually or inspect their sites, and build new functions to pull correctly.

##### NOTE... Getting zero null values is highly unlikely. Some manual updates will be required.


In [17]:
def check_na(updated_df):
    """ Counts NA values per column of a dataframe.

    Params:

    updated_df --> dataframe to inspect

    Returns:

    dict of {column: NA count} for every column with at least one NA value,
    ordered by ascending NA count (the column with the most NAs comes last).
    If no column has NA values, returns the string "No NA values in the dataframe".
    """

    # isna().sum() counts the NA cells of each column directly. This avoids
    # indexing a boolean value_counts() result with the integer 1, and it also
    # reports columns that are entirely NA.
    na_counts = updated_df.isna().sum()
    na_response = {col: int(n_missing) for col, n_missing in na_counts.items() if n_missing > 0}

    if not na_response:
        return "No NA values in the dataframe"

    # sort dict by NA count, smallest first
    return dict(sorted(na_response.items(), key=lambda item: item[1]))
    
def check_mostna(na_dict):
    """ Finds the column(s) with the highest NA count.

    Params:

    na_dict --> dict of {column: NA count}, e.g. the dict returned by check_na()

    Returns:

    list of (column, count) tuples for every column tied at the maximum count,
    in the dict's iteration order. Returns an empty list when na_dict is empty
    or is not a dict (check_na() returns a message string when nothing is missing).
    """

    if not isinstance(na_dict, dict) or not na_dict:
        return []

    # compute the maximum once instead of once per item
    top_count = max(na_dict.values())
    return [(col, count) for col, count in na_dict.items() if count == top_count]

In [18]:

def update_na(cols_list, aircraft_link, df):

    """

    Re-scrapes one aircraft page and writes the freshly scraped values for the
    requested columns into that aircraft's row of the dataframe.

    Params:

    cols_list --> list of cols to update. Example: ['oem', 'model']
    Supported: 'specs', 'resources', 'oem', 'model', 'aircraft_website',
    'address', 'about'.
    ** suggested use: one column at a time for easy error isolation.

    aircraft_link --> index: str(link) to an aircraft site

    df --> dataframe to update. It is modified IN PLACE: if its index is not
    already named 'link', the 'link' column becomes the index.

    Returns:

    The same df object, after the update (callers that rely on the in-place
    modification can ignore the return value).

    Raises:

    KeyError if cols_list contains a column with no associated scrape function.

    """

    # the update below aligns on the index, so rows must be keyed by link
    if df.index.name != 'link':
        df.set_index('link', inplace=True)

    # get soup object
    acsoup = get_bs4(aircraft_link)
    core_data = get_coredata(acsoup)

    # Map each column to a zero-argument callable, so that only the scrape
    # functions for the requested columns are run.
    # 'specs' and 'resources' are wrapped in a one-element list so the whole
    # scraped value lands in a single cell of the one-row frame built below.
    scrapers = {
        'specs': lambda: [get_acspecs(acsoup)],
        'resources': lambda: [get_acresources(acsoup)],
        'oem': lambda: get_acoem(core_data),
        'model': lambda: get_acmodel(core_data),
        'aircraft_website': lambda: get_acextlink(core_data),
        'address': lambda: get_acaddress(core_data),
        'about': lambda: get_acabout(core_data, acsoup),
    }

    # run functions for new col data, store in dict
    nadict = {col: scrapers[col]() for col in cols_list}

    # create single-row pd.df keyed by the aircraft link
    naupdate = pd.DataFrame(nadict, index=[aircraft_link])

    # update dataframe row in place (DataFrame.update copies only non-NA values)
    df.update(naupdate)

    return df
    

## Updating a dataframe with na values -- tutorial

* using the update_na() function to update an existing dataframe with null values

#### I -- Check dataframe null values, by column and count

Response tells which columns have null values and how many rows per column

In [19]:
# NA counts per column of updated_df, as returned by check_na(); shown as the cell output
na_cols = check_na(updated_df)
na_cols

{'model': 19, 'about': 33, 'address': 48, 'aircraft_website': 52, 'specs': 414}

#### II -- Get indices for rows in cols with null values

In [20]:
# check_na() returns a message string instead of a dict when nothing is missing
if isinstance(na_cols, str):
    na_cols = []

# Names of the columns that contain NAs, leaving out 'specs'.
# Iterating na_cols directly works whether it is still the check_na() dict or
# already the filtered list, so this cell can be re-run without error.
na_cols = [col for col in na_cols if col != 'specs']
na_cols

['model', 'about', 'address', 'aircraft_website']

In [22]:
# get the row positions of aircraft with a null value in any of na_cols.
# The mask is reduced to one boolean per row first: np.where on the 2-D mask
# returns (row positions, column positions), and taking np.unique over that
# pair would mix column positions in with the row positions.
rows_with_na = updated_df[na_cols].isna().any(axis=1)
na_index = np.where(rows_with_na)[0]
print(len(na_index), 'total rows have null values.')

# build a new subset dataframe for na_df values
na_df = updated_df.iloc[na_index, :]

85 total rows have null values.


#### III -- Build the list of links for rows with null values

*Note: only the specified columns are updated.*

In [219]:
# links of the aircraft rows in na_df (the rows selected for having null values)
links = list(na_df['link'])

In [221]:
# number of links that will be re-scraped
len(links)

112

#### IV -- Loop through null value links, update dataframe

In [222]:
for link in links:
    # progress output: the link currently being processed
    print(link)
    
    # re-scrape this link and update the na_cols columns of updated_df in place
    # (update_na also sets updated_df's index to 'link' if it is not already)
    update_na(na_cols, link, updated_df)
    # pause 5 seconds between requests to prevent IP tagging
    time.sleep(5)

https://evtol.news/daymak-avvenire-skyrider
https://evtol.news/aeroxo-lv-era-aviabike/
https://evtol.news/ali-technologies-xturismo/
https://evtol.news/ali-technologies-hover-bike/
https://evtol.news/a2-cal-aptos-blue
http://evtol.news/aergility-atlis/
https://evtol.news/aerial-vehicle-automation-winged-x8/
http://evtol.news/aerodyne-vector/
http://evtol.news/airisone/
http://evtol.news/airspacex/
http://evtol.news/alakai-technologies-skai/
https://evtol.news/assen-aerospace-a2-avenger/
http://evtol.news/passenger-drone/
http://evtol.news/athena-aero/
http://evtol.news/autoflightx-bat600/
http://evtol.news/autonomous-flight/
http://evtol.news/avianovations-hepard/
https://evtol.news/baykar-cezeri/
https://evtol.news/caps
https://evtol.news/china-helicopter-research-and-development-institute-electric-helicopter/
http://evtol.news/collaborativebee-mini-bee/
https://evtol.news/colugo-systems
https://evtol.news/copterpack-copterpack
https://evtol.news/CycloTech-Passenger-Demonstrator
http:

In [223]:
# show the rows that were just re-scraped (label lookup by link, so the index must be 'link')
updated_df.loc[links]

Unnamed: 0_level_0,category,name,oem,model,status,aircraft_website,address,about,resources,specs
link,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
https://evtol.news/daymak-avvenire-skyrider,Wingless (Multicopter),Daymak Avvenire Skyrider,Daymak Avvenire,,active,,,,[],
https://evtol.news/aeroxo-lv-era-aviabike/,Hover Bikes/Personal Flying Devices,Aeroxo LV ERA Aviabike,Aeroxo LV,ERA Aviabike,active,http://aeroxo.ru/,"Riga, Latvia and Moscow, Russia",Aeroxo’s LV ERA Aviabike is a battery powered ...,"[http://aeroxo.ru/, http://aviabike.aeroxo.com...",
https://evtol.news/ali-technologies-xturismo/,Hover Bikes/Personal Flying Devices,ALI Technologies Xturismo,"A.L.I. Technologies, Inc.",Xturismo Limited Edition,active,http://ali.jp/,"A.L.I. Technologies, Inc.",ALI Technologies is a small start-up company e...,"[http://ali.jp/, https://www.facebook.com/pg/%...",
https://evtol.news/ali-technologies-hover-bike/,Hover Bikes/Personal Flying Devices,ALI Technologies Hover Bike,"A.L.I. Technologies, Inc.",Hover Bike,active,http://ali.jp/,"A.L.I. Technologies, Inc.",,"[http://ali.jp/, https://www.facebook.com/pg/%...",
https://evtol.news/a2-cal-aptos-blue,Vectored Thrust,A2-Cal Aptos Blue,A2-Cal,Aptos Blue,active,https://a2-cal.com/,"Berkeley, California, USA",Jean-Francois Clavreul is an aircraft designer...,"[https://a2-cal.com/, https://www.youtube.com/...",
...,...,...,...,...,...,...,...,...,...,...
https://evtol.news/vertical-aerospace-seraph/,Wingless (Multicopter),Vertical Aerospace VA-X2,Vertical Aerospace Ltd.,VA-X2,active,http://www.vertical-aerospace.com,"Bristol, England, United Kingdom","In 2016, Vertical Aerospace Ltd. was founded b...",[],
http://evtol.news/vision-vtol/,Vectored Thrust,Vision VTOL,Vision VTOL,Vision VTOL,active,https://visionvtol.weebly.com/,,The Waters Trust Vision VTOL is a tiltrotor el...,[],[]
http://evtol.news/volerian/,Vectored Thrust,Volerian,Volerian,Volarian (Test Rig),active,https://volerian.com/,,Volarian is a new concept vectored thrust Vert...,"[https://volerian.com/, https://moneyinc.com/n...",[]
https://evtol.news/kitty-hawk-cora/,Lift + Cruise,Wisk (Kitty Hawk) Cora,Wisk (formerly Kitty Hawk Corp.),Cora,active,http://www.cora.aero,"Mountain View, California, USA","On Dec. 2, 2019, it was announced that The Boe...",[],


In [224]:
# (rows, columns) of the updated dataframe
updated_df.shape

(482, 10)

#### Send to .csv

In [23]:
# # builds .csv with today's date
# today = datetime.date.today()
# updated_df.to_csv('evtolnews_results_df_{}.{}.{}.csv'.format(today.month, today.day, today.year))

## Data summary

### Aircraft and OEMs

In [24]:
# Rows whose status is 'active'
active = updated_df[updated_df['status'] == 'active']

# This count covers active aircraft only, so the message says so
print("There are ", len(active), 'active aircraft')

# nunique() skips missing values, so a null 'oem' is not counted as an OEM.
# Counted over all aircraft in updated_df, not only the active ones.
oems_count = updated_df['oem'].nunique()

print("There are ", oems_count, 'unique OEMs')

There are  446 aircraft
There are  281 unique OEMs


### OEMs with multiple active aircraft

In [25]:
# number of active aircraft per OEM
active_oem_counts = active['oem'].value_counts()

# keep OEMs with more than two (i.e. three or more) active aircraft
active_oem_counts[active_oem_counts >2]

Star 8 Green                                              11
Hover                                                     10
Stuttgart Aerospace                                        8
Dahir Insaat                                               7
Autoflight                                                 6
Grug Group LLC                                             6
Heitech                                                    6
Kovacs                                                     5
Terrafugia                                                 5
Lazzarini Design Studio                                    5
FeralGods Design Company                                   5
Gizio                                                      5
EHang                                                      5
Skyworks Aeronautics Corp.                                 4
Ambular                                                    4
Alauda Aeronautics                                         4
Uber Elevate            

### Category counts by active aircraft

In [26]:
# number of active aircraft in each category; shown as the cell output
active_categorized = active['category'].value_counts()
active_categorized

Vectored Thrust                        147
Wingless (Multicopter)                 116
Hover Bikes/Personal Flying Devices     77
Lift + Cruise                           76
Electric Rotorcraft                     30
Name: category, dtype: int64

In [27]:
# Write updated_df to a .csv stamped with today's date (month.day.year)
today = datetime.date.today()
results_csv = 'evtolnews_results_df_{}.{}.{}.csv'.format(today.month, today.day, today.year)
updated_df.to_csv(results_csv)