In [45]:
# A demo of collecting data from YouTube.

from selenium import webdriver
from bs4 import BeautifulSoup
import time, datetime, csv


# The url where the data will be collected from.
url = "https://www.youtube.com/results?search_query=energy+justice"

# All the fields of each data entry that I want to collect.
fieldnames = ['username', 'user_url', 'title', 'view_num', 'created_at', 'video_url', 'shortdesc', 'collected_at', 'img_url']

# An array to store the urls of all the videos already written to the csv file; it is used to skip duplicates.
video_urls = []

# A bot can automate the data collection process. A bot imitates how a user browses a web page, and then acquires the
# useful information. Therefore, a bot needs to control a browser. Google has released a driver that enables software
# engineers to control a Chrome application. Please download chromedriver from https://chromedriver.chromium.org/downloads.
# Please make sure you pick the version that matches your installed Chrome browser. The "executable_path" indicates the
# location where you store the driver.
# NOTE(review): "executable_path" is the Selenium 3 way of locating the driver; newer Selenium releases expect a
# Service object instead - confirm which Selenium version this notebook is pinned to.
bot = webdriver.Chrome(executable_path="chromedriver/chromedriver")

try:
    # Input the targeting url to the bot, and the bot will load data from the url.
    bot.get(url)

    # Create a csv file to store the structured data after processing. The "with" statement guarantees that the file
    # is flushed and closed even if the crawl fails half way.
    with open("assets/utube.csv", "w", newline='', encoding="utf-8") as csvfile:

        # Create a writer to write the structured data to the csv file, and write the header first.
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()

        # variable i indicates the number of times that the bot scrolls down the web page. In practice, you might want
        # to develop a different interaction approach to load and view the web pages.
        for i in range(4):

            # Create a document object model (DOM) from the raw source of the crawled web page.
            # Since you are processing a html page, 'html.parser' is chosen.
            soup = BeautifulSoup(bot.page_source, 'html.parser')

            # Capture all the video items using the find_all method.
            # To view the information of the html elements you want to collect, inspect the raw source using Chrome Inspector.
            # [-20:] means only the 20 most recently loaded entries are processed in each round.
            videos = soup.find_all('ytd-video-renderer', class_="style-scope ytd-item-section-renderer")[-20:]

            # iterate and process each video entry.
            for video in videos:

                # Only parsing failures are tolerated here: a missing element makes find() return None
                # (AttributeError), a missing attribute raises KeyError, and a missing meta span raises IndexError.
                # Any other error is a real problem and is allowed to stop the program.
                try:
                    channel_link = video.find("a", class_="yt-simple-endpoint style-scope yt-formatted-string")
                    meta_spans = video.find_all("span", class_="style-scope ytd-video-meta-block")

                    # create a row in the dict format.
                    row = {'video_url': video.find("a", class_="yt-simple-endpoint inline-block style-scope ytd-thumbnail").attrs["href"],
                           'user_url': channel_link.attrs["href"],
                           'username': channel_link.text,
                           'title': video.find("yt-formatted-string", class_="style-scope ytd-video-renderer").text,
                           'view_num': meta_spans[0].text.replace(" views", ""),
                           'created_at': meta_spans[1].text.replace(" ago", ""),
                           'shortdesc': video.find("yt-formatted-string", id="description-text").text,
                           'img_url': video.find("img", class_="style-scope yt-img-shadow").attrs["src"],
                           'collected_at': datetime.datetime.now()}
                except (AttributeError, KeyError, IndexError) as err:
                    print("skipped a video entry that could not be parsed: %r" % (err,))
                    continue

                # if a video has been added to the csv file, it will not be inserted again.
                video_url = row['video_url']
                if video_url in video_urls:
                    print("this video has already been added.")
                    continue

                print(row)
                writer.writerow(row)

                # remember the video_url so that later rounds can skip this video.
                video_urls.append(video_url)

            # Let the bot scroll down to the bottom of the content element; most of the time the bot needs to scroll
            # down to the bottom of the page, like this statement:
            # bot.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            bot.execute_script('window.scrollTo(0,  document.getElementById("content").scrollHeight);')

            # It is very important to let the bot take some rest before it resumes work. Resting AFTER the scroll also
            # gives the newly requested entries time to load before the next round parses the page.
            time.sleep(5)
finally:
    # quit() closes the browser window and also shuts down the chromedriver process, even when the crawl failed.
    bot.quit()

# notify the completion of the program in the console.
print("finished")

{'video_url': '/watch?v=9CfWH25PIp0', 'user_url': '/user/greenbiz', 'username': 'GreenBiz', 'title': 'VERGE Talk: Connecting the dots - why energy justice matters', 'view_num': '636', 'created_at': '3 years', 'shortdesc': '', 'img_url': 'https://i.ytimg.com/vi/9CfWH25PIp0/hq720.jpg?sqp=-oaymwEcCOgCEMoBSFXyq4qpAw4IARUAAIhCGAFwAcABBg==&rs=AOn4CLAzSiDw_wEiKQ0OPS3x58MXptJ3Kw', 'collected_at': datetime.datetime(2021, 4, 18, 23, 23, 54, 971927)}
{'video_url': '/watch?v=rS4XEa-hgts', 'user_url': '/channel/UCW3j8gqJ_mAfbbTtRHgFLdA', 'username': 'UC Davis Energy', 'title': 'The Past, Present and Future of Energy Justice', 'view_num': '132', 'created_at': '5 months', 'shortdesc': '', 'img_url': 'https://i.ytimg.com/vi/rS4XEa-hgts/hq720.jpg?sqp=-oaymwEcCOgCEMoBSFXyq4qpAw4IARUAAIhCGAFwAcABBg==&rs=AOn4CLCjYqlAy2S-6WiocCbzGGZrtcGA9w', 'collected_at': datetime.datetime(2021, 4, 18, 23, 23, 54, 975918)}
{'video_url': '/watch?v=iO4D7jCF6Xg', 'user_url': '/user/GundInstitute', 'username': 'Gund Institut

In [17]:
# Search tweets of a specific topic using a web crawler

from selenium import webdriver
from bs4 import BeautifulSoup
import time, datetime, json


# The csv and os modules are imported here because this cell did not import them before.
import csv
import os

# url = "https://twitter.com/search?l=&q=near%3A%22houston%22%20within%3A15mi%20since%3A2017-08-24%20until%3A2017-08-31&src=typd&lang=en"  # crawls all the tweets posted near Houston while Hurricane Harvey hit.
url = "https://twitter.com/search?f=tweets&vertical=news&q=seattle&src=typd&lang=en"

# The file the tweets are appended to, and the column names of each record.
output_path = "tweets.csv"
header = ['user_id', 'user_name', 'screen_name', 'status_id', 'created_at', 'time_integer',
          'reply_num', 'retweet_num', 'favorite_num', 'content']

# Stop scrolling after this many seconds.
time_limit = 60

# The text of every tweet already written, used to skip duplicates.
texts = []

# use a chrome core. https://chromedriver.chromium.org/downloads
# NOTE(review): "executable_path" is the Selenium 3 way of locating the driver - confirm the pinned Selenium version.
bot = webdriver.Chrome(executable_path="chromedriver/chromedriver") # if you are a mac user, please use "assets/chromedriver"

try:
    bot.get(url)

    # The file is opened in append mode, so the header is only written when the file is new or empty;
    # otherwise every run would insert another header line in the middle of the data.
    needs_header = not os.path.exists(output_path) or os.path.getsize(output_path) == 0

    with open(output_path, "a", newline='', encoding="utf-8") as f:
        # csv.writer quotes fields when needed, so commas or quotes inside a tweet cannot break the columns.
        writer = csv.writer(f)
        if needs_header:
            writer.writerow(header)

        start = datetime.datetime.now()

        # Read the XPath tutorial if you are not familiar with XPath.
        # The "//" operator selects matching nodes no matter where they are in the document.
        # The loop keeps scrolling until the "Back to top" element (ending with an upwards arrow, \u2191) shows up.
        # "xpath" is the value of selenium's By.XPATH constant.
        while len(bot.find_elements("xpath", '//div[contains(text(), "Back to top \u2191")]')) != 1:
            time.sleep(5)
            bot.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            soup = BeautifulSoup(bot.page_source, 'html.parser')
            tweets = soup.find_all('li', class_="stream-item")[-20:] # only process the newly-acquired tweets.

            # if the crawl has run longer than the time limit, then stop scrolling.
            if (datetime.datetime.now() - start).total_seconds() >= time_limit:
                break

            for tweet in tweets:
                # Only parsing failures are tolerated: missing elements (AttributeError / TypeError), missing
                # attributes (KeyError), an empty user list (IndexError) or malformed json / numbers (ValueError).
                try:
                    user_json = json.loads(tweet.div.attrs["data-reply-to-users-json"])
                    user_id = int(user_json[0]['id_str'])
                    user_name = user_json[0]['name']
                    screen_name = user_json[0]['screen_name']
                    status_id = int(tweet.attrs["data-item-id"])
                    text = tweet.find("p").text.strip().replace("\n", "")
                    timestamp_link = tweet.find("small", class_="time").a
                    created_at = timestamp_link.attrs["title"]
                    time_integer = timestamp_link.span["data-time-ms"]
                    reply_num = tweet.find("div", class_="ProfileTweet-action--reply").find("span", class_="ProfileTweet-actionCountForPresentation").text
                    retweet_num = tweet.find("div", class_="ProfileTweet-action--retweet").find("span", class_="ProfileTweet-actionCountForPresentation").text
                    favorite_num = tweet.find("div", class_="ProfileTweet-action--favorite").find("span", class_="ProfileTweet-actionCountForPresentation").text
                except (AttributeError, KeyError, IndexError, TypeError, ValueError) as err:
                    print("skipped a tweet that could not be parsed: %r" % (err,))
                    continue

                record = [user_id, user_name, screen_name, status_id, created_at, time_integer,
                          reply_num, retweet_num, favorite_num, text]
                print(record)

                # only write tweets whose text has not been written before.
                if text not in texts:
                    writer.writerow(record)
                    texts.append(text)
finally:
    # quit() closes the browser window and also shuts down the chromedriver process, even when the crawl failed.
    bot.quit()

print("finished")

finished


In [44]:
# Search existing tweets
# @reference: https://www.earthdatascience.org/courses/use-data-open-source-python/intro-to-apis/twitter-data-in-python/


import tweepy, json, time, csv

# The os module is imported here because this cell did not import it before.
import os

# All the fields of each data entry that I want to collect.
fieldnames = ['username', 'userid', 'profile_location', 'created_at', 'text', 'retweet_count', 'source', 'coordinates']

# Apply for your own Twitter API keys at https://developer.twitter.com/en/apply-for-access
# The keys are secrets: never paste them into a notebook or commit them to a repository. They are read from
# environment variables instead, and the check runs before the output file is created so that a missing key
# does not wipe an existing csv file.
credential_vars = ("TWITTER_CONSUMER_KEY", "TWITTER_CONSUMER_SECRET",
                   "TWITTER_ACCESS_TOKEN", "TWITTER_ACCESS_TOKEN_SECRET")
missing_vars = [name for name in credential_vars if not os.environ.get(name)]
if missing_vars:
    raise RuntimeError("Please set the Twitter API credentials in the environment: " + ", ".join(missing_vars))

consumer_key = os.environ["TWITTER_CONSUMER_KEY"]
consumer_secret = os.environ["TWITTER_CONSUMER_SECRET"]
access_token = os.environ["TWITTER_ACCESS_TOKEN"]
access_token_secret = os.environ["TWITTER_ACCESS_TOKEN_SECRET"]


auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth, wait_on_rate_limit=True)

# Define the search term and the date_since date as variables
search_words = "#energy"
location = "47.6138893,-122.3107869,3000mi"
# Read the Twitter API document to look for other ways to customize your queries.
# Refer to https://developer.twitter.com/en/docs/twitter-api/v1/rules-and-filtering/search-operators
# For example: you can ignore all the retweets with "#wildfires -filter:retweets".
# Geolocalization: the search operator "near" isn't available in the API, but there is a more precise way to restrict
# your query by a given location using the geocode parameter specified with the template "latitude,longitude,radius",
# for example, "47.6138893,-122.3107869,10mi" (Capitol Hill in Seattle). When conducting geo searches, the search API
# will first attempt to find Tweets which have lat/long within the queried geocode, and in case of not having success,
# it will attempt to find Tweets created by users whose profile location can be reverse geocoded into a lat/long within
# the queried geocode, meaning that it is possible to receive Tweets which do not include lat/long information.

date_since = "2018-1-1"


# Collect tweets
# tweets = tweepy.Cursor(api.search, q=search_words, lang="en", since=date_since).items(100)
tweets = tweepy.Cursor(api.search, q=search_words, geocode=location, lang="en", since=date_since).items(100)

# Create a csv file to store the structured data after processing. The "with" statement guarantees that the file
# is flushed and closed even if the download fails half way.
with open("assets/tweets.csv", "w", newline='', encoding="utf-8") as csvfile:

    # Create a writer to write the structured data to the csv file, and write the header first.
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()

    # Iterate, store and print tweets
    for tweet in tweets:
        row = {
            'username': tweet.author.name,
            'userid': tweet.author.id,
            'profile_location': tweet.author.location,
            # NOTE(review): this is the creation time of the author's ACCOUNT, not of the tweet;
            # use tweet.created_at if the time the tweet was posted is what you want - confirm.
            'created_at': str(tweet.author.created_at),
            'text': tweet.text,
            'retweet_count': tweet.retweet_count,
            'source': tweet.source,
            'coordinates': tweet.coordinates
        }
        writer.writerow(row)
        print(row)

# notify the completion of the program in the console.
print("finished")

{'username': 'Caligirl üïä', 'userid': 189806241, 'profile_location': 'Dying Earth', 'created_at': '2010-09-12 08:06:07', 'text': 'RT @elleprovocateur: Initiatives such as https://t.co/DjNoghFw8m, coupled w/ branded terms  \n#NaturalClimateSolutions &amp; #naturebasedsolutio‚Ä¶', 'retweet_count': 12, 'source': 'Twitter for iPhone', 'coordinates': None}
{'username': 'Randy Chatterjee', 'userid': 966645925, 'profile_location': 'Salish Territory, Cascadia', 'created_at': '2012-11-23 18:51:38', 'text': 'RT @A_Better_City: #green #energy #resilience PARTICIPATE and LEARN! https://t.co/H42xm7HZDn', 'retweet_count': 1, 'source': 'Twitter for Mac', 'coordinates': None}
{'username': 'A Better City', 'userid': 2472716287, 'profile_location': 'Vancouver, BC, Canada', 'created_at': '2014-05-01 15:59:54', 'text': '#green #energy #resilience PARTICIPATE and LEARN! https://t.co/H42xm7HZDn', 'retweet_count': 1, 'source': 'Twitter for Mac', 'coordinates': None}
{'username': 'Association of Electrical 

In [42]:
# Search geo-tagged tweets within the U.S. This script is modified from https://github.com/shawn-terryah/Twitter_Geolocation


import tweepy, json, time, csv

class StreamListener(tweepy.StreamListener):
    """Listener that stores streamed, geo-tagged tweets in a csv file.

    tweepy.StreamListener is a class provided by tweepy used to access
    the Twitter Streaming API to collect tweets in real-time. This subclass
    writes one csv row per tweet until a time limit, counted from the moment
    the listener is created, has passed.
    """

    def __init__(self, time_limit=30, file=""):
        """Open the output csv file, write its header and start the clock.

        Args:
            time_limit: number of seconds during which tweets are recorded.
            file: path of the csv file to write; it is opened in "w" mode,
                so an existing file is overwritten.
        """
        self.start_time = time.time()
        self.limit = time_limit
        self.f = open(file, "w", newline='', encoding="utf-8") # mode a, r, w
        fieldnames = ['id', 'username', 'created_at', 'lng', 'lat', 'text']
        self.writer = csv.DictWriter(self.f, fieldnames=fieldnames)
        self.writer.writeheader()
        super().__init__()

    def on_data(self, data):
        """Handle one raw message of the stream.

        Args:
            data: the message as a json string.

        Returns:
            False once the time limit has passed (the csv file is closed
            first); True otherwise.
        """
        if (time.time() - self.start_time) >= self.limit:
            self.f.close()
            print("finish.")
            return False

        datajson = json.loads(data)
        print(datajson)

        # Messages that lack the tweet fields read below, or tweets without
        # any geo-tag, are skipped instead of raising out of the callback.
        # NOTE(review): presumably the stream also delivers non-tweet
        # messages (e.g. notices) - confirm against the tweepy documentation.
        try:
            lng, lat = self._extract_lng_lat(datajson)
            row = {
                'id': datajson['id'],
                'username': datajson['user']['screen_name'],
                'created_at': datajson['created_at'],
                'lng': lng,
                'lat': lat,
                'text': datajson['text'].strip().replace("\n", "")
            }
        except (KeyError, IndexError, TypeError) as err:
            print("skipped a message without the expected tweet fields: %r" % (err,))
            return True

        print(row)
        self.writer.writerow(row)
        return True

    @staticmethod
    def _extract_lng_lat(datajson):
        """Return the (lng, lat) geo-tag of a decoded tweet.

        The exact point in 'coordinates' is used when present; otherwise
        the center of the bounding box of 'place' is computed from its
        first, second and third corners.
        """
        if datajson.get('coordinates') is None:
            bbox = datajson['place']['bounding_box']['coordinates'][0]
            lng = (bbox[0][0] + bbox[2][0]) / 2.0
            lat = (bbox[0][1] + bbox[1][1]) / 2.0
            return lng, lat

        point = datajson['coordinates']['coordinates']
        return point[0], point[1]


if __name__ == "__main__":
    # The os module is imported here because this cell did not import it before.
    import os

    # The csv file where the geo-tagged tweets will be stored.
    output_file = "assets/geotags.csv"

    # The keys and access tokens are provided to you after you create a Twitter developer account and
    # register a Twitter App. Apply for your own keys at https://developer.twitter.com/en/apply-for-access
    # The keys are secrets: never paste them into a notebook or commit them to a repository. They are read
    # from environment variables instead.
    credential_vars = ("TWITTER_CONSUMER_KEY", "TWITTER_CONSUMER_SECRET",
                       "TWITTER_ACCESS_TOKEN", "TWITTER_ACCESS_TOKEN_SECRET")
    missing_vars = [name for name in credential_vars if not os.environ.get(name)]
    if missing_vars:
        raise RuntimeError("Please set the Twitter API credentials in the environment: " + ", ".join(missing_vars))

    consumer_key = os.environ["TWITTER_CONSUMER_KEY"]
    consumer_secret = os.environ["TWITTER_CONSUMER_SECRET"]
    access_token = os.environ["TWITTER_ACCESS_TOKEN"]
    access_token_secret = os.environ["TWITTER_ACCESS_TOKEN_SECRET"]


    myauth = tweepy.OAuthHandler(consumer_key, consumer_secret)
    myauth.set_access_token(access_token, access_token_secret)

    # LOCATIONS are the longitude, latitude coordinate corners for a box that restricts the
    # geographic area from which you will stream tweets. The first two define the southwest
    # corner of the box and the second two define the northeast corner of the box.
    LOCATIONS = [-124.7771694, 24.520833, -66.947028, 49.384472,  # Contiguous US
                 -164.639405, 58.806859, -144.152365, 71.76871,  # Alaska
                 -160.161542, 18.776344, -154.641396, 22.878623]  # Hawaii

    # The listener records tweets for 60 seconds, counted from its creation.
    stream_listener = StreamListener(time_limit=60, file=output_file)
    stream = tweepy.Stream(auth=myauth, listener=stream_listener)
    stream.filter(locations=LOCATIONS)
    # NOTE(review): the call above is presumably blocking until the listener's time limit has passed, so
    # this second filter would start with an expired listener and collect nothing - confirm whether the
    # keyword filter is still wanted, and if so give it its own listener.
    stream.filter(track=['energy justice'], is_async=True)

{'created_at': 'Mon Apr 19 06:15:44 +0000 2021', 'id': 1384027919626821638, 'id_str': '1384027919626821638', 'text': 'Today I breathed a sigh of relief as I received my first vaccine. Feeling grateful. https://t.co/NzcLbStuJ0', 'display_text_range': [0, 83], 'source': '<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>', 'truncated': False, 'in_reply_to_status_id': None, 'in_reply_to_status_id_str': None, 'in_reply_to_user_id': None, 'in_reply_to_user_id_str': None, 'in_reply_to_screen_name': None, 'user': {'id': 1299012438998712321, 'id_str': '1299012438998712321', 'name': 'Cst. Nicole Braithwaite', 'screen_name': 'cst_braithwaite', 'location': 'West Vancouver', 'url': 'http://www.wvpd.ca', 'description': 'Community Service/School Liaison Officer with @WestVanPolice. Monitored but not 24-7. Emergency Call 9-1-1. WVPD Non-Emergency 604-925-7300.', 'translator_type': 'none', 'protected': False, 'verified': False, 'followers_count': 701, 'friends_count': 4

In [19]:
time.time()

1618800598.4206707