In [1]:
import warnings
warnings.filterwarnings("ignore")
hdfs_directory = 'hdfs://hadoop-vm.internal.cloudapp.net:9000/twitter'

In [2]:
#pip install hdfs

In [3]:
import os
import requests
import tarfile
import bz2
import json
from hdfs import InsecureClient

In [4]:
keywords = ['COVID-19', 'Coronavirus', 'Pandemic', 'Vaccine', 'Vaccination', 'Immunization', 'COVID vaccine', 'Vaccine rollout', 'Vaccine hesitancy', 'Vaccine mandate', 'Booster shot', 'Vaccine passport', 'Vaccination rate', 'Public health', 'WHO', 'CDC']
hashtags = ['#COVID19', '#Coronavirus', '#Pandemic', '#Vaccine', '#Vaccination', '#GetVaccinated', '#COVIDVaccine', '#Immunization', '#VaccineHesitancy', '#VaccineMandate', '#BoosterShot', '#VaccinePassport', '#PublicHealth', '#StaySafe']

In [5]:
def extract_tar_file(file_path, destination_folder):
    with tarfile.open(file_path, 'r') as tar:
        tar.extractall(path=destination_folder)
        
def extract_bz2_file(file_path, destination_folder):
    with bz2.open(file_path, 'rt') as f_in:
        file_name = os.path.basename(file_path).replace('.bz2', '')
        extracted_file_path = os.path.join(destination_folder, file_name)
        
        with open(extracted_file_path, 'w') as f_out:
            f_out.write(f_in.read())
            
    return extracted_file_path

def get_all_files(directory):
    file_list = []
    for root, dirs, files in os.walk(directory):
        for file in files:
            file_list.append(os.path.join(root, file))
    return file_list

In [6]:
# change directory to dowload folder
output_directory = "data"
os.chdir(output_directory)

In [7]:
def download_twitter_data(year, month, day):
    filename = f"twitter_stream_{year}_{str(month).zfill(2)}_{str(day).zfill(2)}.tar"
    if os.path.exists(filename):
        print(f"Downloaded: {filename}")
        return filename
    
    url = f"https://archive.org/download/archiveteam-twitter-stream-{year}-{str(month).zfill(2)}/twitter_stream_{year}_{str(month).zfill(2)}_{str(day).zfill(2)}.tar"
    response = requests.get(url, stream=True)

    if response.status_code == 200:
        with open(filename, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
        print(f"Downloaded: {filename}")
        return filename
    else:
        print(f"Failed to download: {url}")



In [10]:
year = 2020
for month in range(1, 13):
    for day in range(1, 32):
        try:
            file_path = download_twitter_data(year, month, day)
            extracted_data_directory = f"{year}/{str(month).zfill(2)}"
            
            if not os.path.exists(extracted_data_directory):
                os.makedirs(extracted_data_directory)

            if os.path.exists(file_path):
                # Extract .tar file
                extract_tar_file(file_path, extracted_data_directory)

            output_file = f"covid-{year}-{str(month).zfill(2)}.json"
            with open(output_file, 'w') as f_out:
                # Extract .bz2
                for bz2_file_path in get_all_files(extracted_data_directory):
                    if bz2_file_path.endswith('.bz2'):
                        extracted_file_path = extract_bz2_file(bz2_file_path, extracted_data_directory)
                        # Read the file
                        with open(extracted_file_path, 'r') as f_in:
                            for line in f_in:
                                tweet = json.loads(line)
                                try:
                                    # Check if tweet contains a keyword or hashtag
                                    if any(keyword in tweet['text'] for keyword in keywords) or any(hashtag in tweet['text'] for hashtag in hashtags):
                                        json.dump(tweet, f_out)
                                        f_out.write('\n')
                                except:
                                    _
                        # Remove extracted files after uploading
                        os.remove(extracted_file_path)
                        os.remove(bz2_file_path)
                
                # Remove year folder
                os.remove(year)
                
                # Remove tar file
                os.remove(file_path)
        except Exception as e:
            print(f"Error downloading data for {year}-{month}-{day}: {e}")


Downloaded: twitter_stream_2020_01_01.tar
Error downloading data for 2020-1-12: remove: path should be string, bytes or os.PathLike, not int


KeyboardInterrupt: 

In [None]:
year = 2020
for month in range(1, 13):
    for day in range(1, 32):
        try:
            file_path = download_twitter_data(year, month, day)
            extracted_data_directory = f"{year}/{str(month).zfill(2)}"
            
            if not os.path.exists(extracted_data_directory):
                os.makedirs(extracted_data_directory)

            if os.path.exists(file_path):
                # Extract .tar file
                extract_tar_file(file_path, extracted_data_directory)

            output_file = f"covid-{year}-{str(month).zfill(2)}.json"
            with open(output_file, 'w') as f_out:
                # Extract .bz2
                for bz2_file_path in get_all_files(extracted_data_directory):
                    if bz2_file_path.endswith('.bz2'):
                        extracted_file_path = extract_bz2_file(bz2_file_path, extracted_data_directory)
                        # Read the file
                        with open(extracted_file_path, 'r') as f_in:
                            for line in f_in:
                                tweet = json.loads(line)
                                try:
                                    # Check if tweet contains a keyword or hashtag
                                    if any(keyword in tweet['text'] for keyword in keywords) or any(hashtag in tweet['text'] for hashtag in hashtags):
                                        json.dump(tweet, f_out)
                                        f_out.write('\n')
                                except:
                                    _
                        # Remove extracted files after uploading
                        os.remove(extracted_file_path)
                        os.remove(bz2_file_path)
                
                # Remove year folder
                os.remove(year)
                
                # Remove tar file
                os.remove(file_path)
        except Exception as e:
            print(f"Error downloading data for {year}-{month}-{day}: {e}")