# Extracting the Titanic Disaster Data from Kaggle

### Install python-dotenv package so we can use the .env file with our Kaggle credentials

In [1]:
# uncomment and execute the following line if package not already installed
#
#!pip install python-dotenv

### Import the find and load functions from python-dotenv, then use them to find the .env file and load its entries as environment variables

In [2]:
from dotenv import load_dotenv, find_dotenv

# find_dotenv() locates the .env file by walking up parent directories until one is found
dotenv_path = find_dotenv()

# load the .env entries as environment variables; the value displayed below
# the cell is the return value, where True means success
load_dotenv(dotenv_path)

True

### Check Kaggle credentials

Obviously, these are not my actual credentials, and even if they were, I wouldn't want to display them in this notebook. This is just a code snippet that you can use to check that everything is working correctly up to this point.

In [3]:
# load os package
import os

# Read the Kaggle credentials from the environment (os.environ.get returns
# None when a variable is not set) and print them as a sanity check.
# NOTE: this writes the password into the cell output -- clear the output of
# this cell before saving or sharing the notebook with real credentials.
KAGGLE_USERNAME = os.environ.get("KAGGLE_USERNAME")
KAGGLE_PASSWORD = os.environ.get("KAGGLE_PASSWORD")
print('username: ', KAGGLE_USERNAME, '\npassword: ', KAGGLE_PASSWORD)

username:  your_kaggle_username 
password:  your_kaggle_password


# Download the training and test data from Kaggle

### Load requests library

In [4]:
# import useful stuff
import requests
from requests import session

### Download train data and print to screen (DO NOT DO THIS)

You probably don't want to print to the screen unless the data set is very small, WHICH IT IS NOT IN THIS CASE, so execute this block only if you are certain that's what you want to do.

In [5]:
# payload for post
#payload = {
#    'action' : 'login',
#    'username' : os.environ.get("KAGGLE_USERNAME"),
#    'password' : os.environ.get("KAGGLE_PASSWORD"),
#    'rememberme' : 'false'
#}

# Kaggle URLS
#loginURL = 'https://www.kaggle.com/account/login'
#dataURL = 'https://www.kaggle.com/c/titanic/download/train.csv'

#with session() as c:
#    response = c.get(loginURL).text
#    AFToken = response[response.index('antiForgeryToken')+19:response.index('isAnonymous: ')-12]
#    print("AntiForgeryToken={}".format(AFToken))
#    payload['__RequestVerificationToken']=AFToken
#    c.post(loginURL + "?isModal=true&returnUrl=/", data=payload)
#    response = c.get(dataURL)
#    print(response.text)

### Script for downloading data from Kaggle

In [5]:
# Relative location of the download script that the next cell writes:
# one directory up, then src/data/get_raw_data.py
get_raw_data_script_file = os.path.join(os.pardir, 'src', 'data', 'get_raw_data.py')

In the next block, write and save the script for future use.

In [6]:
%%writefile $get_raw_data_script_file
# -*- coding: utf-8 -*-
import os
from dotenv import find_dotenv, load_dotenv
from requests import session
import logging # log intermediate steps that have been completed

# Form fields sent with the login POST.
# The credentials are looked up in os.environ when this statement executes,
# so KAGGLE_USERNAME / KAGGLE_PASSWORD must already be set at that point;
# otherwise the corresponding entries are None.
payload = dict(
    action='login',
    username=os.environ.get("KAGGLE_USERNAME"),
    password=os.environ.get("KAGGLE_PASSWORD"),
    rememberme='false',
)

# download from url and store in file_path
def extract_data(login_url, data_url, file_path):
    """Log in through ``login_url`` and stream ``data_url`` into ``file_path``.

    Args:
        login_url: URL of the login page. It is fetched first to obtain the
            anti-forgery token and then receives the login POST.
        data_url: URL of the file to download with the logged-in session.
        file_path: local path the downloaded bytes are written to (binary mode).

    Raises:
        ValueError: if the token markers are not found in the login page.
        requests.HTTPError: if the download request returns an error status.
    """
    # Work on a copy so the shared module-level payload is never mutated, and
    # re-read the credentials now: the module-level dict is built when the
    # module is loaded, which is before load_dotenv() runs when this file is
    # executed as a script. Fall back to the captured values if the
    # environment variables are absent.
    login_payload = dict(payload)
    login_payload['username'] = os.environ.get("KAGGLE_USERNAME", payload.get('username'))
    login_payload['password'] = os.environ.get("KAGGLE_PASSWORD", payload.get('password'))

    with session() as c:
        login_page = c.get(login_url).text
        # the token is sliced out between the 'antiForgeryToken' and
        # 'isAnonymous: ' markers; the +19 / -12 offsets presumably skip the
        # punctuation around it and depend on the page layout
        token_start = login_page.index('antiForgeryToken') + 19
        token_end = login_page.index('isAnonymous: ') - 12
        login_payload['__RequestVerificationToken'] = login_page[token_start:token_end]
        c.post(login_url + "?isModal=true&returnUrl=/", data=login_payload)

        # open url as a stream and check the status before touching the local
        # file, so an error response is never saved as if it were the data
        response = c.get(data_url, stream=True)
        response.raise_for_status()
        with open(file_path, 'wb') as handle:
            # write the body in chunks of up to 1024 bytes
            for block in response.iter_content(1024):
                if block:  # skip empty keep-alive chunks
                    handle.write(block)
                
# main function, requires the project directory path
def main(project_dir):
    """Download the Titanic train and test CSV files into <project_dir>/data/raw.

    Args:
        project_dir: path to the project root. The raw-data folder is resolved
            against it, so the result does not depend on the directory the
            script is launched from.
    """
    # get logger
    logger = logging.getLogger(__name__)
    logger.info('getting raw data')

    # URLs
    loginURL = 'https://www.kaggle.com/account/login'
    trainURL = 'https://www.kaggle.com/c/titanic/download/train.csv'
    testURL = 'https://www.kaggle.com/c/titanic/download/test.csv'

    # file paths (local), anchored at the project root instead of the
    # current working directory
    raw_data_path = os.path.join(project_dir, 'data', 'raw')
    # make sure the target folder exists before any file is opened for writing
    if not os.path.isdir(raw_data_path):
        os.makedirs(raw_data_path)
    train_data_path = os.path.join(raw_data_path, 'train.csv')
    test_data_path = os.path.join(raw_data_path, 'test.csv')

    # extract data
    extract_data(loginURL, trainURL, train_data_path)
    extract_data(loginURL, testURL, test_data_path)
    logger.info('downloaded raw training and test data')
    
if __name__ == '__main__':
    # configure logging at INFO level with a timestamped message format
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    )

    # locate the .env file by walking up the directory tree, then load its
    # entries as environment variables
    load_dotenv(find_dotenv())

    # this script sits in <project>/src/data, so the project root is two
    # levels above the directory containing this file
    script_dir = os.path.dirname(__file__)
    main(os.path.join(script_dir, os.pardir, os.pardir))

Overwriting ../src/data/get_raw_data.py


Run the script

In [8]:
!python $get_raw_data_script_file

2018-08-08 16:28:22,186 - __main__ - INFO - getting raw data
2018-08-08 16:28:29,797 - __main__ - INFO - downloaded raw training and test data
