# Reading and Writing Files

Practice reading and writing CSV and JSON files using a variety of tools

## Writing and Reading Files in Python

### Using `csv` Library

In [3]:
import csv 

# open file in writing mode 
output = open("../data/myCSV.csv", mode="w")

# create a csv writer 
writer = csv.writer(output)

# add a header
header = ["name", "age"]
writer.writerow(header)

# write data to a file 
data = ["john smith", 40]
writer.writerow(data)
output.close()

In [6]:
from faker import Faker 

# open another output and writer 
output = open("../data/faker_csv.csv", mode="w")
writer = csv.writer(output)

# instantate faker 
faker = Faker()

# set headers 
headers = ["name", "age", "street", "city", "state", "zip", "lng", "lat"]
writer.writerow(headers)

# write fake data 
for i in range(1000):
    writer.writerow([
        faker.name(),
        faker.random_int(min=18, max=80, step=1),
        faker.street_address(),
        faker.city(),
        faker.state(),
        faker.zipcode(),
        faker.longitude(),
        faker.latitude()
    ])
output.close()

In [8]:
#
with open("../data/faker_csv.csv") as f:
    reader = csv.DictReader(f)
    headers=next(reader)
    for row in reader:
        print(row["name"])

Christian Cohen
Erin Thomas MD
Daniel Hunt
Derek Sheppard
Stephanie Schwartz
Tanya Price
David Wagner
David Pittman
Albert Escobar
Melissa Stewart
Lori Martinez
Christopher Brewer
Laura Bonilla
Lawrence Koch
Erica Perry
Tracy Newman
Kristen Martin
Troy Long
Kimberly Herman
Ashley Ray
Edward Martinez
Terry Johnson
Linda Sullivan
Denise Schultz
Emily Salinas
Shannon Beasley
Jacqueline Chavez MD
Sue Hill
Christopher Gonzalez
Peggy Thompson
Jennifer Franklin
Nicholas Simpson
Matthew Hess
Marco Boone
Lisa Stephens
Ashlee Mendoza
Joseph Brown
Mark Landry
Kristine Smith
Ryan Barber
Tina Tran
Amanda Martinez
Michael Wright
Juan Scott
Tyler Norris
Alec Espinoza
Casey Mitchell
Deanna Bailey
Tyler Roberts
Courtney Randolph
Benjamin Tucker
Janet Jones
Bryan Espinoza
Sheila Perez
Brenda Edwards
Karen Moss
Tyler Smith
Joshua Hall
Erin Decker
Victoria Allison
Valerie Jones
Shaun Graham
Tony Clark
Jacob Blackburn
John Frederick
Angela Ward
Tommy Ward
Ryan Cobb
Ryan Rodriguez
Jennifer Newman
Vincent Da

### Using `pandas` Library 

In [11]:
import pandas as pd 

# read in the csv
df = pd.read_csv("../data/faker_csv.csv")

# preview 
df.head()

Unnamed: 0,name,age,street,city,state,zip,lng,lat
0,Lori Turner,51,65066 Jimenez Parkway Suite 990,Williamsview,New Mexico,2968,-100.256234,81.14439
1,Christian Cohen,66,01750 Raymond Path Apt. 822,East Zacharychester,Michigan,87523,-137.753161,33.308799
2,Erin Thomas MD,32,392 Angelica Divide,Dennisville,Indiana,51282,-121.900245,-12.621281
3,Daniel Hunt,64,07717 Hart Loaf Apt. 370,Fletcherville,Kansas,84105,-30.142305,47.548051
4,Derek Sheppard,77,52240 Christopher Street Suite 413,Samanthaberg,Montana,90767,17.930999,6.222984


In [21]:
import json 

# instantiate an output and faker
output = open("../data/json_data.json", "w")
faker = Faker()

# instantiate a dictionary and value list 
data = {}
data["data"] = []

# fill the dictionary using faker fields
for x in range(100):
    data["data"].append(
        {
            "id": x, 
            "name": faker.name(), 
            "address": faker.address()
        }
    )

# write the json 
json.dump(data, output, indent=4)

In [23]:
with open("../data/json_data.json", "r") as f:
    data = json.load(f)

df_from_json = pd.json_normalize(data, record_path="data")

## Building Data Pipelines in Apache Airflow

Write sample Python DAG file (make sure to copy into the appropriate DAGs folder on your machine)

In [None]:
import datetime as dt 
from datetime import timedelta
from airflow import DAG
from airflow.operators.bash import BashOperator 
from airflow.operators.python import PythonOperator
import pandas as pd 

# pipeline hlpers
def csv_to_json(input_filepath, output_filepath):
    df = pd.read_csv(input_filepath)
    for i, r in df.iterrows():
        print(r['name'])
    df.to_json(output_filepath, orient='records')


# set default args for
default_args = {
    "owner": "srmarshall", 
    "start_date": dt.datetime(2024, 7, 11), 
    "retries": 1, 
    "retry_delay": timedelta(minutes=2),
}

# create dag 
with DAG(
    "csv_to_json_dag", 
    default_args=default_args,
    schedule_interval=timedelta(minutes=5),
) as dag:
    # create bash operator to confirm DAG is running
    print_starting = BashOperator(
        task_id="starting", 
        bash_command="echo 'Reading CSV...'"
    )

    # use python operator to call function
    CSVtoJSON = PythonOperator(
        task_id="convertCSVtoJSON",
        python_callable=csv_to_json,
        op_kwargs={
            "input_filepath": "/Users/srmarshall/Desktop/code/personal/de-with-python/data/faker_csv.csv",
            "output_filepath": "/Users/srmarshall/Desktop/code/personal/de-with-python/data/faker_json_from_airflow.json",
        }
    )

    # connect tasks by specifying up and downstream tasks
    print_starting >> CSVtoJSON