# pandas - Data Loading, Storage, and File Formats

## Reading and Writing Data in Text Format

In [1]:
import pandas as pd
from pandas import Series, DataFrame

In [2]:
# Print the raw text of ex1.csv so the parsed result can be compared against it.
!cat ../input/pandas-data-loading-storage-file-formats/ex1.csv

In [3]:
# Parse the comma-separated file into a DataFrame and display it.
df = pd.read_csv(
    '../input/pandas-data-loading-storage-file-formats/ex1.csv'
)
df

In [4]:
# Same file through read_table, with the comma delimiter passed explicitly.
pd.read_table(
    '../input/pandas-data-loading-storage-file-formats/ex1.csv',
    sep=',',
)

In [5]:
# header=None: no line of the file is used as the header row, so pandas
# assigns default column names.
# NOTE(review): an earlier cell reads this same ex1.csv with its first line as
# the header -- confirm whether ex2.csv was the intended file here.
pd.read_csv(
    '../input/pandas-data-loading-storage-file-formats/ex1.csv',
    header=None,
)

In [6]:
# Supply the column names explicitly instead of reading them from the file.
pd.read_csv(
    '../input/pandas-data-loading-storage-file-formats/ex2.csv',
    names=['a', 'b', 'c', 'd', 'message'],
)

In [7]:
# Name the columns, then use the 'message' column as the row index.
names = ['a', 'b', 'c', 'd', 'message']
pd.read_csv(
    '../input/pandas-data-loading-storage-file-formats/ex2.csv',
    names=names,
    index_col='message',
)

In [8]:
# Print the raw text of csv_mindex.csv.
!cat ../input/pandas-data-loading-storage-file-formats/csv_mindex.csv

In [9]:
# Passing a list of columns to index_col builds the row index from both
# 'key1' and 'key2'.
parsed = pd.read_csv(
    '../input/pandas-data-loading-storage-file-formats/csv_mindex.csv',
    index_col=['key1', 'key2'],
)
parsed

In [10]:
# Show the raw lines of ex3.txt (newlines included).
# The with-block closes the file handle; a bare list(open(...)) leaves it open
# until the garbage collector happens to reclaim it.
with open('../input/pandas-data-loading-storage-file-formats/ex3.txt') as ex3_file:
    ex3_lines = list(ex3_file)
ex3_lines

In [11]:
# The separator may be a regular expression; here fields are split on runs of
# whitespace. A raw string keeps the backslash literal -- '\s' in a plain
# string is an invalid escape sequence that newer Python versions warn about.
result = pd.read_table(
    '../input/pandas-data-loading-storage-file-formats/ex3.txt',
    sep=r'\s+',
)
result

In [12]:
# Print the raw text of ex4.csv.
!cat ../input/pandas-data-loading-storage-file-formats/ex4.csv

In [13]:
# Skip the file lines at 0-based positions 0, 2 and 3 before parsing.
pd.read_csv(
    '../input/pandas-data-loading-storage-file-formats/ex4.csv',
    skiprows=[0, 2, 3],
)

In [14]:
# Print the raw text of ex5.csv.
!cat ../input/pandas-data-loading-storage-file-formats/ex5.csv

In [15]:
# Load ex5.csv with the default missing-value handling and display it.
result = pd.read_csv(
    '../input/pandas-data-loading-storage-file-formats/ex5.csv'
)
result

In [16]:
# Boolean mask marking which cells of `result` are missing.
result.isnull()

In [17]:
# na_values lists extra strings to treat as missing: here the literal text
# 'NULL' is parsed as NaN.
result = pd.read_csv(
    '../input/pandas-data-loading-storage-file-formats/ex5.csv',
    na_values=['NULL'],
)
result

In [18]:
# Per-column missing-value markers: each key is a column name and its list
# holds the strings to treat as NA in that column.
sentinels = {
    'message': ['foo', 'NA'],
    'something': ['two'],
}
pd.read_csv(
    '../input/pandas-data-loading-storage-file-formats/ex5.csv',
    na_values=sentinels,
)

### Reading Text Files in Pieces

When processing very large files or figuring out the right set of arguments to correctly process a large file, you may only want to read in a small piece of a file or iterate through smaller chunks of the file.

In [19]:
# Cap the number of rows pandas prints so large frames display compactly.
pd.set_option('display.max_rows', 10)

In [20]:
# Load the whole of ex6.csv; the display is truncated by the max_rows option.
result = pd.read_csv(
    '../input/pandas-data-loading-storage-file-formats/ex6.csv'
)
result

In [21]:
# nrows limits parsing to the first 5 data rows of the file.
pd.read_csv(
    '../input/pandas-data-loading-storage-file-formats/ex6.csv',
    nrows=5,
)

In [22]:
# With chunksize set, read_csv returns an iterable reader instead of a
# DataFrame; iterating over it yields the file in pieces of up to 1000 rows.
chunker = pd.read_csv(
    '../input/pandas-data-loading-storage-file-formats/ex6.csv',
    chunksize=1000,
)
chunker

In [23]:
# Re-create the chunk reader in this cell: it can only be consumed once, so
# relying on a `chunker` built in another cell leaves `tot` empty whenever
# this cell is re-run on its own.
chunker = pd.read_csv(
    '../input/pandas-data-loading-storage-file-formats/ex6.csv',
    chunksize=1000,
)

# Explicit dtype: an empty Series created without one triggers a pandas
# warning and its default dtype differs between pandas versions.
tot = Series([], dtype='float64')
for piece in chunker:
    # Add this chunk's counts of the 'key' column to the running total;
    # fill_value=0 treats a key missing on either side as zero.
    tot = tot.add(piece['key'].value_counts(), fill_value=0)

# Most frequent keys first.
tot = tot.sort_values(ascending=False)

In [24]:
# Show the first ten entries of the sorted totals.
tot[:10]

### Writing Data to Text Format

In [25]:
# Round trip: load ex5.csv, then write it back out as comma-separated text.
data = pd.read_csv(
    '../input/pandas-data-loading-storage-file-formats/ex5.csv'
)
data.to_csv(path_or_buf='out.csv')

In [26]:
import sys

# Writing to sys.stdout prints the CSV text in the cell output instead of
# creating a file; '|' replaces the default comma delimiter.
data.to_csv(
    sys.stdout,
    sep="|",
)

In [27]:
# na_rep sets the text written in place of missing values.
data.to_csv(
    sys.stdout,
    na_rep='NULL',
)

In [28]:
# Write the values only: neither the row index nor the column header.
data.to_csv(
    sys.stdout,
    index=False,
    header=False,
)

In [29]:
import numpy as np

# Build a Series of the integers 0..6 indexed by 7 consecutive dates starting
# at 1/1/2000, then write it to tseries.csv.
date_range = 7
dates = pd.date_range(start='1/1/2000', periods=date_range)
ts = Series(data=np.arange(date_range), index=dates)
ts.to_csv('tseries.csv')

# Print the file that was just written.
!cat tseries.csv

### Working with Delimited Formats

In [30]:
# Print the raw text of ex7.csv.
!cat ../input/pandas-data-loading-storage-file-formats/ex7.csv

In [31]:
import csv

# Print each parsed row of ex7.csv.
# The with-block guarantees the file is closed (the handle was previously
# left open), and newline='' is how the csv module documentation says files
# passed to csv.reader should be opened.
with open('../input/pandas-data-loading-storage-file-formats/ex7.csv', newline='') as f:
    reader = csv.reader(f)
    for line in reader:
        print(line)

In [32]:
# Read every parsed row of ex5.csv into a list.
# newline='' lets the csv module handle line endings itself, as its
# documentation recommends for files passed to csv.reader.
with open('../input/pandas-data-loading-storage-file-formats/ex5.csv', newline='') as f:
    lines = list(csv.reader(f))

In [33]:
# The first parsed row holds the column names; the remaining rows are data.
header = lines[0]
values = lines[1:]

In [34]:
# zip(*values) transposes the rows into columns; pairing each column with its
# header name gives a {column name: tuple of values} mapping.
data_dict = dict(zip(header, zip(*values)))
data_dict

### JSON Data

In [37]:
# A JSON document held in a Python string: nested objects, arrays and a null.
obj = """
    {"name": "Wes",
     "places_lived": ["United States", "Spain", "Germany"],
     "pet": null,
     "siblings": [{"name": "Scott", "age": 30, "pets": ["Zeus", "Zuko"]},
                  {"name": "Katie", "age": 38,
                   "pets": ["Sixes", "Stache", "Cisco"]}]
} """

In [38]:
import json
# Parse the JSON text in `obj` into Python objects.
result = json.loads(obj)
result

In [39]:
asjson = json.dumps(result) # serialize the Python object back to a JSON string

In [40]:
# Build a DataFrame from the list of sibling dicts, keeping only the 'name'
# and 'age' fields as columns.
siblings = DataFrame(
    data=result['siblings'],
    columns=['name', 'age'],
)
siblings

In [41]:
# Print the raw text of example.json.
!cat ../input/pandas-data-loading-storage-file-formats/example.json

In [43]:
# Load the JSON file into a pandas object and display it.
data = pd.read_json(
    '../input/pandas-data-loading-storage-file-formats/example.json'
)
data

In [44]:
print(data.to_json()) # serialize to a JSON string using the default orientation

In [45]:
# Same data serialized with the 'records' orientation.
print(data.to_json(orient='records'))

### Reading Microsoft Excel Files

In [47]:
!pip install openpyxl

In [49]:
# Open the workbook once as an ExcelFile object, then parse one sheet from it.
xlsx = pd.ExcelFile(
    '../input/pandas-data-loading-storage-file-formats/ex1.xlsx'
)
pd.read_excel(xlsx, sheet_name='Sheet1')

In [51]:
# Shorter form: pass the file path straight to read_excel.
frame = pd.read_excel(
    '../input/pandas-data-loading-storage-file-formats/ex1.xlsx',
    sheet_name='Sheet1',
)
frame

In [52]:
# Write `frame` to sheet 'Sheet1' of ex2.xlsx.
# The context manager saves and closes the workbook on exit;
# ExcelWriter.save() is deprecated and no longer exists in pandas 2.0.
# sheet_name is passed by keyword because positional use is deprecated.
with pd.ExcelWriter('ex2.xlsx') as writer:
    frame.to_excel(writer, sheet_name='Sheet1')

In [53]:
# Shorter form: pass the output path directly instead of an ExcelWriter.
frame.to_excel(excel_writer='ex2.xlsx')

## Interacting with Web APIs

Many websites have public APIs providing data feeds via JSON or some other format. There are a number of ways to access these APIs from Python; one easy-to-use method is using the `requests` package.

In [54]:
import requests

# Fetch the issue list for the pandas repository from the GitHub API.
url = 'https://api.github.com/repos/pandas-dev/pandas/issues'
# Without a timeout, requests can wait indefinitely on an unresponsive server.
resp = requests.get(url, timeout=30)
# Fail here on an HTTP error status (e.g. rate limiting) rather than later,
# when an error payload would be parsed as if it were the issue list.
resp.raise_for_status()
resp

In [55]:
data = resp.json() # decode the JSON response body into Python objects

In [58]:
# Value of the 'title' field in the first element of the response.
data[0]['title']

In [59]:
# Keep only the selected fields of each response element as columns.
issues = DataFrame(
    data,
    columns=['number', 'title', 'labels', 'state'],
)
issues

## Interacting with Databases

In [60]:
import sqlite3

# DDL for the demo table: two VARCHAR columns, one REAL and one INTEGER.
query = "CREATE TABLE test (a VARCHAR(20), b VARCHAR(20), c REAL, d INTEGER);"

In [61]:
# Open the SQLite database file, creating it if it does not exist yet.
con = sqlite3.connect('mydata.sqlite')
# The file persists between runs, so drop any table left by a previous run:
# otherwise CREATE TABLE raises "table test already exists" and later inserts
# would pile up duplicate rows.
con.execute('DROP TABLE IF EXISTS test')
con.execute(query) # create table
con.commit()

In [64]:
# Rows to insert, one tuple per row in column order (a, b, c, d).
data = [
    ('Atlanta', 'Georgia', 1.25, 6),
    ('Tallahassee', 'Florida', 2.6, 3),
    ('Sacramento', 'California', 1.7, 5),
]
# Parameterized insert: each ? is bound to one tuple element.
stmt = "INSERT INTO test VALUES(?, ?, ?, ?)"

In [65]:
# Run the insert statement once per tuple in `data`, then commit.
con.executemany(stmt, data)
con.commit()

In [68]:
# Query the table and pull every result row into a list.
cursor = con.execute('select * from test')
rows = cursor.fetchall()
rows

In [69]:
# Column metadata for the last query; the first item of each entry is the column name.
cursor.description

In [70]:
# Build a DataFrame from the fetched rows, taking the column names from the
# first item of each cursor.description entry.
DataFrame(
    data=rows,
    columns=[col[0] for col in cursor.description],
)

### Using the SQLAlchemy project

In [71]:
import sqlalchemy as sqla

In [72]:
db = sqla.create_engine('sqlite:///mydata.sqlite') # SQLAlchemy engine pointing at the mydata.sqlite file

In [73]:
# Run the query through the SQLAlchemy engine and get the rows back as a DataFrame.
pd.read_sql(
    sql='select * from test',
    con=db,
)