## CH 03 - Creating and Persisting DataFrames

In [2]:
import pandas as pd
import numpy as np

# Keep rendered frames compact: show at most 10 columns and 10 rows.
pd.set_option('display.max_columns', 10)
pd.set_option('display.max_rows', 10)

### How to do it\...

In [3]:
# Usually, we create a DataFrame from an existing file or a database, but we can also create
# one from scratch. We can create a DataFrame from parallel lists of data.

In [4]:
# Three parallel lists: position i in each list describes the same person.
fname = [
    'Paul',
    'John',
    'Richard',
    'George',
]
lname = [
    'McCartney',
    'Lennon',
    'Starkey',
    'Harrison',
]
birth = [
    1942,
    1940,
    1940,
    1943,
]

In [5]:
# Build the column-name -> values mapping from the three parallel lists:

people = dict(first=fname, last=lname, birth=birth)

In [6]:
# Hand the mapping to the DataFrame constructor; every key becomes a column:

beatles = pd.DataFrame(data=people)
beatles

Unnamed: 0,first,last,birth
0,Paul,McCartney,1942
1,John,Lennon,1940
2,Richard,Starkey,1940
3,George,Harrison,1943


### How it works\...

In [7]:
# By default, pandas will create a RangeIndex for our DataFrame when we call the constructor:

beatles.index

RangeIndex(start=0, stop=4, step=1)

In [8]:
# The index argument replaces the default RangeIndex with our own row labels:

pd.DataFrame(data=people, index=list('abcd'))

Unnamed: 0,first,last,birth
a,Paul,McCartney,1942
b,John,Lennon,1940
c,Richard,Starkey,1940
d,George,Harrison,1943


### There\'s More

In [9]:
# A DataFrame can also be built row by row, from a list with one dictionary per record:

pd.DataFrame([
    dict(first="Paul", last="McCartney", birth=1942),
    dict(first="John", last="Lennon", birth=1940),
    dict(first="Richard", last="Starkey", birth=1940),
    dict(first="George", last="Harrison", birth=1943),
])

Unnamed: 0,first,last,birth
0,Paul,McCartney,1942
1,John,Lennon,1940
2,Richard,Starkey,1940
3,George,Harrison,1943


### Writing CSV\...

In [14]:
beatles

Unnamed: 0,first,last,birth
0,Paul,McCartney,1942
1,John,Lennon,1940
2,Richard,Starkey,1940
3,George,Harrison,1943


In [15]:
# Write the DataFrame to a CSV file

from io import StringIO
fout = StringIO()
beatles.to_csv(fout)  # use a filename instead of fout

In [16]:
print(fout.getvalue())

,first,last,birth
0,Paul,McCartney,1942
1,John,Lennon,1940
2,Richard,Starkey,1940
3,George,Harrison,1943



### There\'s More

In [17]:
# The .to_csv method has a few options. You will notice that it included the index in the
# output but did not give the index a column name. If you were to read this CSV file into
# a DataFrame using the read_csv function, it would not use this as the index by default.

# Instead, you will get a column named Unnamed: 0 in addition to an index. These columns
# are redundant:

_ = fout.seek(0)
pd.read_csv(fout)

Unnamed: 0.1,Unnamed: 0,first,last,birth
0,0,Paul,McCartney,1942
1,1,John,Lennon,1940
2,2,Richard,Starkey,1940
3,3,George,Harrison,1943


In [None]:
# fout: A StringIO object, i.e. an in-memory text buffer that behaves like an open file (nothing is written to the filesystem).

# seek(0): The seek method is used to change the current file position. In this case, seek(0) 
# is setting the file pointer to the beginning of the file (offset 0).

# _ = : The underscore _ is a convention in Python often used as a throwaway variable name when 
# the value of the variable is not going to be used. It indicates that the result of the seek(0) 
# operation is being ignored or not explicitly used in the code.

# So, the overall effect of this line of code is to move the file pointer to the beginning of the 
# file represented by the fout object. This can be useful, for example, when you want to read the 
# contents of the file again from the start or overwrite its contents.

In [19]:
# The read_csv function has an index_col parameter that you can use to specify the
# location of the index:

_ = fout.seek(0)
pd.read_csv(fout, index_col=0)

Unnamed: 0,first,last,birth
0,Paul,McCartney,1942
1,John,Lennon,1940
2,Richard,Starkey,1940
3,George,Harrison,1943


In [20]:
# Alternatively, if we didn't want to include the index when writing the CSV file, we can set the
# index parameter to False:

fout = StringIO()
beatles.to_csv(fout, index=False) 
print(fout.getvalue())

first,last,birth
Paul,McCartney,1942
John,Lennon,1940
Richard,Starkey,1940
George,Harrison,1943



In [None]:
# The pandas library is an in-memory tool. You need to be able to fit your data in memory to use
# pandas with it. If you come across a large CSV file that you want to process, you have a few
# options. 

# If you can process portions of it at a time, you can read it into chunks and process
# each chunk. Alternatively, if you know that you should have enough memory to load the file,
# there are a few hints to help pare down the file size.

# Note that in general, you should have three to ten times the amount of memory as the size
# of the DataFrame that you want to manipulate. Extra memory should give you enough extra
# space to perform many of the common operations.

In [21]:
# Reading large CSV files
# Determine how much memory the whole file will take up. We will use the nrows
# parameter of read_csv to limit how much data we load to a small sample:

diamonds = pd.read_csv('../data/diamonds.csv', nrows=1000)
diamonds

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.20,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75
...,...,...,...,...,...,...,...,...,...,...
995,0.54,Ideal,D,VVS2,61.4,52.0,2897,5.30,5.34,3.26
996,0.72,Ideal,E,SI1,62.5,55.0,2897,5.69,5.74,3.57
997,0.72,Good,F,VS1,59.4,61.0,2897,5.82,5.89,3.48
998,0.74,Premium,D,VS2,61.8,58.0,2897,5.81,5.77,3.58


In [22]:
# Use the .info method to see how much memory the sample of data uses:

diamonds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   carat    1000 non-null   float64
 1   cut      1000 non-null   object 
 2   color    1000 non-null   object 
 3   clarity  1000 non-null   object 
 4   depth    1000 non-null   float64
 5   table    1000 non-null   float64
 6   price    1000 non-null   int64  
 7   x        1000 non-null   float64
 8   y        1000 non-null   float64
 9   z        1000 non-null   float64
dtypes: float64(6), int64(1), object(3)
memory usage: 78.3+ KB


In [None]:
# We can see that 1,000 rows use about 78.3 KB of memory. If we had 1 billion
# rows, that would take about 78 GB of memory. It turns out that it is possible to rent
# machines in the cloud that have that much memory but let's see if we can take it
# down a little.

In [23]:
# Pass a dtype mapping to read_csv so the numeric columns are parsed straight into
# narrower types (32-bit floats, 16-bit integer price) rather than the 64-bit defaults:

diamonds2 = pd.read_csv(
    '../data/diamonds.csv',
    nrows=1000,
    dtype={
        **dict.fromkeys(['carat', 'depth', 'table', 'x', 'y', 'z'], np.float32),
        'price': np.int16,
    },
)

In [24]:
# By changing the numeric types, we use about 62% of the memory.

diamonds2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   carat    1000 non-null   float32
 1   cut      1000 non-null   object 
 2   color    1000 non-null   object 
 3   clarity  1000 non-null   object 
 4   depth    1000 non-null   float32
 5   table    1000 non-null   float32
 6   price    1000 non-null   int16  
 7   x        1000 non-null   float32
 8   y        1000 non-null   float32
 9   z        1000 non-null   float32
dtypes: float32(6), int16(1), object(3)
memory usage: 49.0+ KB


In [25]:
# Summary statistics of the sample loaded with the default 64-bit dtypes. Compare them
# with diamonds2.describe() in the next cell to see the effect of the smaller numeric types:

diamonds.describe()

Unnamed: 0,carat,depth,table,price,x,y,z
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,0.68928,61.7228,57.7347,2476.54,5.60594,5.59918,3.45753
std,0.195291,1.758879,2.467946,839.57562,0.625173,0.611974,0.389819
min,0.2,53.0,52.0,326.0,3.79,3.75,2.27
25%,0.7,60.9,56.0,2777.0,5.64,5.63,3.45
50%,0.71,61.8,57.0,2818.0,5.77,5.76,3.55
75%,0.79,62.6,59.0,2856.0,5.92,5.91,3.64
max,1.27,69.5,70.0,2898.0,7.12,7.05,4.33


In [26]:
# By changing the numeric types, we use about 62% of the memory. Note that we lose
# some precision, which may or may not be acceptable.

diamonds2.describe()

Unnamed: 0,carat,depth,table,price,x,y,z
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,0.68928,61.722801,57.734699,2476.54,5.60594,5.59918,3.45753
std,0.195291,1.758878,2.467944,839.57562,0.625173,0.611972,0.389819
min,0.2,53.0,52.0,326.0,3.79,3.75,2.27
25%,0.7,60.900002,56.0,2777.0,5.64,5.63,3.45
50%,0.71,61.799999,57.0,2818.0,5.77,5.76,3.55
75%,0.79,62.599998,59.0,2856.0,5.92,5.91,3.64
max,1.27,69.5,70.0,2898.0,7.12,7.05,4.33


In [28]:
# Use the dtype parameter to use change object types to categoricals. First, inspect
# the .value_counts method of the object columns. If they are low cardinality, you
# can convert them to categorical columns to save even more memory:

diamonds2.cut.value_counts()

cut
Ideal        333
Premium      290
Very Good    226
Good          89
Fair          62
Name: count, dtype: int64

In [29]:
diamonds2.color.value_counts()

color
E    240
F    226
G    139
D    129
H    125
I     95
J     46
Name: count, dtype: int64

In [30]:
diamonds2.clarity.value_counts()

clarity
SI1     306
VS2     218
VS1     159
SI2     154
VVS2     62
VVS1     58
I1       29
IF       14
Name: count, dtype: int64

In [32]:
# All three string columns have only a handful of distinct values, so loading them as
# categoricals brings the frame down to around 37% of the original size:

diamonds3 = pd.read_csv(
    '../data/diamonds.csv',
    nrows=1000,
    dtype={
        **dict.fromkeys(['carat', 'depth', 'table', 'x', 'y', 'z'], np.float32),
        'price': np.int16,
        **dict.fromkeys(['cut', 'color', 'clarity'], 'category'),
    },
)

In [33]:
diamonds3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype   
---  ------   --------------  -----   
 0   carat    1000 non-null   float32 
 1   cut      1000 non-null   category
 2   color    1000 non-null   category
 3   clarity  1000 non-null   category
 4   depth    1000 non-null   float32 
 5   table    1000 non-null   float32 
 6   price    1000 non-null   int16   
 7   x        1000 non-null   float32 
 8   y        1000 non-null   float32 
 9   z        1000 non-null   float32 
dtypes: category(3), float32(6), int16(1)
memory usage: 29.4 KB


In [None]:
# If there are columns that we know we can ignore, we can use the usecols
# parameter to specify the columns we want to load. Here, we will ignore columns x, y,
# and z:

In [36]:
# usecols restricts parsing to the listed columns, so x, y and z are never loaded.
cols = ['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'price']
diamonds4 = pd.read_csv(
    '../data/diamonds.csv',
    usecols=cols,
    nrows=1000,
    dtype={
        **dict.fromkeys(['carat', 'depth', 'table'], np.float32),
        'price': np.int16,
        **dict.fromkeys(['cut', 'color', 'clarity'], 'category'),
    },
)

In [37]:
diamonds4.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
 #   Column   Non-Null Count  Dtype   
---  ------   --------------  -----   
 0   carat    1000 non-null   float32 
 1   cut      1000 non-null   category
 2   color    1000 non-null   category
 3   clarity  1000 non-null   category
 4   depth    1000 non-null   float32 
 5   table    1000 non-null   float32 
 6   price    1000 non-null   int16   
dtypes: category(3), float32(3), int16(1)
memory usage: 17.6 KB


In [40]:
# With chunksize set, read_csv hands back an iterator that yields DataFrames of up to
# 200 rows each instead of one single frame.
cols = ['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'price']
diamonds_iter = pd.read_csv(
    '../data/diamonds.csv',
    usecols=cols,
    nrows=1000,
    chunksize=200,
    dtype={
        **dict.fromkeys(['carat', 'depth', 'table'], np.float32),
        'price': np.int16,
        **dict.fromkeys(['cut', 'color', 'clarity'], 'category'),
    },
)

In [41]:
def process(df):
    """Return a short status message reporting the value of ``df.size``.

    Parameters
    ----------
    df : object exposing a ``size`` attribute (a DataFrame chunk in this notebook).

    Returns
    -------
    str
        ``'processed <size> items'``.
    """
    item_count = df.size
    return 'processed {} items'.format(item_count)

In [42]:
# Walk the chunk iterator, handing each chunk to process().
# NOTE(review): the string returned by process() is discarded, so this cell
# displays nothing; it only demonstrates the iteration pattern.
for chunk in diamonds_iter:
    process(chunk)

In [None]:
# If the column turns out to be non-numeric, pandas will convert it to an object column, and
# treat the values as strings. String values in pandas take up a bunch of memory as each value
# is stored as a Python string. If we convert these to categoricals, pandas will use much less
# memory as it only stores the string once, rather than creating new strings (even if they repeat)
# for every row.

### How it works\...

### There\'s more \...

In [None]:
diamonds.price.memory_usage()

In [None]:
diamonds.price.memory_usage(index=False)

In [None]:
diamonds.cut.memory_usage()

In [None]:
diamonds.cut.memory_usage(deep=True)

In [None]:
diamonds4.to_feather('../tmp/d.arr')
diamonds5 = pd.read_feather('../tmp/d.arr')

In [None]:
diamonds4.to_parquet('../tmp/d.pqt')

### How to do it\...

In [None]:
import xlwt

In [None]:
beatles.to_excel('../tmp/beat.xlsx')

In [None]:
# NOTE(review): identical to the preceding cell; running it just rewrites the same file.
beatles.to_excel('../tmp/beat.xlsx')

In [None]:
beat2 = pd.read_excel('../tmp/beat.xlsx')
beat2

In [None]:
beat2 = pd.read_excel('../tmp/beat.xlsx', index_col=0)
beat2

In [None]:
beat2.dtypes

### How it works\...

### There\'s more\...

In [None]:
# How to write files to Excel

from openpyxl import Workbook

In [None]:
# Write two sheets into a single workbook. Using ExcelWriter as a context manager
# guarantees the workbook is saved and the file handle closed when the block exits;
# without it nothing is reliably flushed to disk until .close() is called.
with pd.ExcelWriter('../tmp/beat.xlsx') as xl_writer:
    beatles.to_excel(xl_writer, sheet_name='All')
    # Second sheet: only the rows whose birth year is before 1941.
    beatles[beatles.birth < 1941].to_excel(xl_writer, sheet_name='1940')

### How to do it\...

In [None]:
autos = pd.read_csv('../data/vehicles.csv.zip', low_memory=False)
autos

In [None]:
autos.modifiedOn.dtype

In [None]:
autos.modifiedOn

In [None]:
pd.to_datetime(autos.modifiedOn)  # doctest: +SKIP

In [None]:
# Re-read the archive, asking read_csv to parse modifiedOn as datetimes while loading.
# The path and low_memory=False match the earlier load of this same file.
autos = pd.read_csv('../data/vehicles.csv.zip',
    parse_dates=['modifiedOn'], low_memory=False)  # doctest: +SKIP
autos.modifiedOn

In [None]:
import zipfile

In [None]:
# Path uses ../data, like the other input files read in this notebook.
with zipfile.ZipFile('../data/kaggle-survey-2018.zip') as z:
    # List the archive members so the reader can see which file to pick.
    print('\n'.join(z.namelist()))
    # Open the member through a context manager so its handle is closed after parsing.
    with z.open('multipleChoiceResponses.csv') as member:
        kag = pd.read_csv(member)
    # The first data row is kept apart from the rest
    # (presumably it holds the question text -- confirm against the file).
    kag_questions = kag.iloc[0]
    survey = kag.iloc[1:]

In [None]:
print(survey.head(2).T)

### How it works\...

### There\'s more\...

### How to do it\...

In [None]:
import sqlite3
# NOTE(review): 'data/beat.db' is resolved relative to the working directory, whereas
# other cells read inputs from '../data' -- confirm the intended location.
con = sqlite3.connect('data/beat.db')
# `with con` runs the statements in a transaction: commit on success, rollback on
# error, so no explicit commit() is needed inside the block.
with con:
    cur = con.cursor()
    # IF EXISTS lets the cell run against a fresh database where Band does not exist yet.
    cur.execute("""DROP TABLE IF EXISTS Band""")
    cur.execute("""CREATE TABLE Band(id INTEGER PRIMARY KEY,
        fname TEXT, lname TEXT, birthyear INT)""")
    # Parameterised insert: one placeholder per column, one tuple per row.
    cur.executemany(
        """INSERT INTO Band VALUES(?, ?, ?, ?)""",
        [(0, 'Paul', 'McCartney', 1942),
         (1, 'John', 'Lennon', 1940)])

In [None]:
import sqlalchemy as sa
# Engine for the same SQLite file that the sqlite3 cell writes to (data/beat.db).
# echo=True asks SQLAlchemy to log the SQL statements it emits.
engine = sa.create_engine(
  'sqlite:///data/beat.db', echo=True)
# NOTE(review): this connection is never closed in the visible cells.
sa_connection = engine.connect()

In [None]:
beat = pd.read_sql('Band', sa_connection, index_col='id')
beat

In [None]:
sql = '''SELECT fname, birthyear from Band'''
fnames = pd.read_sql(sql, con)
fnames

### How it works\...

In [None]:
import json
encoded = json.dumps(people)
encoded

In [None]:
json.loads(encoded)

### How to do it\...

In [None]:
# read_json expects a path or file-like object; passing a literal JSON string is
# deprecated in pandas 2.x, so wrap the string in StringIO.
beatles = pd.read_json(StringIO(encoded))
beatles

In [None]:
records = beatles.to_json(orient='records')
records

In [None]:
# Wrap the JSON text in StringIO: literal strings passed to read_json are deprecated in pandas 2.x.
pd.read_json(StringIO(records), orient='records')

In [None]:
split = beatles.to_json(orient='split')
split

In [None]:
# Wrap the JSON text in StringIO: literal strings passed to read_json are deprecated in pandas 2.x.
pd.read_json(StringIO(split), orient='split')

In [None]:
index = beatles.to_json(orient='index')
index

In [None]:
# Wrap the JSON text in StringIO: literal strings passed to read_json are deprecated in pandas 2.x.
pd.read_json(StringIO(index), orient='index')

In [None]:
values = beatles.to_json(orient='values')
values

In [None]:
# Wrap the JSON text in StringIO: literal strings passed to read_json are deprecated in pandas 2.x.
pd.read_json(StringIO(values), orient='values')

In [None]:
# The 'values' orientation carries no column names, so the frame comes back with
# positional labels 0, 1, 2; map them back onto the original names.
# StringIO wrapper: literal strings passed to read_json are deprecated in pandas 2.x.
(pd.read_json(StringIO(values), orient='values')
   .rename(columns=dict(enumerate(['first', 'last', 'birth'])))
)

In [None]:
table = beatles.to_json(orient='table')
table

In [None]:
# Wrap the JSON text in StringIO: literal strings passed to read_json are deprecated in pandas 2.x.
pd.read_json(StringIO(table), orient='table')

### How it works\...

### There\'s more\...

In [None]:
output = beat.to_dict()
output

In [None]:
output['version'] = '0.4.1'
json.dumps(output)

### How to do it\...

In [None]:
url ='https://en.wikipedia.org/wiki/The_Beatles_discography'
dfs = pd.read_html(url)
len(dfs)

In [None]:
dfs[0]

In [None]:
url ='https://en.wikipedia.org/wiki/The_Beatles_discography'
dfs = pd.read_html(url, match='List of studio albums', na_values='—')
len(dfs)

In [None]:
dfs[0].columns

In [None]:
url ='https://en.wikipedia.org/wiki/The_Beatles_discography'
dfs = pd.read_html(url, match='List of studio albums', na_values='—',
    header=[0,1])
len(dfs)

In [None]:
dfs[0]

In [None]:
dfs[0].columns

In [None]:
df = dfs[0]
df.columns = ['Title', 'Release', 'UK', 'AUS', 'CAN', 'FRA', 'GER',
    'NOR', 'US', 'Certifications']
df

In [None]:
res = (df
  # Drop the rows whose Title starts with 'Released'.
  .pipe(lambda df_: df_[~df_.Title.str.startswith('Released')])
  # Drop the last remaining row.
  .iloc[:-1]
  .assign(release_date=lambda df_: pd.to_datetime(
             df_.Release.str.extract(r'Released: (.*) Label')
               [0]
               # regex=True is required: in pandas 2.x str.replace treats the
               # pattern as a literal string by default, so '\[E\]' would never
               # match and the '[E]' marker would reach to_datetime.
               .str.replace(r'\[E\]', '', regex=True)
          ),
          # expand=False returns a Series (single capture group) rather than a
          # one-column DataFrame, which is what a new column should be.
          label=lambda df_: df_.Release.str.extract(r'Label: (.*)', expand=False)
         )
   # Keep the title, the chart columns and the two derived columns.
   .loc[:, ['Title', 'UK', 'AUS', 'CAN', 'FRA', 'GER', 'NOR',
            'US', 'release_date', 'label']]
)
res

### How it works\...

### There is more\...

In [None]:
# NOTE(review): this URL is the GitHub HTML "blob" page, not the raw CSV. The cell relies
# on that page containing a <table class="csv-data"> -- confirm GitHub still renders one.
url = 'https://github.com/mattharrison/datasets/blob/master/data/anscombes.csv'
# attrs restricts the search to tables whose class attribute is 'csv-data'.
dfs = pd.read_html(url, attrs={'class': 'csv-data'})
len(dfs)

In [None]:
dfs[0]