# Agenda

1. Extension dtypes
2. Files
    - CSV
    - JSON
    - Excel
    - Other formats

In [2]:
import numpy as np

# Five evenly spaced integers, 10 through 50; NumPy infers an integer dtype.
a = np.arange(10, 60, 10)
a

array([10, 20, 30, 40, 50])

In [3]:
# Inspect the dtype NumPy inferred for the array (int64 in the output below).
a.dtype

dtype('int64')

In [4]:
# Overwrite one element with another integer; the array stays integer-typed.
a[2] = 2345
a

array([  10,   20, 2345,   40,   50])

In [5]:
# Assign a float into the integer array: the displayed result holds 23, not
# 23.45, because the value is truncated to fit the existing integer dtype.
a[2] = 23.45
a

array([10, 20, 23, 40, 50])

In [6]:
# Assign a numeric string: it is converted on assignment, so the array shows 123.
a[3] = '123'
a

array([ 10,  20,  23, 123,  50])

In [7]:
# NaN is a float value, so an integer array cannot store it and NumPy raises
# ValueError. The error is the point of this cell, so catch it and print it:
# the message is still shown, but "Restart & Run All" can continue past it.
try:
    a[4] = np.nan
except ValueError as err:
    print(f'{type(err).__name__}: {err}')

ValueError: cannot convert float NaN to integer

In [9]:
import pandas as pd
from pandas import Series, DataFrame

In [10]:
# Integer Series holding 10, 20, ..., 50 with the default 0..4 index.
s = Series(range(10, 60, 10))
s

0    10
1    20
2    30
3    40
4    50
dtype: int64

In [11]:
# Replace one value with another integer via .loc; the dtype remains int64.
s.loc[2] = 2345
s

0      10
1      20
2    2345
3      40
4      50
dtype: int64

In [12]:
# Assign a float into the int64 Series. Unlike the NumPy array earlier, the
# value is not truncated: the next cell shows 23.45 stored with dtype float64.
s.loc[2] = 23.45


In [13]:
# Every element is now displayed as a float and the dtype reads float64.
s

0    10.00
1    20.00
2    23.45
3    40.00
4    50.00
dtype: float64

In [14]:
# Start again from a fresh integer Series, then store NaN in one position.
s = Series([10, 20, 30, 40, 50])
s.loc[2] = np.nan

In [15]:
# Holding NaN changed the dtype to float64; the integers print as 10.0, 20.0, ...
s

0    10.0
1    20.0
2     NaN
3    40.0
4    50.0
dtype: float64

In [16]:
# One word per element; the output below reports the dtype as object.
s = Series(['this', 'is', 'a', 'test'])
s

0    this
1      is
2       a
3    test
dtype: object

In [17]:
# Store NaN in the Series of strings; the next cell shows it sitting alongside
# the remaining strings.
s.loc[2] = np.nan

In [18]:
# The dtype is still object even though one element is now NaN.
s

0    this
1      is
2     NaN
3    test
dtype: object

In [19]:
# String lengths via the .str accessor: the NaN element yields NaN, and the
# lengths come back as float64 rather than integers.
s.str.len()

0    4.0
1    2.0
2    NaN
3    4.0
dtype: float64

In [20]:
# First character of each string; the NaN element stays NaN.
s.str.get(0)

0      t
1      i
2    NaN
3      t
dtype: object

In [21]:
# Same five integers, this time stored with pandas' Int64 extension dtype
# (requested through its string alias).
s = Series([10, 20, 30, 40, 50], dtype='Int64')

In [22]:
# The dtype prints as Int64 (capital I), distinct from NumPy's int64.
s

0    10
1    20
2    30
3    40
4    50
dtype: Int64

In [23]:
# Store a missing value in the Int64 Series; the next cell shows it as <NA>
# with the dtype still Int64 (no change to float64).
s.loc[2] = np.nan

In [24]:
# The other values are still displayed as integers next to the <NA> entry.
s

0      10
1      20
2    <NA>
3      40
4      50
dtype: Int64

In [25]:
# NumPy's missing-value marker; it displays as nan.
np.nan

nan

In [29]:
# pandas' own missing-value marker; it displays as <NA>.
pd.NA

<NA>

In [31]:
# Small sample frame with one integer, one float and one string column.
df = DataFrame({
    'a': [10, 20, 30],
    'b': [11.1, 22.2, 33.3],
    'c': ['hello', 'out', 'there'],
})
df

Unnamed: 0,a,b,c
0,10,11.1,hello
1,20,22.2,out
2,30,33.3,there


In [32]:
# Dtypes pandas inferred by default: int64, float64 and object.
df.dtypes

a      int64
b    float64
c     object
dtype: object

In [33]:
# convert_dtypes() returns a converted frame whose values look the same as
# before. The result is not assigned here, so df itself is unchanged.
df.convert_dtypes()

Unnamed: 0,a,b,c
0,10,11.1,hello
1,20,22.2,out
2,30,33.3,there


In [34]:
# The converted frame uses the extension dtypes Int64, Float64 and string[python].
df.convert_dtypes().dtypes

a             Int64
b           Float64
c    string[python]
dtype: object

In [35]:
# Rebind df to the converted frame so the extension dtypes are kept.
df = df.convert_dtypes()

In [36]:
# Replace every value in row 2 with pd.NA.
df.loc[2] = [pd.NA, pd.NA, pd.NA]

In [37]:
# Row 2 now shows as missing in all three columns.
df

Unnamed: 0,a,b,c
0,10.0,11.1,hello
1,20.0,22.2,out
2,,,


In [38]:
# Each column kept its extension dtype after pd.NA was stored in it.
df.dtypes

a             Int64
b           Float64
c    string[python]
dtype: object

In [39]:
# Rebuild the sample frame and switch it to extension dtypes in one expression.
df = DataFrame({
    'a': [10, 20, 30],
    'b': [11.1, 22.2, 33.3],
    'c': ['hello', 'out', 'there'],
}).convert_dtypes()

In [40]:
# Convert only column b back to NumPy's float64; a and c keep their
# extension dtypes, as the dtypes listing shows.
df['b'] = df['b'].astype(np.float64)
df.dtypes

a             Int64
b           float64
c    string[python]
dtype: object

In [41]:
# Try to store a non-integer float in the Int64 column. This raises TypeError:
# the value is neither truncated nor does the column change dtype.
df.loc[0, 'a'] = 12.34

TypeError: Invalid value '12.34' for dtype Int64

In [50]:
# Switch IPython's exception reporting to the Minimal mode.
%xmode Minimal

Exception reporting mode: Minimal


In [51]:
# Repeat the failing assignment to see the same TypeError reported in Minimal
# mode. The exception is deliberately left uncaught so the traceback machinery
# has something to display.
df.loc[0, 'a'] = 12.34

TypeError: Invalid value '12.34' for dtype Int64

In [53]:
# Re-display the most recent exception, this time requesting the Plain mode.
%tb Plain

TypeError: Invalid value '12.34' for dtype Int64

In [54]:
# The failed assignments left the frame unchanged: a[0] is still 10.
df

Unnamed: 0,a,b,c
0,10,11.1,hello
1,20,22.2,out
2,30,33.3,there


In [55]:
# Copy the frame to the system clipboard; nothing is displayed in the notebook.
df.to_clipboard()

In [56]:
# Export as a nested dict of the form {column: {index label: value}}.
df.to_dict()

{'a': {0: 10, 1: 20, 2: 30},
 'b': {0: 11.1, 1: 22.2, 2: 33.3},
 'c': {0: 'hello', 1: 'out', 2: 'there'}}

In [57]:
# Display the frame again; it still has the three rows labelled 0, 1 and 2.
df

Unnamed: 0,a,b,c
0,10,11.1,hello
1,20,22.2,out
2,30,33.3,there


In [58]:
# Assigning to an index label that does not exist yet (3) adds a new row.
df.loc[3] = [40, 44.4, 'hi']

In [59]:
# The frame now has a fourth row, labelled 3.
df

Unnamed: 0,a,b,c
0,10,11.1,hello
1,20,22.2,out
2,30,33.3,there
3,40,44.4,hi


In [60]:
# Replace the index with labels containing a duplicate: 2 now appears twice.
df.index = [0, 1, 2, 2]

In [61]:
# The last two rows now share the index label 2.
df

Unnamed: 0,a,b,c
0,10,11.1,hello
1,20,22.2,out
2,30,33.3,there
2,40,44.4,hi


In [62]:
# With a duplicated index label the inner dicts can hold only one entry per
# label: the output keeps the last row labelled 2 (40, 44.4, 'hi') and the
# earlier one (30, 33.3, 'there') is missing, with no error shown.
df.to_dict()

{'a': {0: 10, 1: 20, 2: 40},
 'b': {0: 11.1, 1: 22.2, 2: 44.4},
 'c': {0: 'hello', 1: 'out', 2: 'hi'}}

In [64]:
# No path is given, so to_xml() returns the document as a string; print() is
# used so its line breaks render. All four rows appear, including both rows
# labelled 2.
print(df.to_xml())

<?xml version='1.0' encoding='utf-8'?>
<data>
  <row>
    <index>0</index>
    <a>10</a>
    <b>11.1</b>
    <c>hello</c>
  </row>
  <row>
    <index>1</index>
    <a>20</a>
    <b>22.2</b>
    <c>out</c>
  </row>
  <row>
    <index>2</index>
    <a>30</a>
    <b>33.3</b>
    <c>there</c>
  </row>
  <row>
    <index>2</index>
    <a>40</a>
    <b>44.4</b>
    <c>hi</c>
  </row>
</data>


In [66]:
# Write the frame to a CSV file in the current working directory.
df.to_csv('mydata.csv')

In [67]:
# Show the file's contents with the shell's cat command. The index was written
# as the first column, under an empty header.
!cat mydata.csv

,a,b,c
0,10,11.1,hello
1,20,22.2,out
2,30,33.3,there
2,40,44.4,hi


In [68]:
# Show the signature and documentation of to_csv (sep, na_rep, index, ...).
help(df.to_csv)

Help on method to_csv in module pandas.core.generic:

to_csv(path_or_buf: 'FilePath | WriteBuffer[bytes] | WriteBuffer[str] | None' = None, sep: 'str' = ',', na_rep: 'str' = '', float_format: 'str | Callable | None' = None, columns: 'Sequence[Hashable] | None' = None, header: 'bool_t | list[str]' = True, index: 'bool_t' = True, index_label: 'IndexLabel | None' = None, mode: 'str' = 'w', encoding: 'str | None' = None, compression: 'CompressionOptions' = 'infer', quoting: 'int | None' = None, quotechar: 'str' = '"', lineterminator: 'str | None' = None, chunksize: 'int | None' = None, date_format: 'str | None' = None, doublequote: 'bool_t' = True, escapechar: 'str | None' = None, decimal: 'str' = '.', errors: 'str' = 'strict', storage_options: 'StorageOptions' = None) -> 'str | None' method of pandas.core.frame.DataFrame instance
    Write object to a comma-separated values (csv) file.
    
    Parameters
    ----------
    path_or_buf : str, path object, file-like object, or None, defaul

In [69]:
# Same export, but with a tab as the separator. It writes to the same path as
# before, so the comma-separated version is replaced.
df.to_csv('mydata.csv', sep='\t')

In [70]:
# Show the rewritten file; the fields are now separated by tabs.
!cat mydata.csv

	a	b	c
0	10	11.1	hello
1	20	22.2	out
2	30	33.3	there
2	40	44.4	hi


In [74]:
# iloc[:-1] selects every row except the last, so only three rows are written.
# NOTE(review): presumably done because the full frame has a duplicated index
# label, which a JSON layout keyed by index label cannot represent - confirm.
df.iloc[:-1].to_json('mydata.json')

In [75]:
# Show the JSON file: one object per column, keyed by index label (as strings).
!cat mydata.json

{"a":{"0":10,"1":20,"2":30},"b":{"0":11.1,"1":22.2,"2":33.3},"c":{"0":"hello","1":"out","2":"there"}}

In [76]:
# Show the signature and documentation of to_json (orient, lines, indent, ...).
help(df.to_json)

Help on method to_json in module pandas.core.generic:

to_json(path_or_buf: 'FilePath | WriteBuffer[bytes] | WriteBuffer[str] | None' = None, orient: 'str | None' = None, date_format: 'str | None' = None, double_precision: 'int' = 10, force_ascii: 'bool_t' = True, date_unit: 'str' = 'ms', default_handler: 'Callable[[Any], JSONSerializable] | None' = None, lines: 'bool_t' = False, compression: 'CompressionOptions' = 'infer', index: 'bool_t' = True, indent: 'int | None' = None, storage_options: 'StorageOptions' = None, mode: "Literal['a', 'w']" = 'w') -> 'str | None' method of pandas.core.frame.DataFrame instance
    Convert the object to a JSON string.
    
    Note NaN's and None will be converted to null and datetime objects
    will be converted to UNIX timestamps.
    
    Parameters
    ----------
    path_or_buf : str, path object, file-like object, or None, default None
        String, path object (implementing os.PathLike[str]), or file-like
        object implementing a write()

In [77]:
# Write the frame to an Excel workbook in the current working directory.
df.to_excel('mydata.xlsx')

In [78]:
# Hand the workbook to the shell's `open` command to view it.
# NOTE(review): `open` is the macOS launcher - presumably this cell is not
# portable to other platforms; confirm before sharing.
!open mydata.xlsx

In [80]:
# reset_index() moves the current index into an ordinary column named 'index'
# and gives the frame a fresh 0..3 range index before writing; both are visible
# in the file metadata dumped further down.
# NOTE(review): presumably required because Feather does not accept the
# duplicated custom index - confirm.
df.reset_index().to_feather('mydata.feather')

In [81]:
# Write the same frame to Parquet. No reset_index() is used here; the file dump
# further down shows the index stored as __index_level_0__.
df.to_parquet('mydata.parquet')

In [82]:
# List the exported files to compare their sizes on disk.
!ls -l mydata.*

-rw-r--r-- 1 reuven staff   66 Jun 22 10:03 mydata.csv
-rw-r--r-- 1 reuven staff 2746 Jun 22 10:09 mydata.feather
-rw-r--r-- 1 reuven staff  101 Jun 22 10:04 mydata.json
-rw-r--r-- 1 reuven staff 3423 Jun 22 10:09 mydata.parquet
-rw-r--r-- 1 reuven staff 5031 Jun 22 10:06 mydata.xlsx


In [83]:
# Dump the Feather file to the output: apart from an embedded JSON metadata
# string, the bytes are not human-readable.
!cat mydata.feather

ARROW1  ����        
     
          
      
                     �     �  {"index_columns": [{"kind": "range", "name": null, "start": 0, "stop": 4, "step": 1}], "column_indexes": [{"name": null, "field_name": null, "pandas_type": "unicode", "numpy_type": "object", "metadata": {"encoding": "UTF-8"}}], "columns": [{"name": "index", "field_name": "index", "pandas_type": "int64", "numpy_type": "int64", "metadata": null}, {"name": "a", "field_name": "a", "pandas_type": "int64", "numpy_type": "Int64", "metadata": null}, {"name": "b", "field_name": "b", "pandas_type": "float64", "numpy_type": "float64", "metadata": null}, {"name": "c", "field_name": "c", "pandas_type": "unicode", "numpy_type": "object", "metadata": null}], "creator": {"library": "pyarrow", "version": "12.0.0"}, "pandas_version": "2.0.2"}     pandas     �   `   0      ����                  c        ����                  b          ����                  a   ����   @   

In [84]:
# Same for the Parquet file: mostly unreadable bytes, with the string values
# and column names visible in places.
!cat mydata.parquet

PAR1@8L    
 	 	@        (        ,(       
        ((       
          
$   � &�5 a��&\&(       
        ((       
        ,      @<L     3&@H36@fffff�@@333333F@ ,333333F@333333&@ (333333F@333333&@   
$   � &�
5 b��&�&�333333F@333333&@ (333333F@333333&@ ,      >BL   x   hello   out   there   hi ,6 (therehello   
$   � &�5 c��&�&�6 (therehello ,      0,L     <               ,                (                  
$   � &�
5 __index_level_0__��&�	&�	                (                ,      \5 schema %a 
%b %c% L   %__index_level_0__ L&�5 a��&\&(       
        ((       
        ,      &�
5 b

In [90]:
# 10,000 x 10 frame of random integers in [0, 1000), used to compare file sizes.
# A locally seeded generator makes the data, and therefore the files written
# from it, identical on every Restart & Run All.
SEED = 42
rng = np.random.default_rng(SEED)
df = DataFrame(rng.integers(0, 1000, size=(10_000, 10)),
               columns=list('abcdefghij'))

In [91]:
# Write the 10,000-row frame as Parquet.
df.to_parquet('bigdata.parquet')

In [92]:
# Write the same frame as Feather. The frame was built without a custom index,
# and it is written directly, with no reset_index() call.
df.to_feather('bigdata.feather')

In [95]:
!ls -l bigdata*

-rw-r--r-- 1 reuven staff 437850 Jun 22 10:14 bigdata.csv
-rw-r--r-- 1 reuven staff 325322 Jun 22 10:14 bigdata.feather
-rw-r--r-- 1 reuven staff 172358 Jun 22 10:14 bigdata.parquet


In [94]:
df.to_csv('bigdata.csv')