# Agenda

1. Extension dtypes
2. Files
    - CSV
    - JSON
    - Excel
    - Other formats

In [2]:
import numpy as np

# A one-dimensional array of five integers; the whole array shares one dtype.
a = np.array(range(10, 51, 10))
a

array([10, 20, 30, 40, 50])

In [3]:
a.dtype

dtype('int64')

In [4]:
a[2] = 2345
a

array([  10,   20, 2345,   40,   50])

In [5]:
# Assigning a float into the integer array does not change the array's dtype:
# the value is truncated (the output below shows 23, not 23.45).
a[2] = 23.45
a

array([10, 20, 23, 40, 50])

In [6]:
# A numeric string is converted to an integer on assignment
# (the output below shows 123).
a[3] = '123'
a

array([ 10,  20,  23, 123,  50])

In [7]:
# Demonstration: NaN is a float and cannot be converted to an integer,
# so this assignment raises ValueError (shown below).
a[4] = np.nan

ValueError: cannot convert float NaN to integer

In [9]:
import pandas as pd
from pandas import Series, DataFrame

In [10]:
s = Series([10, 20, 30, 40, 50])
s

0    10
1    20
2    30
3    40
4    50
dtype: int64

In [11]:
s.loc[2] = 2345
s

0      10
1      20
2    2345
3      40
4      50
dtype: int64

In [12]:
# Unlike the NumPy array above, the Series keeps the fractional part:
# the next cell shows every value as a float, with dtype float64.
s.loc[2] = 23.45


In [13]:
s

0    10.00
1    20.00
2    23.45
3    40.00
4    50.00
dtype: float64

In [14]:
# Start again from a fresh integer Series, then store a NaN in it;
# the next cell shows the result has dtype float64.
s = Series([10, 20, 30, 40, 50])
s.loc[2] = np.nan

In [15]:
s

0    10.0
1    20.0
2     NaN
3    40.0
4    50.0
dtype: float64

In [16]:
# Four words, one per element.
s = Series(['this', 'is', 'a', 'test'])
s

0    this
1      is
2       a
3    test
dtype: object

In [17]:
# Replace one of the strings with NaN; the next cell shows it stored as NaN
# next to the remaining strings, with the dtype still reported as object.
s.loc[2] = np.nan

In [18]:
s

0    this
1      is
2     NaN
3    test
dtype: object

In [19]:
s.str.len()

0    4.0
1    2.0
2    NaN
3    4.0
dtype: float64

In [20]:
s.str.get(0)

0      t
1      i
2    NaN
3      t
dtype: object

In [21]:
# Same five integers, this time with pandas' nullable integer extension
# dtype, requested by its string alias.
s = Series([10, 20, 30, 40, 50], dtype='Int64')

In [22]:
s

0    10
1    20
2    30
3    40
4    50
dtype: Int64

In [23]:
# With the Int64 extension dtype the missing value is displayed as <NA> and
# the dtype stays Int64 (see the next cell) instead of becoming float64.
s.loc[2] = np.nan

In [24]:
s

0      10
1      20
2    <NA>
3      40
4      50
dtype: Int64

In [25]:
np.nan

nan

In [29]:
pd.NA

<NA>

In [31]:
# Three columns of different kinds: integers, floats, and strings.
df = DataFrame(dict(a=[10, 20, 30],
                    b=[11.1, 22.2, 33.3],
                    c=['hello', 'out', 'there']))
df

Unnamed: 0,a,b,c
0,10,11.1,hello
1,20,22.2,out
2,30,33.3,there


In [32]:
df.dtypes

a      int64
b    float64
c     object
dtype: object

In [33]:
df.convert_dtypes()

Unnamed: 0,a,b,c
0,10,11.1,hello
1,20,22.2,out
2,30,33.3,there


In [34]:
df.convert_dtypes().dtypes

a             Int64
b           Float64
c    string[python]
dtype: object

In [35]:
df = df.convert_dtypes()

In [36]:
# Overwrite every column of the row labelled 2 with pd.NA; the dtypes shown
# two cells below are the same extension dtypes as before the assignment.
df.loc[2] = [pd.NA, pd.NA, pd.NA]

In [37]:
df

Unnamed: 0,a,b,c
0,10.0,11.1,hello
1,20.0,22.2,out
2,,,


In [38]:
df.dtypes

a             Int64
b           Float64
c    string[python]
dtype: object

In [39]:
# Rebuild the original frame and switch to the extension dtypes in one step.
df = DataFrame({
    'a': [10, 20, 30],
    'b': [11.1, 22.2, 33.3],
    'c': ['hello', 'out', 'there'],
}).convert_dtypes()

In [40]:
df['b'] = df['b'].astype(np.float64)
df.dtypes

a             Int64
b           float64
c    string[python]
dtype: object

In [41]:
# Demonstration: column 'a' has dtype Int64, which rejects the value 12.34,
# so this assignment raises TypeError (shown below).
df.loc[0, 'a'] = 12.34

TypeError: Invalid value '12.34' for dtype Int64

In [50]:
%xmode Minimal

Exception reporting mode: Minimal


In [51]:
df.loc[0, 'a'] = 12.34

TypeError: Invalid value '12.34' for dtype Int64

In [53]:
%tb Plain

TypeError: Invalid value '12.34' for dtype Int64

In [54]:
df

Unnamed: 0,a,b,c
0,10,11.1,hello
1,20,22.2,out
2,30,33.3,there


In [55]:
df.to_clipboard()

In [56]:
df.to_dict()

{'a': {0: 10, 1: 20, 2: 30},
 'b': {0: 11.1, 1: 22.2, 2: 33.3},
 'c': {0: 'hello', 1: 'out', 2: 'there'}}

In [57]:
df

Unnamed: 0,a,b,c
0,10,11.1,hello
1,20,22.2,out
2,30,33.3,there


In [58]:
df.loc[3] = [40, 44.4, 'hi']

In [59]:
df

Unnamed: 0,a,b,c
0,10,11.1,hello
1,20,22.2,out
2,30,33.3,there
3,40,44.4,hi


In [60]:
# Give the last two rows the same index label (2); the to_dict() output two
# cells below then contains only one entry for that label.
df.index = [0, 1, 2, 2]

In [61]:
df

Unnamed: 0,a,b,c
0,10,11.1,hello
1,20,22.2,out
2,30,33.3,there
2,40,44.4,hi


In [62]:
df.to_dict()

{'a': {0: 10, 1: 20, 2: 40},
 'b': {0: 11.1, 1: 22.2, 2: 44.4},
 'c': {0: 'hello', 1: 'out', 2: 'hi'}}

In [64]:
print(df.to_xml())

<?xml version='1.0' encoding='utf-8'?>
<data>
  <row>
    <index>0</index>
    <a>10</a>
    <b>11.1</b>
    <c>hello</c>
  </row>
  <row>
    <index>1</index>
    <a>20</a>
    <b>22.2</b>
    <c>out</c>
  </row>
  <row>
    <index>2</index>
    <a>30</a>
    <b>33.3</b>
    <c>there</c>
  </row>
  <row>
    <index>2</index>
    <a>40</a>
    <b>44.4</b>
    <c>hi</c>
  </row>
</data>


In [66]:
df.to_csv('mydata.csv')

In [67]:
!cat mydata.csv

,a,b,c
0,10,11.1,hello
1,20,22.2,out
2,30,33.3,there
2,40,44.4,hi


In [68]:
help(df.to_csv)

Help on method to_csv in module pandas.core.generic:

to_csv(path_or_buf: 'FilePath | WriteBuffer[bytes] | WriteBuffer[str] | None' = None, sep: 'str' = ',', na_rep: 'str' = '', float_format: 'str | Callable | None' = None, columns: 'Sequence[Hashable] | None' = None, header: 'bool_t | list[str]' = True, index: 'bool_t' = True, index_label: 'IndexLabel | None' = None, mode: 'str' = 'w', encoding: 'str | None' = None, compression: 'CompressionOptions' = 'infer', quoting: 'int | None' = None, quotechar: 'str' = '"', lineterminator: 'str | None' = None, chunksize: 'int | None' = None, date_format: 'str | None' = None, doublequote: 'bool_t' = True, escapechar: 'str | None' = None, decimal: 'str' = '.', errors: 'str' = 'strict', storage_options: 'StorageOptions' = None) -> 'str | None' method of pandas.core.frame.DataFrame instance
    Write object to a comma-separated values (csv) file.
    
    Parameters
    ----------
    path_or_buf : str, path object, file-like object, or None, defaul

In [69]:
df.to_csv('mydata.csv', sep='\t')

In [70]:
!cat mydata.csv

	a	b	c
0	10	11.1	hello
1	20	22.2	out
2	30	33.3	there
2	40	44.4	hi


In [74]:
# Only the first three rows are exported.
# NOTE(review): presumably the last row is dropped because the duplicate
# index label 2 cannot be written with the default orient -- confirm.
df.iloc[:-1].to_json('mydata.json')

In [75]:
!cat mydata.json

{"a":{"0":10,"1":20,"2":30},"b":{"0":11.1,"1":22.2,"2":33.3},"c":{"0":"hello","1":"out","2":"there"}}

In [76]:
help(df.to_json)

Help on method to_json in module pandas.core.generic:

to_json(path_or_buf: 'FilePath | WriteBuffer[bytes] | WriteBuffer[str] | None' = None, orient: 'str | None' = None, date_format: 'str | None' = None, double_precision: 'int' = 10, force_ascii: 'bool_t' = True, date_unit: 'str' = 'ms', default_handler: 'Callable[[Any], JSONSerializable] | None' = None, lines: 'bool_t' = False, compression: 'CompressionOptions' = 'infer', index: 'bool_t' = True, indent: 'int | None' = None, storage_options: 'StorageOptions' = None, mode: "Literal['a', 'w']" = 'w') -> 'str | None' method of pandas.core.frame.DataFrame instance
    Convert the object to a JSON string.
    
    Note NaN's and None will be converted to null and datetime objects
    will be converted to UNIX timestamps.
    
    Parameters
    ----------
    path_or_buf : str, path object, file-like object, or None, default None
        String, path object (implementing os.PathLike[str]), or file-like
        object implementing a write()

In [77]:
df.to_excel('mydata.xlsx')

In [78]:
!open mydata.xlsx

In [80]:
# NOTE(review): reset_index() is presumably needed because to_feather()
# does not accept this frame's non-default index -- confirm.
df.reset_index().to_feather('mydata.feather')

In [81]:
df.to_parquet('mydata.parquet')

In [82]:
!ls -l mydata.*

-rw-r--r-- 1 reuven staff   66 Jun 22 10:03 mydata.csv
-rw-r--r-- 1 reuven staff 2746 Jun 22 10:09 mydata.feather
-rw-r--r-- 1 reuven staff  101 Jun 22 10:04 mydata.json
-rw-r--r-- 1 reuven staff 3423 Jun 22 10:09 mydata.parquet
-rw-r--r-- 1 reuven staff 5031 Jun 22 10:06 mydata.xlsx


In [90]:
# 10,000 rows x 10 columns of integers in [0, 1000).  A seeded generator makes
# the data (and therefore the files written from it below) reproducible
# across kernel restarts.
df = DataFrame(np.random.default_rng(0).integers(0, 1000, size=(10_000, 10)),
              columns=list('abcdefghij'))

In [91]:
df.to_parquet('bigdata.parquet')

In [92]:
df.to_feather('bigdata.feather')

In [95]:
!ls -l bigdata*

-rw-r--r-- 1 reuven staff 437850 Jun 22 10:14 bigdata.csv
-rw-r--r-- 1 reuven staff 325322 Jun 22 10:14 bigdata.feather
-rw-r--r-- 1 reuven staff 172358 Jun 22 10:14 bigdata.parquet


In [94]:
# NOTE: per the execution counts, this cell (94) ran before the `ls` cell
# shown above it (95), which is why bigdata.csv already appears in that listing.
df.to_csv('bigdata.csv')

In [98]:
%timeit df = pd.read_csv('bigdata.csv')

5.56 ms ± 579 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [100]:
%timeit df = pd.read_feather('bigdata.feather')

1.3 ms ± 3.14 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [101]:
%timeit df = pd.read_parquet('bigdata.parquet')

2.04 ms ± 133 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [97]:
# NOTE(review): the output includes an 'Unnamed: 0' column, so df was
# presumably re-read from bigdata.csv in a cell that is not shown here
# (this cell's execution count, 97, also precedes the timing cells above) -- confirm.
df.head()

Unnamed: 0.1,Unnamed: 0,a,b,c,d,e,f,g,h,i,j
0,0,982,787,642,102,149,267,860,204,986,546
1,1,92,805,130,96,66,915,977,587,52,218
2,2,775,200,791,960,469,906,478,122,687,977
3,3,846,410,53,699,680,438,818,453,308,765
4,4,890,666,936,654,568,530,1,773,402,27


In [102]:
# -nc (no-clobber): skip the download when the archive is already present,
# so re-running the notebook does not fetch a second copy.
!wget -nc https://files.lerner.co.il/data-science-exercise-files.zip

--2023-06-22 10:17:46--  https://files.lerner.co.il/data-science-exercise-files.zip
Resolving files.lerner.co.il (files.lerner.co.il)... 64.227.9.246
Connecting to files.lerner.co.il (files.lerner.co.il)|64.227.9.246|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1561985 (1.5M) [application/zip]
Saving to: ‘data-science-exercise-files.zip’


2023-06-22 10:17:49 (882 KB/s) - ‘data-science-exercise-files.zip’ saved [1561985/1561985]



In [103]:
# -o: overwrite existing files without prompting, so a re-run cannot stall
# on unzip's interactive "replace?" question.
!unzip -o data-science-exercise-files.zip

Archive:  data-science-exercise-files.zip
 extracting: airports.zip            
  inflating: taxi.csv                
  inflating: burrito_current.csv     
   creating: airports/
  inflating: titanic3.csv            
  inflating: celebrity_deaths_2016.csv  
  inflating: languages.csv           
  inflating: airlines.dat            


In [104]:
!ls *.csv

bigdata.csv	     celebrity_deaths_2016.csv	mydata.csv  titanic3.csv
burrito_current.csv  languages.csv		taxi.csv


In [105]:
%pwd

'/Users/reuven/Courses/Current/Apple-2023-06June-analytics'

In [106]:
!head taxi.csv

VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,RateCodeID,store_and_fwd_flag,dropoff_longitude,dropoff_latitude,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount
2,2015-06-02 11:19:29,2015-06-02 11:47:52,1,1.63,-73.954429626464844,40.764141082763672,1,N,-73.974754333496094,40.754093170166016,2,17,0,0.5,0,0,0.3,17.8
2,2015-06-02 11:19:30,2015-06-02 11:27:56,1,.46,-73.971443176269531,40.758941650390625,1,N,-73.978538513183594,40.761909484863281,1,6.5,0,0.5,1,0,0.3,8.3
2,2015-06-02 11:19:31,2015-06-02 11:30:30,1,.87,-73.978111267089844,40.738433837890625,1,N,-73.990272521972656,40.745437622070313,1,8,0,0.5,2.2,0,0.3,11
2,2015-06-02 11:19:31,2015-06-02 11:39:02,1,2.13,-73.945892333984375,40.773529052734375,1,N,-73.971527099609375,40.760330200195312,1,13.5,0,0.5,2.86,0,0.3,17.16
1,2015-06-02 11:19:32,2015-06-02 11:32:49,1,1.40,-73.979087829589844,40.776771545410156,1

# Exercise: Taxi rides

1. Use `pd.read_csv` to read `taxi.csv` into a data frame.
2. Were there any rides with 0 passengers? Show the mean `trip_distance` and `total_amount` for such rides.
3. Were there any rides where the `total_amount` was <= 0? If so, show how far they went, on average.
4. Were there any rides where the `trip_distance` was 0? If so, how much did they pay, on average?

In [118]:
df = pd.read_csv('taxi.csv')

In [108]:
df.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,RateCodeID,store_and_fwd_flag,dropoff_longitude,dropoff_latitude,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount
0,2,2015-06-02 11:19:29,2015-06-02 11:47:52,1,1.63,-73.95443,40.764141,1,N,-73.974754,40.754093,2,17.0,0.0,0.5,0.0,0.0,0.3,17.8
1,2,2015-06-02 11:19:30,2015-06-02 11:27:56,1,0.46,-73.971443,40.758942,1,N,-73.978539,40.761909,1,6.5,0.0,0.5,1.0,0.0,0.3,8.3
2,2,2015-06-02 11:19:31,2015-06-02 11:30:30,1,0.87,-73.978111,40.738434,1,N,-73.990273,40.745438,1,8.0,0.0,0.5,2.2,0.0,0.3,11.0
3,2,2015-06-02 11:19:31,2015-06-02 11:39:02,1,2.13,-73.945892,40.773529,1,N,-73.971527,40.76033,1,13.5,0.0,0.5,2.86,0.0,0.3,17.16
4,1,2015-06-02 11:19:32,2015-06-02 11:32:49,1,1.4,-73.979088,40.776772,1,N,-73.982162,40.758999,2,9.5,0.0,0.5,0.0,0.0,0.3,10.3


In [111]:
# Rides recorded with 0 passengers:
# mean trip_distance and mean total_amount for those rides.

df.loc[df['passenger_count'].eq(0),
       ['trip_distance', 'total_amount']].mean()

trip_distance     4.60
total_amount     25.57
dtype: float64

In [113]:
# Were there any rides with 0 passengers?
# Show the individual zero-passenger rides themselves (not their mean).

# Rows and columns are selected in a single .loc call instead of chaining a
# second [] lookup onto the result of the first.
df.loc[df['passenger_count'] == 0, ['trip_distance', 'total_amount']]

Unnamed: 0,trip_distance,total_amount
5097,1.3,14.75
8313,7.9,36.39


In [116]:
# Rides whose total_amount was zero or negative:
# how far did they go, on average?

df.loc[df['total_amount'].le(0), 'trip_distance'].mean()

0.6066666666666667

In [121]:
# Rides whose trip_distance was 0:
# how much did they pay, on average?

df.loc[df['trip_distance'].eq(0), 'total_amount'].mean()

31.581940298507465

In [122]:
# Pickup and dropoff coordinates of the rides whose trip_distance was 0.
df.loc[df['trip_distance'].eq(0),
       ['pickup_longitude', 'pickup_latitude',
        'dropoff_longitude', 'dropoff_latitude']]

Unnamed: 0,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude
149,-73.978493,40.748562,-73.978493,40.748604
246,-73.983200,40.766949,-73.990410,40.766872
297,-73.937851,40.758236,0.000000,0.000000
657,-73.996460,40.732124,-73.996429,40.732147
660,0.000000,0.000000,-73.749077,40.707611
...,...,...,...,...
9016,-74.007530,40.740753,0.000000,0.000000
9087,-73.971931,40.791393,-73.971939,40.791382
9093,0.000000,0.000000,0.000000,0.000000
9740,-74.005714,40.740582,-74.005417,40.740803


In [124]:
df = pd.read_csv('taxi.csv')
df.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,RateCodeID,store_and_fwd_flag,dropoff_longitude,dropoff_latitude,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount
0,2,2015-06-02 11:19:29,2015-06-02 11:47:52,1,1.63,-73.95443,40.764141,1,N,-73.974754,40.754093,2,17.0,0.0,0.5,0.0,0.0,0.3,17.8
1,2,2015-06-02 11:19:30,2015-06-02 11:27:56,1,0.46,-73.971443,40.758942,1,N,-73.978539,40.761909,1,6.5,0.0,0.5,1.0,0.0,0.3,8.3
2,2,2015-06-02 11:19:31,2015-06-02 11:30:30,1,0.87,-73.978111,40.738434,1,N,-73.990273,40.745438,1,8.0,0.0,0.5,2.2,0.0,0.3,11.0
3,2,2015-06-02 11:19:31,2015-06-02 11:39:02,1,2.13,-73.945892,40.773529,1,N,-73.971527,40.76033,1,13.5,0.0,0.5,2.86,0.0,0.3,17.16
4,1,2015-06-02 11:19:32,2015-06-02 11:32:49,1,1.4,-73.979088,40.776772,1,N,-73.982162,40.758999,2,9.5,0.0,0.5,0.0,0.0,0.3,10.3


In [125]:
# usecols: load only the listed columns
# (columns may be given by name or by numeric position).

df = pd.read_csv(
    'taxi.csv',
    usecols=['passenger_count', 'trip_distance', 'total_amount'],
)

In [126]:
df.head()

Unnamed: 0,passenger_count,trip_distance,total_amount
0,1,1.63,17.8
1,1,0.46,8.3
2,1,0.87,11.0
3,1,2.13,17.16
4,1,1.4,10.3


In [127]:
# df.info -- tells us about the data frame

df = pd.read_csv('taxi.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9999 entries, 0 to 9998
Data columns (total 19 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   VendorID               9999 non-null   int64  
 1   tpep_pickup_datetime   9999 non-null   object 
 2   tpep_dropoff_datetime  9999 non-null   object 
 3   passenger_count        9999 non-null   int64  
 4   trip_distance          9999 non-null   float64
 5   pickup_longitude       9999 non-null   float64
 6   pickup_latitude        9999 non-null   float64
 7   RateCodeID             9999 non-null   int64  
 8   store_and_fwd_flag     9999 non-null   object 
 9   dropoff_longitude      9999 non-null   float64
 10  dropoff_latitude       9999 non-null   float64
 11  payment_type           9999 non-null   int64  
 12  fare_amount            9999 non-null   float64
 13  extra                  9999 non-null   float64
 14  mta_tax                9999 non-null   float64
 15  tip_

In [128]:
# Load just three columns; info() then lists only those columns and reports
# the frame's memory usage.
df = pd.read_csv(
    'taxi.csv',
    usecols=['passenger_count', 'trip_distance', 'total_amount'],
)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9999 entries, 0 to 9998
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   passenger_count  9999 non-null   int64  
 1   trip_distance    9999 non-null   float64
 2   total_amount     9999 non-null   float64
dtypes: float64(2), int64(1)
memory usage: 234.5 KB


In [130]:
!ls -lh ~/Courses/Current/Data/*nyc*

-rw-r--r-- 1 reuven staff 2.2G Jul 29  2021 /Users/reuven/Courses/Current/Data/nyc-parking-violations-2020.csv
-rw-r--r-- 1 reuven staff 1.9K Jul 29  2021 /Users/reuven/Courses/Current/Data/nyc-temps.txt
-rw-r--r-- 1 reuven staff 656M Jul 29  2021 /Users/reuven/Courses/Current/Data/nyc_taxi_2019-01.csv
-rw-r--r-- 1 reuven staff 558M Jul 29  2021 /Users/reuven/Courses/Current/Data/nyc_taxi_2019-07.csv
-rw-r--r-- 1 reuven staff 567M Jul 29  2021 /Users/reuven/Courses/Current/Data/nyc_taxi_2020-01.csv
-rw-r--r-- 1 reuven staff  70M Jul 29  2021 /Users/reuven/Courses/Current/Data/nyc_taxi_2020-07.csv


In [131]:
# Request narrow NumPy dtypes (by their string names) for the three columns
# and let info() report the resulting memory usage.
df = pd.read_csv(
    'taxi.csv',
    usecols=['passenger_count', 'trip_distance', 'total_amount'],
    dtype={'passenger_count': 'int8',
           'trip_distance': 'float16',
           'total_amount': 'float16'},
)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9999 entries, 0 to 9998
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   passenger_count  9999 non-null   int8   
 1   trip_distance    9999 non-null   float16
 2   total_amount     9999 non-null   float16
dtypes: float16(2), int8(1)
memory usage: 49.0 KB


In [133]:
# Same three columns, now with pandas extension dtypes requested by their
# string aliases; info() reports the resulting memory usage.
df = pd.read_csv(
    'taxi.csv',
    usecols=['passenger_count', 'trip_distance', 'total_amount'],
    dtype={'passenger_count': 'Int8',
           'trip_distance': 'Float32',
           'total_amount': 'Float32'},
)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9999 entries, 0 to 9998
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   passenger_count  9999 non-null   Int8   
 1   trip_distance    9999 non-null   Float32
 2   total_amount     9999 non-null   Float32
dtypes: Float32(2), Int8(1)
memory usage: 117.3 KB


In [134]:
# Same three columns with the NumPy dtypes int8 / float32 (given by name),
# for comparison with the extension-dtype version.
df = pd.read_csv(
    'taxi.csv',
    usecols=['passenger_count', 'trip_distance', 'total_amount'],
    dtype={'passenger_count': 'int8',
           'trip_distance': 'float32',
           'total_amount': 'float32'},
)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9999 entries, 0 to 9998
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   passenger_count  9999 non-null   int8   
 1   trip_distance    9999 non-null   float32
 2   total_amount     9999 non-null   float32
dtypes: float32(2), int8(1)
memory usage: 88.0 KB


In [136]:
# Load the pickup timestamp as well and use it as the index of the frame.
df = pd.read_csv(
    'taxi.csv',
    index_col='tpep_pickup_datetime',
    usecols=['tpep_pickup_datetime', 'passenger_count',
             'trip_distance', 'total_amount'],
)
df.head()

Unnamed: 0_level_0,passenger_count,trip_distance,total_amount
tpep_pickup_datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2015-06-02 11:19:29,1,1.63,17.8
2015-06-02 11:19:30,1,0.46,8.3
2015-06-02 11:19:31,1,0.87,11.0
2015-06-02 11:19:31,1,2.13,17.16
2015-06-02 11:19:32,1,1.4,10.3


# `pd.read_csv` options

- `sep` -- separator character; the default is `,`
- `usecols` -- which columns do you want to include?
- `index_col` -- which column(s) do you want to be the index?
- `names` -- list of names to give the columns
- `header` -- which line contains the column headers? Use `None` if the file has no header line

In [138]:
# Write a 5x5 frame of random integers WITHOUT a header row: the file shown
# below has no header line, and the read_csv call that follows passes
# header=None and supplies its own column names.
DataFrame(np.random.randint(0, 100, [5,5])).to_csv('randomstuff.csv',
                                                   header=False)

In [142]:
!cat randomstuff.csv

0,97,5,9,59,76
1,16,95,40,88,56
2,17,75,51,71,1
3,90,88,87,43,86
4,53,74,33,62,18


In [153]:
# No header row is read from the file, so the column names are supplied
# here; the first field of each line becomes the index.
pd.read_csv(
    'randomstuff.csv',
    names=['a', 'b', 'c', 'd', 'e'],
    header=None,
    index_col=0,
)

Unnamed: 0,a,b,c,d,e
0,97,5,9,59,76
1,16,95,40,88,56
2,17,75,51,71,1
3,90,88,87,43,86
4,53,74,33,62,18


In [154]:
!head -20 /etc/passwd

##
# User Database
# 
# Note that this file is consulted directly only when the system is running
# in single-user mode.  At other times this information is provided by
# Open Directory.
#
# See the opendirectoryd(8) man page for additional information about
# Open Directory.
##
nobody:*:-2:-2:Unprivileged User:/var/empty:/usr/bin/false
root:*:0:0:System Administrator:/var/root:/bin/sh
daemon:*:1:1:System Services:/var/root:/usr/bin/false
_uucp:*:4:4:Unix to Unix Copy Protocol:/var/spool/uucp:/usr/sbin/uucico
_taskgated:*:13:13:Task Gate Daemon:/var/empty:/usr/bin/false
_networkd:*:24:24:Network Services:/var/networkd:/usr/bin/false
_installassistant:*:25:25:Install Assistant:/var/empty:/usr/bin/false
_lp:*:26:26:Printing Services:/var/spool/cups:/usr/bin/false
_postfix:*:27:27:Postfix Mail Server:/var/spool/postfix:/usr/bin/false
_scsd:*:31:31:Service Configuration Service:/var/empty:/usr/bin/false


In [159]:
# Parse the colon-separated passwd file: '#' starts a comment, there is no
# header row, and the username field becomes the index.
pd.read_csv(
    '/etc/passwd',
    sep=':',
    comment='#',
    header=None,
    names=['username', 'passwd', 'uid', 'gid', 'name', 'homedir', 'shell'],
    index_col='username',
)

Unnamed: 0_level_0,passwd,uid,gid,name,homedir,shell
username,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
nobody,*,-2,-2,Unprivileged User,/var/empty,/usr/bin/false
root,*,0,0,System Administrator,/var/root,/bin/sh
daemon,*,1,1,System Services,/var/root,/usr/bin/false
_uucp,*,4,4,Unix to Unix Copy Protocol,/var/spool/uucp,/usr/sbin/uucico
_taskgated,*,13,13,Task Gate Daemon,/var/empty,/usr/bin/false
...,...,...,...,...,...,...
_notification_proxy,*,285,285,Notification Proxy,/var/empty,/usr/bin/false
_avphidbridge,*,288,288,Apple Virtual Platform HID Bridge,/var/empty,/usr/bin/false
_biome,*,289,289,Biome,/var/db/biome,/usr/bin/false
_backgroundassets,*,291,291,Background Assets Service,/var/empty,/usr/bin/false


In [160]:
help(pd.read_csv)

Help on function read_csv in module pandas.io.parsers.readers:

read_csv(filepath_or_buffer: 'FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str]', *, sep: 'str | None | lib.NoDefault' = <no_default>, delimiter: 'str | None | lib.NoDefault' = None, header: "int | Sequence[int] | None | Literal['infer']" = 'infer', names: 'Sequence[Hashable] | None | lib.NoDefault' = <no_default>, index_col: 'IndexLabel | Literal[False] | None' = None, usecols=None, dtype: 'DtypeArg | None' = None, engine: 'CSVEngine | None' = None, converters=None, true_values=None, false_values=None, skipinitialspace: 'bool' = False, skiprows=None, skipfooter: 'int' = 0, nrows: 'int | None' = None, na_values=None, keep_default_na: 'bool' = True, na_filter: 'bool' = True, verbose: 'bool' = False, skip_blank_lines: 'bool' = True, parse_dates: 'bool | Sequence[Hashable] | None' = None, infer_datetime_format: 'bool | lib.NoDefault' = <no_default>, keep_date_col: 'bool' = False, date_parser=<no_default>, date_format: 'str

In [162]:
%timeit pd.read_csv('taxi.csv')

20.6 ms ± 761 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [164]:
%timeit pd.read_csv('taxi.csv', engine='pyarrow')

4.12 ms ± 63.5 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [165]:
help(pd.read_csv)

Help on function read_csv in module pandas.io.parsers.readers:

read_csv(filepath_or_buffer: 'FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str]', *, sep: 'str | None | lib.NoDefault' = <no_default>, delimiter: 'str | None | lib.NoDefault' = None, header: "int | Sequence[int] | None | Literal['infer']" = 'infer', names: 'Sequence[Hashable] | None | lib.NoDefault' = <no_default>, index_col: 'IndexLabel | Literal[False] | None' = None, usecols=None, dtype: 'DtypeArg | None' = None, engine: 'CSVEngine | None' = None, converters=None, true_values=None, false_values=None, skipinitialspace: 'bool' = False, skiprows=None, skipfooter: 'int' = 0, nrows: 'int | None' = None, na_values=None, keep_default_na: 'bool' = True, na_filter: 'bool' = True, verbose: 'bool' = False, skip_blank_lines: 'bool' = True, parse_dates: 'bool | Sequence[Hashable] | None' = None, infer_datetime_format: 'bool | lib.NoDefault' = <no_default>, keep_date_col: 'bool' = False, date_parser=<no_default>, date_format: 'str