In [1]:
import pandas as pd;
import numpy as np;

## parsing
- In computing, parsing typically refers to the analysis of textual data, often in the form of code or language syntax, to extract relevant information or convert it into a structured representation that can be used by a computer program. 

### parse_dates
- bool, list of Hashable, list of lists or dict of {Hashable : list}, default False

The behavior is as follows:

- A boolean value:

    - True: pandas will try to parse the index as dates; ordinary columns are left unchanged (see the all-`object` dtypes below).
    - False (default): No datetime parsing will be performed.

- list of int or names. e.g. If [1, 2, 3] -> try parsing columns 1, 2, 3 each as a separate date column.

- list of list. e.g. If [[1, 3]] -> combine columns 1 and 3 and parse as a single date column.

- dict, e.g. {'foo' : [1, 3]} -> parse columns 1, 3 as date and call result ‘foo’

In [18]:
# Baseline load: no date options are passed, so pandas does no datetime conversion.
df = pd.read_csv(filepath_or_buffer="Dates_time.csv")
df

Unnamed: 0,month_day_year,day_month_year,date_time,year_month_day,Time
0,4/22/1996,22-Apr-96,Tue Aug 11 09:50:35 1996,6/22/2007,9:44:30
1,4/23/1996,23-Apr-96,Tue May 12 19:50:35 2016,1/9/2017,10:44:30
2,5/14/1996,14-May-96,Mon Oct 14 09:50:35 2017,4/12/1998,11:44:30
3,5/15/1996,15-May-96,Tue Jan 11 09:50:35 2018,7/22/2027,12:44:30
4,5/16/2001,16-May-01,Fri Mar 11 07:30:36 2019,11/15/1945,13:44:30
5,5/17/2002,17-May-02,Tue Aug 11 09:50:35 2020,6/22/1942,14:44:30
6,5/18/2003,18-May-03,Wed Dec 21 09:50:35 2021,1887-06-13,15:44:30
7,5/19/2004,19-May-04,Tue Jan 11 09:50:35 2022,1/25/1912,16:44:30
8,5/20/2005,20-May-05,Sun Jul 10 19:40:25 2023,6/22/2007,17:44:30


In [19]:
# Without parse_dates every column is reported as object (plain strings).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9 entries, 0 to 8
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   month_day_year  9 non-null      object
 1   day_month_year  9 non-null      object
 2   date_time       9 non-null      object
 3   year_month_day  9 non-null      object
 4   Time            9 non-null      object
dtypes: object(5)
memory usage: 488.0+ bytes


In [20]:
# parse_dates=True only targets the index; the regular columns are not converted.
df = pd.read_csv(
    "Dates_time.csv",
    parse_dates=True,
)
df

Unnamed: 0,month_day_year,day_month_year,date_time,year_month_day,Time
0,4/22/1996,22-Apr-96,Tue Aug 11 09:50:35 1996,6/22/2007,9:44:30
1,4/23/1996,23-Apr-96,Tue May 12 19:50:35 2016,1/9/2017,10:44:30
2,5/14/1996,14-May-96,Mon Oct 14 09:50:35 2017,4/12/1998,11:44:30
3,5/15/1996,15-May-96,Tue Jan 11 09:50:35 2018,7/22/2027,12:44:30
4,5/16/2001,16-May-01,Fri Mar 11 07:30:36 2019,11/15/1945,13:44:30
5,5/17/2002,17-May-02,Tue Aug 11 09:50:35 2020,6/22/1942,14:44:30
6,5/18/2003,18-May-03,Wed Dec 21 09:50:35 2021,1887-06-13,15:44:30
7,5/19/2004,19-May-04,Tue Jan 11 09:50:35 2022,1/25/1912,16:44:30
8,5/20/2005,20-May-05,Sun Jul 10 19:40:25 2023,6/22/2007,17:44:30


In [21]:
# Confirms that parse_dates=True converted no column: all five are still object.
df.dtypes

month_day_year    object
day_month_year    object
date_time         object
year_month_day    object
Time              object
dtype: object

In [22]:
# A flat list of positions: each listed column is parsed as its own date column.
df = pd.read_csv(
    "Dates_time.csv",
    parse_dates=[0, 1, 2, 3],
)
df

Unnamed: 0,month_day_year,day_month_year,date_time,year_month_day,Time
0,1996-04-22,1996-04-22,1996-08-11 09:50:35,2007-06-22,9:44:30
1,1996-04-23,1996-04-23,2016-05-12 19:50:35,2017-01-09,10:44:30
2,1996-05-14,1996-05-14,2017-10-14 09:50:35,1998-04-12,11:44:30
3,1996-05-15,1996-05-15,2018-01-11 09:50:35,2027-07-22,12:44:30
4,2001-05-16,2001-05-16,2019-03-11 07:30:36,1945-11-15,13:44:30
5,2002-05-17,2002-05-17,2020-08-11 09:50:35,1942-06-22,14:44:30
6,2003-05-18,2003-05-18,2021-12-21 09:50:35,1887-06-13,15:44:30
7,2004-05-19,2004-05-19,2022-01-11 09:50:35,1912-01-25,16:44:30
8,2005-05-20,2005-05-20,2023-07-10 19:40:25,2007-06-22,17:44:30


In [23]:
# Columns 0-3 are now datetime64[ns]; Time was not listed in parse_dates, so it stays object.
df.dtypes

month_day_year    datetime64[ns]
day_month_year    datetime64[ns]
date_time         datetime64[ns]
year_month_day    datetime64[ns]
Time                      object
dtype: object

In [27]:
# A nested list merges the listed columns (positions 3 and 4) into one datetime column.
df = pd.read_csv(
    "Dates_time.csv",
    parse_dates=[[3, 4]],
)
df

Unnamed: 0,year_month_day_Time,month_day_year,day_month_year,date_time
0,2007-06-22 09:44:30,4/22/1996,22-Apr-96,Tue Aug 11 09:50:35 1996
1,2017-01-09 10:44:30,4/23/1996,23-Apr-96,Tue May 12 19:50:35 2016
2,1998-04-12 11:44:30,5/14/1996,14-May-96,Mon Oct 14 09:50:35 2017
3,2027-07-22 12:44:30,5/15/1996,15-May-96,Tue Jan 11 09:50:35 2018
4,1945-11-15 13:44:30,5/16/2001,16-May-01,Fri Mar 11 07:30:36 2019
5,1942-06-22 14:44:30,5/17/2002,17-May-02,Tue Aug 11 09:50:35 2020
6,1887-06-13 15:44:30,5/18/2003,18-May-03,Wed Dec 21 09:50:35 2021
7,1912-01-25 16:44:30,5/19/2004,19-May-04,Tue Jan 11 09:50:35 2022
8,2007-06-22 17:44:30,5/20/2005,20-May-05,Sun Jul 10 19:40:25 2023


In [28]:
# Only the merged year_month_day_Time column is datetime64[ns]; the rest remain object.
df.dtypes

year_month_day_Time    datetime64[ns]
month_day_year                 object
day_month_year                 object
date_time                      object
dtype: object

In [29]:
# Columns can be selected by label instead of position.
df = pd.read_csv(
    "Dates_time.csv",
    parse_dates=[
        "month_day_year",
        "day_month_year",
        "date_time",
    ],
)
df

Unnamed: 0,month_day_year,day_month_year,date_time,year_month_day,Time
0,1996-04-22,1996-04-22,1996-08-11 09:50:35,6/22/2007,9:44:30
1,1996-04-23,1996-04-23,2016-05-12 19:50:35,1/9/2017,10:44:30
2,1996-05-14,1996-05-14,2017-10-14 09:50:35,4/12/1998,11:44:30
3,1996-05-15,1996-05-15,2018-01-11 09:50:35,7/22/2027,12:44:30
4,2001-05-16,2001-05-16,2019-03-11 07:30:36,11/15/1945,13:44:30
5,2002-05-17,2002-05-17,2020-08-11 09:50:35,6/22/1942,14:44:30
6,2003-05-18,2003-05-18,2021-12-21 09:50:35,1887-06-13,15:44:30
7,2004-05-19,2004-05-19,2022-01-11 09:50:35,1/25/1912,16:44:30
8,2005-05-20,2005-05-20,2023-07-10 19:40:25,6/22/2007,17:44:30


In [30]:
# The three named columns are datetime64[ns]; year_month_day and Time stay object.
df.dtypes

month_day_year    datetime64[ns]
day_month_year    datetime64[ns]
date_time         datetime64[ns]
year_month_day            object
Time                      object
dtype: object

In [31]:
# Dict form: merge columns 0 and 4 and store the result under the key "MDYT".
df = pd.read_csv(
    "Dates_time.csv",
    parse_dates={"MDYT": [0, 4]},
)
df

Unnamed: 0,MDYT,day_month_year,date_time,year_month_day
0,1996-04-22 09:44:30,22-Apr-96,Tue Aug 11 09:50:35 1996,6/22/2007
1,1996-04-23 10:44:30,23-Apr-96,Tue May 12 19:50:35 2016,1/9/2017
2,1996-05-14 11:44:30,14-May-96,Mon Oct 14 09:50:35 2017,4/12/1998
3,1996-05-15 12:44:30,15-May-96,Tue Jan 11 09:50:35 2018,7/22/2027
4,2001-05-16 13:44:30,16-May-01,Fri Mar 11 07:30:36 2019,11/15/1945
5,2002-05-17 14:44:30,17-May-02,Tue Aug 11 09:50:35 2020,6/22/1942
6,2003-05-18 15:44:30,18-May-03,Wed Dec 21 09:50:35 2021,1887-06-13
7,2004-05-19 16:44:30,19-May-04,Tue Jan 11 09:50:35 2022,1/25/1912
8,2005-05-20 17:44:30,20-May-05,Sun Jul 10 19:40:25 2023,6/22/2007


In [33]:
# MDYT is the only datetime64[ns] column; its two source columns no longer appear.
df.dtypes

MDYT              datetime64[ns]
day_month_year            object
date_time                 object
year_month_day            object
dtype: object

In [41]:
# .dt.date keeps only the calendar date of each timestamp; the result is an object-dtype Series.
df['MDYT'].dt.date

0    1996-04-22
1    1996-04-23
2    1996-05-14
3    1996-05-15
4    2001-05-16
5    2002-05-17
6    2003-05-18
7    2004-05-19
8    2005-05-20
Name: MDYT, dtype: object

In [42]:
# .dt.time keeps only the time of day of each timestamp; the result is an object-dtype Series.
df['MDYT'].dt.time

0    09:44:30
1    10:44:30
2    11:44:30
3    12:44:30
4    13:44:30
5    14:44:30
6    15:44:30
7    16:44:30
8    17:44:30
Name: MDYT, dtype: object

### infer_datetime_format
- bool, default False
- Controls whether pandas tries to infer the datetime format of the parsed columns from the data, which can speed up parsing. Deprecated since pandas 2.0.0, where a strict version of this inference is the default.

In [47]:
# Parse all five columns and let pandas infer each column's datetime format.
# NOTE(review): the time-only column comes back with a calendar date attached
# (2023-10-12 in the output below), presumably the date of the run — confirm.
df = pd.read_csv(
    "Dates_time.csv",
    parse_dates=[0, 1, 2, 3, 4],
    infer_datetime_format=True,
)
df

Unnamed: 0,month_day_year,day_month_year,date_time,year_month_day,Time
0,1996-04-22,1996-04-22,1996-08-11 09:50:35,2007-06-22,2023-10-12 09:44:30
1,1996-04-23,1996-04-23,2016-05-12 19:50:35,2017-01-09,2023-10-12 10:44:30
2,1996-05-14,1996-05-14,2017-10-14 09:50:35,1998-04-12,2023-10-12 11:44:30
3,1996-05-15,1996-05-15,2018-01-11 09:50:35,2027-07-22,2023-10-12 12:44:30
4,2001-05-16,2001-05-16,2019-03-11 07:30:36,1945-11-15,2023-10-12 13:44:30
5,2002-05-17,2002-05-17,2020-08-11 09:50:35,1942-06-22,2023-10-12 14:44:30
6,2003-05-18,2003-05-18,2021-12-21 09:50:35,1887-06-13,2023-10-12 15:44:30
7,2004-05-19,2004-05-19,2022-01-11 09:50:35,1912-01-25,2023-10-12 16:44:30
8,2005-05-20,2005-05-20,2023-07-10 19:40:25,2007-06-22,2023-10-12 17:44:30


In [45]:
# Same read without infer_datetime_format, for comparison with the previous cell.
pd.read_csv(
    "Dates_time.csv",
    parse_dates=[0, 1, 2, 3, 4],
)

Unnamed: 0,month_day_year,day_month_year,date_time,year_month_day,Time
0,1996-04-22,1996-04-22,1996-08-11 09:50:35,2007-06-22,2023-10-12 09:44:30
1,1996-04-23,1996-04-23,2016-05-12 19:50:35,2017-01-09,2023-10-12 10:44:30
2,1996-05-14,1996-05-14,2017-10-14 09:50:35,1998-04-12,2023-10-12 11:44:30
3,1996-05-15,1996-05-15,2018-01-11 09:50:35,2027-07-22,2023-10-12 12:44:30
4,2001-05-16,2001-05-16,2019-03-11 07:30:36,1945-11-15,2023-10-12 13:44:30
5,2002-05-17,2002-05-17,2020-08-11 09:50:35,1942-06-22,2023-10-12 14:44:30
6,2003-05-18,2003-05-18,2021-12-21 09:50:35,1887-06-13,2023-10-12 15:44:30
7,2004-05-19,2004-05-19,2022-01-11 09:50:35,1912-01-25,2023-10-12 16:44:30
8,2005-05-20,2005-05-20,2023-07-10 19:40:25,2007-06-22,2023-10-12 17:44:30


### keep_date_col
- bool, default False
- If True and parse_dates specifies combining multiple columns, then keep the original columns.

In [51]:
# Merge columns 0 and 4 into one datetime column, parse every column on its own as
# well, and keep the source columns of the merge (keep_date_col=True).
df = pd.read_csv(
    "Dates_time.csv",
    parse_dates=[[0, 4], 0, 1, 2, 3, 4],
    infer_datetime_format=True,
    keep_date_col=True,
)
df

Unnamed: 0,month_day_year_Time,month_day_year,day_month_year,date_time,year_month_day,Time
0,1996-04-22 09:44:30,1996-04-22,1996-04-22,1996-08-11 09:50:35,2007-06-22,2023-10-12 09:44:30
1,1996-04-23 10:44:30,1996-04-23,1996-04-23,2016-05-12 19:50:35,2017-01-09,2023-10-12 10:44:30
2,1996-05-14 11:44:30,1996-05-14,1996-05-14,2017-10-14 09:50:35,1998-04-12,2023-10-12 11:44:30
3,1996-05-15 12:44:30,1996-05-15,1996-05-15,2018-01-11 09:50:35,2027-07-22,2023-10-12 12:44:30
4,2001-05-16 13:44:30,2001-05-16,2001-05-16,2019-03-11 07:30:36,1945-11-15,2023-10-12 13:44:30
5,2002-05-17 14:44:30,2002-05-17,2002-05-17,2020-08-11 09:50:35,1942-06-22,2023-10-12 14:44:30
6,2003-05-18 15:44:30,2003-05-18,2003-05-18,2021-12-21 09:50:35,1887-06-13,2023-10-12 15:44:30
7,2004-05-19 16:44:30,2004-05-19,2004-05-19,2022-01-11 09:50:35,1912-01-25,2023-10-12 16:44:30
8,2005-05-20 17:44:30,2005-05-20,2005-05-20,2023-07-10 19:40:25,2007-06-22,2023-10-12 17:44:30


In [52]:
# The merged column plus all five original columns are datetime64[ns].
df.dtypes

month_day_year_Time    datetime64[ns]
month_day_year         datetime64[ns]
day_month_year         datetime64[ns]
date_time              datetime64[ns]
year_month_day         datetime64[ns]
Time                   datetime64[ns]
dtype: object

In [54]:
# Dict form with keep_date_col=True: build "DMYT" from columns 1 and 4 while
# keeping both source columns in the frame.
df = pd.read_csv(
    "Dates_time.csv",
    parse_dates={"DMYT": [1, 4]},
    keep_date_col=True,
)
df

Unnamed: 0,DMYT,month_day_year,day_month_year,date_time,year_month_day,Time
0,1996-04-22 09:44:30,4/22/1996,22-Apr-96,Tue Aug 11 09:50:35 1996,6/22/2007,9:44:30
1,1996-04-23 10:44:30,4/23/1996,23-Apr-96,Tue May 12 19:50:35 2016,1/9/2017,10:44:30
2,1996-05-14 11:44:30,5/14/1996,14-May-96,Mon Oct 14 09:50:35 2017,4/12/1998,11:44:30
3,1996-05-15 12:44:30,5/15/1996,15-May-96,Tue Jan 11 09:50:35 2018,7/22/2027,12:44:30
4,2001-05-16 13:44:30,5/16/2001,16-May-01,Fri Mar 11 07:30:36 2019,11/15/1945,13:44:30
5,2002-05-17 14:44:30,5/17/2002,17-May-02,Tue Aug 11 09:50:35 2020,6/22/1942,14:44:30
6,2003-05-18 15:44:30,5/18/2003,18-May-03,Wed Dec 21 09:50:35 2021,1887-06-13,15:44:30
7,2004-05-19 16:44:30,5/19/2004,19-May-04,Tue Jan 11 09:50:35 2022,1/25/1912,16:44:30
8,2005-05-20 17:44:30,5/20/2005,20-May-05,Sun Jul 10 19:40:25 2023,6/22/2007,17:44:30


In [55]:
# Only DMYT is datetime64[ns]; the kept source columns are still object.
df.dtypes

DMYT              datetime64[ns]
month_day_year            object
day_month_year            object
date_time                 object
year_month_day            object
Time                      object
dtype: object

In [64]:
# NOTE(review): this repeats the preceding DMYT / keep_date_col read with identical
# arguments — candidate for removal.
df = pd.read_csv(
    "Dates_time.csv",
    parse_dates={"DMYT": [1, 4]},
    keep_date_col=True,
)
df

Unnamed: 0,DMYT,month_day_year,day_month_year,date_time,year_month_day,Time
0,1996-04-22 09:44:30,4/22/1996,22-Apr-96,Tue Aug 11 09:50:35 1996,6/22/2007,9:44:30
1,1996-04-23 10:44:30,4/23/1996,23-Apr-96,Tue May 12 19:50:35 2016,1/9/2017,10:44:30
2,1996-05-14 11:44:30,5/14/1996,14-May-96,Mon Oct 14 09:50:35 2017,4/12/1998,11:44:30
3,1996-05-15 12:44:30,5/15/1996,15-May-96,Tue Jan 11 09:50:35 2018,7/22/2027,12:44:30
4,2001-05-16 13:44:30,5/16/2001,16-May-01,Fri Mar 11 07:30:36 2019,11/15/1945,13:44:30
5,2002-05-17 14:44:30,5/17/2002,17-May-02,Tue Aug 11 09:50:35 2020,6/22/1942,14:44:30
6,2003-05-18 15:44:30,5/18/2003,18-May-03,Wed Dec 21 09:50:35 2021,1887-06-13,15:44:30
7,2004-05-19 16:44:30,5/19/2004,19-May-04,Tue Jan 11 09:50:35 2022,1/25/1912,16:44:30
8,2005-05-20 17:44:30,5/20/2005,20-May-05,Sun Jul 10 19:40:25 2023,6/22/2007,17:44:30


### date_parser
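- Callable, optional
- Function used to convert the string columns selected by parse_dates into datetime values. Deprecated since pandas 2.0.0 in favour of date_format.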

In [31]:
# Reference run with the built-in parser before a custom date_parser is introduced.
df = pd.read_csv(
    "Dates_time.csv",
    parse_dates=[0, 1, 2, 3, 4],
)
df

Unnamed: 0,month_day_year,day_month_year,date_time,year_month_day,Time
0,1996-04-22,1996-04-22,1996-08-11 09:50:35,2007-06-22,2023-10-13 09:44:30
1,1996-04-23,1996-04-23,2016-05-12 19:50:35,2017-01-09,2023-10-13 10:44:30
2,1996-05-14,1996-05-14,2017-10-14 09:50:35,1998-04-12,2023-10-13 11:44:30
3,1996-05-15,1996-05-15,2018-01-11 09:50:35,2027-07-22,2023-10-13 12:44:30
4,2001-05-16,2001-05-16,2019-03-11 07:30:36,1945-11-15,2023-10-13 13:44:30
5,2002-05-17,2002-05-17,2020-08-11 09:50:35,1942-06-22,2023-10-13 14:44:30
6,2003-05-18,2003-05-18,2021-12-21 09:50:35,1887-06-13,2023-10-13 15:44:30
7,2004-05-19,2004-05-19,2022-01-11 09:50:35,1912-01-25,2023-10-13 16:44:30
8,2005-05-20,2005-05-20,2023-07-10 19:40:25,2007-06-22,2023-10-13 17:44:30


In [32]:
# All five columns, including Time, are datetime64[ns] after the default parsing.
df.dtypes

month_day_year    datetime64[ns]
day_month_year    datetime64[ns]
date_time         datetime64[ns]
year_month_day    datetime64[ns]
Time              datetime64[ns]
dtype: object

In [20]:
# A custom parser lets read_csv handle a date layout chosen by the caller.
# Here the target is the month/day/year text in column 0 (e.g. "4/22/1996").

from datetime import datetime


def custom_date_parser(x):
    """Convert one "MM/DD/YYYY" string into a ``datetime``.

    Raises ValueError (from ``strptime``) when ``x`` does not match that layout.
    """
    return datetime.strptime(x, "%m/%d/%Y")


# Only column 0 is parsed; the other columns are left as read.
df = pd.read_csv(
    "Dates_time.csv",
    parse_dates=[0],
    date_parser=custom_date_parser,
)
df

Unnamed: 0,month_day_year,day_month_year,date_time,year_month_day,Time
0,1996-04-22,22-Apr-96,Tue Aug 11 09:50:35 1996,6/22/2007,9:44:30
1,1996-04-23,23-Apr-96,Tue May 12 19:50:35 2016,1/9/2017,10:44:30
2,1996-05-14,14-May-96,Mon Oct 14 09:50:35 2017,4/12/1998,11:44:30
3,1996-05-15,15-May-96,Tue Jan 11 09:50:35 2018,7/22/2027,12:44:30
4,2001-05-16,16-May-01,Fri Mar 11 07:30:36 2019,11/15/1945,13:44:30
5,2002-05-17,17-May-02,Tue Aug 11 09:50:35 2020,6/22/1942,14:44:30
6,2003-05-18,18-May-03,Wed Dec 21 09:50:35 2021,1887-06-13,15:44:30
7,2004-05-19,19-May-04,Tue Jan 11 09:50:35 2022,1/25/1912,16:44:30
8,2005-05-20,20-May-05,Sun Jul 10 19:40:25 2023,6/22/2007,17:44:30


In [21]:
# Only month_day_year (the column handed to the custom parser) is datetime64[ns].
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9 entries, 0 to 8
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   month_day_year  9 non-null      datetime64[ns]
 1   day_month_year  9 non-null      object        
 2   date_time       9 non-null      object        
 3   year_month_day  9 non-null      object        
 4   Time            9 non-null      object        
dtypes: datetime64[ns](1), object(4)
memory usage: 488.0+ bytes


### date_format
- str or dict of column -> format, optional
- New in version 2.0.0.

In [26]:
# NOTE(review): disabled cell. date_format is new in pandas 2.0.0 (see the note above), and
# column 0 holds month/day/year text such as "4/22/1996", so "%d-%m-%Y" would presumably
# have to become "%m/%d/%Y" before this can be re-enabled — confirm.
# df = pd.read_csv("Dates_time.csv", parse_dates=[0], date_format="%d-%m-%Y");
# df

### dayfirst
- bool, default False
- Treat ambiguous dates as day-first (DD/MM), the international and European convention.

In [28]:
# dayfirst=True asks pandas to read ambiguous dates as DD/MM.
# NOTE(review): column 0 holds month/day/year text (e.g. "4/22/1996"), so a
# day-first reading cannot apply here and the parsed values in the output match
# the default — confirm this is the intended demonstration.
df = pd.read_csv(
    "Dates_time.csv",
    parse_dates=[0],
    dayfirst=True,
)
df

  df = pd.read_csv("Dates_time.csv", parse_dates=[0], dayfirst=True);


Unnamed: 0,month_day_year,day_month_year,date_time,year_month_day,Time
0,1996-04-22,22-Apr-96,Tue Aug 11 09:50:35 1996,6/22/2007,9:44:30
1,1996-04-23,23-Apr-96,Tue May 12 19:50:35 2016,1/9/2017,10:44:30
2,1996-05-14,14-May-96,Mon Oct 14 09:50:35 2017,4/12/1998,11:44:30
3,1996-05-15,15-May-96,Tue Jan 11 09:50:35 2018,7/22/2027,12:44:30
4,2001-05-16,16-May-01,Fri Mar 11 07:30:36 2019,11/15/1945,13:44:30
5,2002-05-17,17-May-02,Tue Aug 11 09:50:35 2020,6/22/1942,14:44:30
6,2003-05-18,18-May-03,Wed Dec 21 09:50:35 2021,1887-06-13,15:44:30
7,2004-05-19,19-May-04,Tue Jan 11 09:50:35 2022,1/25/1912,16:44:30
8,2005-05-20,20-May-05,Sun Jul 10 19:40:25 2023,6/22/2007,17:44:30


In [30]:
def custom_date_parser(date):
    """Hand the value to ``pd.to_datetime`` with day-first interpretation."""
    return pd.to_datetime(date, dayfirst=True)


# Column 0 is parsed through the wrapper above instead of the built-in parser.
df = pd.read_csv(
    "Dates_time.csv",
    parse_dates=[0],
    date_parser=custom_date_parser,
)
df

  custom_date_parser = lambda date: pd.to_datetime(date, dayfirst=True);


Unnamed: 0,month_day_year,day_month_year,date_time,year_month_day,Time
0,1996-04-22,22-Apr-96,Tue Aug 11 09:50:35 1996,6/22/2007,9:44:30
1,1996-04-23,23-Apr-96,Tue May 12 19:50:35 2016,1/9/2017,10:44:30
2,1996-05-14,14-May-96,Mon Oct 14 09:50:35 2017,4/12/1998,11:44:30
3,1996-05-15,15-May-96,Tue Jan 11 09:50:35 2018,7/22/2027,12:44:30
4,2001-05-16,16-May-01,Fri Mar 11 07:30:36 2019,11/15/1945,13:44:30
5,2002-05-17,17-May-02,Tue Aug 11 09:50:35 2020,6/22/1942,14:44:30
6,2003-05-18,18-May-03,Wed Dec 21 09:50:35 2021,1887-06-13,15:44:30
7,2004-05-19,19-May-04,Tue Jan 11 09:50:35 2022,1/25/1912,16:44:30
8,2005-05-20,20-May-05,Sun Jul 10 19:40:25 2023,6/22/2007,17:44:30


In [36]:
def custom_date_parser(x):
    """Parse one "MM/DD/YYYY" string with ``datetime.strptime``."""
    return datetime.strptime(x, "%m/%d/%Y")


# Explicit parser and dayfirst=True passed together for column 0.
df = pd.read_csv(
    "Dates_time.csv",
    parse_dates=[0],
    date_parser=custom_date_parser,
    dayfirst=True,
)
df

Unnamed: 0,month_day_year,day_month_year,date_time,year_month_day,Time
0,1996-04-22,22-Apr-96,Tue Aug 11 09:50:35 1996,6/22/2007,9:44:30
1,1996-04-23,23-Apr-96,Tue May 12 19:50:35 2016,1/9/2017,10:44:30
2,1996-05-14,14-May-96,Mon Oct 14 09:50:35 2017,4/12/1998,11:44:30
3,1996-05-15,15-May-96,Tue Jan 11 09:50:35 2018,7/22/2027,12:44:30
4,2001-05-16,16-May-01,Fri Mar 11 07:30:36 2019,11/15/1945,13:44:30
5,2002-05-17,17-May-02,Tue Aug 11 09:50:35 2020,6/22/1942,14:44:30
6,2003-05-18,18-May-03,Wed Dec 21 09:50:35 2021,1887-06-13,15:44:30
7,2004-05-19,19-May-04,Tue Jan 11 09:50:35 2022,1/25/1912,16:44:30
8,2005-05-20,20-May-05,Sun Jul 10 19:40:25 2023,6/22/2007,17:44:30


In [39]:
def custom_date_parser(x):
    """Parse one "MM/DD/YYYY" string with ``datetime.strptime``."""
    return datetime.strptime(x, "%m/%d/%Y")


# Column 3 this time, with the parser, dayfirst and infer_datetime_format all supplied.
# NOTE(review): row 6 of this column is displayed as "1887-06-13" in the raw data,
# which does not look like "%m/%d/%Y" — confirm how that value ends up parsed.
df = pd.read_csv(
    "Dates_time.csv",
    parse_dates=[3],
    date_parser=custom_date_parser,
    dayfirst=True,
    infer_datetime_format=True,
)
df

Unnamed: 0,month_day_year,day_month_year,date_time,year_month_day,Time
0,4/22/1996,22-Apr-96,Tue Aug 11 09:50:35 1996,2007-06-22,9:44:30
1,4/23/1996,23-Apr-96,Tue May 12 19:50:35 2016,2017-01-09,10:44:30
2,5/14/1996,14-May-96,Mon Oct 14 09:50:35 2017,1998-04-12,11:44:30
3,5/15/1996,15-May-96,Tue Jan 11 09:50:35 2018,2027-07-22,12:44:30
4,5/16/2001,16-May-01,Fri Mar 11 07:30:36 2019,1945-11-15,13:44:30
5,5/17/2002,17-May-02,Tue Aug 11 09:50:35 2020,1942-06-22,14:44:30
6,5/18/2003,18-May-03,Wed Dec 21 09:50:35 2021,1887-06-13,15:44:30
7,5/19/2004,19-May-04,Tue Jan 11 09:50:35 2022,1912-01-25,16:44:30
8,5/20/2005,20-May-05,Sun Jul 10 19:40:25 2023,2007-06-22,17:44:30


### iterator
- bool, default False
- Return TextFileReader object for iteration or getting chunks with get_chunk().

In [43]:
# iterator=True makes read_csv hand back a TextFileReader instead of a DataFrame.
# NOTE(review): none of the visible cells closes this reader — consider file.close()
# once the chunks have been read.
file = pd.read_csv(
    "Employee.csv",
    iterator=True,
)
file

<pandas.io.parsers.readers.TextFileReader at 0x25c9728dfc0>

In [44]:
# NOTE(review): no size is passed and the reader was opened without chunksize, so this
# presumably returns every remaining row (the displayed frame runs to index 97) — confirm.
df = file.get_chunk()

In [45]:
# Show the rows pulled from the reader by get_chunk().
df

Unnamed: 0,Education,JoiningYear,City,PaymentTier,Age,Gender,EverBenched,ExperienceInCurrentDomain,LeaveOrNot
0,Bachelors,2017,",",3,34,Male,No,0,0.0
1,Bachelors,2013,Pune,1,28,Female,,3,1.0
2,Null,2014,,3,38,,No,2,0.0
3,Masters,2016,Bangalore,3,27,Male,No,5,1.0
4,Masters,2017,Pune,3,24,,Yes,2,1.0
...,...,...,...,...,...,...,...,...,...
94,Bachelors,2014,Bangalore,3,39,Male,No,3,1.0
95,PHD,2018,Bangalore,3,26,Male,No,4,1.0
96,Bachelors,2016,Bangalore,3,31,Female,No,0,0.0
97,Bachelors,2014,Bangalore,3,27,Male,No,5,1.0


In [48]:
# Open the reader in a `with` block so the underlying file handle is closed as
# soon as the chunk has been pulled, instead of staying open for the session.
with pd.read_csv("Employee.csv", iterator=True) as reader:
    # Ask for the first 20 data rows only.
    df = reader.get_chunk(20)
df

Unnamed: 0,Education,JoiningYear,City,PaymentTier,Age,Gender,EverBenched,ExperienceInCurrentDomain,LeaveOrNot
0,Bachelors,2017,",",3,34,Male,No,0,0.0
1,Bachelors,2013,Pune,1,28,Female,,3,1.0
2,Null,2014,,3,38,,No,2,0.0
3,Masters,2016,Bangalore,3,27,Male,No,5,1.0
4,Masters,2017,Pune,3,24,,Yes,2,1.0
5,Bachelors,2016,Bangalore,3,22,Male,No,0,
6,Bachelors,2015,,3,38,Male,No,0,0.0
7,,2016,Bangalore,3,34,Female,No,2,1.0
8,Bachelors,2016,Pune,3,23,,No,1,0.0
9,Masters,2017,New Delhi,2,37,Male,No,2,0.0


### chunksize
- int, optional
- Number of lines to read from the file per chunk.

In [49]:
# chunksize=20 also yields a TextFileReader; each chunk it produces holds up to 20 rows.
# NOTE(review): none of the visible cells closes this reader — consider file.close()
# once the chunks have been read.
file = pd.read_csv(
    "Employee.csv",
    chunksize=20,
)
file

<pandas.io.parsers.readers.TextFileReader at 0x25c9728dcf0>

In [50]:
# No size argument: the reader falls back to its chunksize (20) for the first chunk.
df = file.get_chunk();

In [51]:
# Show the first chunk.
df

Unnamed: 0,Education,JoiningYear,City,PaymentTier,Age,Gender,EverBenched,ExperienceInCurrentDomain,LeaveOrNot
0,Bachelors,2017,",",3,34,Male,No,0,0.0
1,Bachelors,2013,Pune,1,28,Female,,3,1.0
2,Null,2014,,3,38,,No,2,0.0
3,Masters,2016,Bangalore,3,27,Male,No,5,1.0
4,Masters,2017,Pune,3,24,,Yes,2,1.0
5,Bachelors,2016,Bangalore,3,22,Male,No,0,
6,Bachelors,2015,,3,38,Male,No,0,0.0
7,,2016,Bangalore,3,34,Female,No,2,1.0
8,Bachelors,2016,Pune,3,23,,No,1,0.0
9,Masters,2017,New Delhi,2,37,Male,No,2,0.0


In [52]:
# Second call continues where the first chunk stopped (index starts at 20 in the output).
df = file.get_chunk()
df

Unnamed: 0,Education,JoiningYear,City,PaymentTier,Age,Gender,EverBenched,ExperienceInCurrentDomain,LeaveOrNot
20,Bachelors,2012,Bangalore,3,37,Male,No,0,0
21,Masters,2017,New Delhi,2,28,Male,No,4,0
22,Bachelors,2017,New Delhi,2,36,Male,No,3,0
23,Bachelors,2015,Bangalore,3,27,Male,Yes,5,0
24,Bachelors,2017,Bangalore,3,29,Male,No,4,0
25,Bachelors,2013,Bangalore,3,22,Female,Yes,0,0
26,Bachelors,2016,Bangalore,3,37,Male,No,2,0
27,Bachelors,2015,Bangalore,3,23,Male,No,1,0
28,Bachelors,2013,Pune,2,31,Female,No,2,1
29,Masters,2017,New Delhi,2,30,Female,No,2,0


In [53]:
# Third call: the next chunk (index starts at 40 in the output).
df = file.get_chunk()
df

Unnamed: 0,Education,JoiningYear,City,PaymentTier,Age,Gender,EverBenched,ExperienceInCurrentDomain,LeaveOrNot
40,Bachelors,2015,Bangalore,3,36,Male,No,1,0
41,Masters,2017,New Delhi,2,23,Male,No,1,0
42,Bachelors,2013,Bangalore,3,30,Male,No,3,1
43,Bachelors,2014,Bangalore,3,36,Male,No,0,0
44,Bachelors,2015,Pune,3,39,Male,No,4,0
45,Bachelors,2014,Bangalore,3,23,Female,No,1,0
46,PHD,2012,New Delhi,3,27,Male,No,5,0
47,Masters,2013,New Delhi,3,35,Male,No,2,0
48,Masters,2017,New Delhi,2,34,Male,No,0,0
49,Bachelors,2013,Bangalore,3,30,Female,No,1,0


### thousands
- str (length 1), optional
- Character acting as the thousands separator in numerical values.

In [18]:
# Baseline read: only hire_date (position 2) is parsed; Salary keeps its "-" separators.
df = pd.read_csv(
    "Numbers.txt",
    parse_dates=[2],
)
df

Unnamed: 0,Name,Age,hire_date,Salary
0,Akram,22,2020-04-22,50-000-00
1,Usman,20,2004-05-23,2-340-000-0
2,Sajid,21,2006-08-15,34-000-000-0
3,Kaleem,23,2010-01-29,23-000-000-0
4,Mudassir,19,2019-12-22,2-000-000


In [19]:
# Salary is still object here because its "-" grouping characters were not declared.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   Name       5 non-null      object        
 1   Age        5 non-null      int64         
 2   hire_date  5 non-null      datetime64[ns]
 3   Salary     5 non-null      object        
dtypes: datetime64[ns](1), int64(1), object(2)
memory usage: 288.0+ bytes


In [20]:
# Declare "-" as the thousands separator so it is stripped from numeric fields.
df = pd.read_csv(
    "Numbers.txt",
    parse_dates=[2],
    thousands="-",
)
df

Unnamed: 0,Name,Age,hire_date,Salary
0,Akram,22,2020-04-22,5000000
1,Usman,20,2004-05-23,23400000
2,Sajid,21,2006-08-15,340000000
3,Kaleem,23,2010-01-29,230000000
4,Mudassir,19,2019-12-22,2000000


In [21]:
# With thousands='-' the Salary column is read as int64, and hire_date is still parsed as a date.
df.dtypes

Name                 object
Age                   int64
hire_date    datetime64[ns]
Salary                int64
dtype: object

### decimal
- str (length 1), default ‘.’
- Character to recognize as decimal point (e.g., use ‘,’ for European data).

In [22]:
# Treat an apostrophe as the decimal mark.
# NOTE(review): a Bonus column shows up from this cell on, so Numbers.txt was
# presumably edited between runs — confirm the file version this notebook expects.
df = pd.read_csv(
    "Numbers.txt",
    parse_dates=[2],
    decimal="'",
)
df

Unnamed: 0,Name,Age,hire_date,Salary,Bonus
0,Akram,22,2020-04-22,50-000-00,2.3
1,Usman,20,2004-05-23,2-340-000-0,5.1
2,Sajid,21,2006-08-15,34-000-000-0,6.1
3,Kaleem,23,2010-01-29,23-000-000-0,2.5
4,Mudassir,19,2019-12-22,2-000-000,6.0


In [25]:
# Combine both options: apostrophe as decimal mark, "-" as thousands separator.
df = pd.read_csv(
    "Numbers.txt",
    parse_dates=[2],
    decimal="'",
    thousands="-",
)
df

Unnamed: 0,Name,Age,hire_date,Salary,Bonus
0,Akram,22,2020-04-22,5000000,2.3
1,Usman,20,2004-05-23,23400000,5.1
2,Sajid,21,2006-08-15,340000000,6.1
3,Kaleem,23,2010-01-29,230000000,2.5
4,Mudassir,19,2019-12-22,2000000,6.0


### lineterminator
- str (length 1), optional
- Character used to denote a line break. Only valid with C parser.

In [26]:
from io import StringIO

In [37]:
# Records in this inline sample are separated by "!" rather than by a newline.
data = """Name,age,gpa! Abbas,20,3.0! Kausar,22,3.5! Usman,21,3.6"""

# engine="c" is requested explicitly (see the note above: lineterminator is only
# valid with the C parser). Each record starts with a blank after the "!", so
# skipinitialspace=True is needed to keep that blank out of the Name values
# (" Abbas" instead of "Abbas").
df = pd.read_csv(
    StringIO(data),
    lineterminator="!",
    engine="c",
    skipinitialspace=True,
)
df

Unnamed: 0,Name,age,gpa
0,Abbas,20,3.0
1,Kausar,22,3.5
2,Usman,21,3.6


### quotechar
- str (length 1), optional
- Character used to denote the start and end of a quoted item. Quoted items can include the delimiter and it will be ignored.

In [63]:
# Same "!"-separated sample, but the first name is wrapped in "/" characters.
data = """Name,age,gpa!/Muhammad Abbas/,20,3.0! Kausar,22,3.5! Usman,21,3.6"""

# quotechar="/" makes the parser treat /.../ as a quoted field and drop the slashes.
df = pd.read_csv(
    StringIO(data),
    lineterminator="!",
    quotechar="/",
)
df

Unnamed: 0,Name,age,gpa
0,Muhammad Abbas,20,3.0
1,Kausar,22,3.5
2,Usman,21,3.6


### quoting
- {0 or csv.QUOTE_MINIMAL, 1 or csv.QUOTE_ALL, 2 or csv.QUOTE_NONNUMERIC, 3 or csv.QUOTE_NONE}, default csv.QUOTE_MINIMAL
- Controls how field quoting is interpreted, using the `csv.QUOTE_*` constants.

In [65]:
import csv

In [64]:
# Two names are wrapped in "/" this time; no quoting argument is passed.
data = """Name,age,gpa!/Muhammad Abbas/,20,3.0! /Kausar/,22,3.5! Usman,21,3.6"""

# NOTE(review): the second name keeps its slashes in the output, presumably because
# the blank before "/Kausar/" means the field does not start with the quote
# character — confirm.
df = pd.read_csv(
    StringIO(data),
    lineterminator="!",
    quotechar="/",
)
df

Unnamed: 0,Name,age,gpa
0,Muhammad Abbas,20,3.0
1,/Kausar/,22,3.5
2,Usman,21,3.6


In [69]:
# Sample with two names in ordinary double quotes.
data = """Name,age,gpa!"Muhammad Abbas",20,3.0! "Kausar",22,3.5! Usman,21,3.6"""

# Read with csv.QUOTE_ALL and '"' as the quote character.
df = pd.read_csv(
    StringIO(data),
    lineterminator="!",
    quotechar='"',
    quoting=csv.QUOTE_ALL,
)
df

Unnamed: 0,Name,age,gpa
0,Muhammad Abbas,20,3.0
1,"""Kausar""",22,3.5
2,Usman,21,3.6


In [70]:
# Same double-quoted sample as before.
data = """Name,age,gpa!"Muhammad Abbas",20,3.0! "Kausar",22,3.5! Usman,21,3.6"""

# Read with csv.QUOTE_NONNUMERIC; in the output the age column is shown as floats.
df = pd.read_csv(
    StringIO(data),
    lineterminator="!",
    quotechar='"',
    quoting=csv.QUOTE_NONNUMERIC,
)
df

Unnamed: 0,Name,age,gpa
0,Muhammad Abbas,20.0,3.0
1,"""Kausar""",22.0,3.5
2,Usman,21.0,3.6


In [71]:
# Same double-quoted sample as before.
data = """Name,age,gpa!"Muhammad Abbas",20,3.0! "Kausar",22,3.5! Usman,21,3.6"""

# csv.QUOTE_NONE switches quote handling off, so the '"' characters stay in the values.
df = pd.read_csv(
    StringIO(data),
    lineterminator="!",
    quotechar='"',
    quoting=csv.QUOTE_NONE,
)
df

Unnamed: 0,Name,age,gpa
0,"""Muhammad Abbas""",20,3.0
1,"""Kausar""",22,3.5
2,Usman,21,3.6


### doublequote
- bool, default True
- When quotechar is specified and quoting is not QUOTE_NONE, indicate whether or not to interpret two consecutive quotechar elements INSIDE a field as a single quotechar element.

In [79]:
# The first name contains doubled quote characters inside a quoted field.
data = 'Name,age,gpa!"Muhammad Abbas""khan""",20,3.0! "Kausar",22,3.5! Usman,21,3.6'

# doublequote=True: each "" pair inside the quoted field collapses to a single ".
df = pd.read_csv(
    StringIO(data),
    lineterminator="!",
    quotechar='"',
    quoting=csv.QUOTE_ALL,
    doublequote=True,
)
df

Unnamed: 0,Name,age,gpa
0,"Muhammad Abbas""khan""",20,3.0
1,"""Kausar""",22,3.5
2,Usman,21,3.6


In [80]:
# Same sample with doubled quote characters inside the first name.
data = 'Name,age,gpa!"Muhammad Abbas""khan""",20,3.0! "Kausar",22,3.5! Usman,21,3.6'

# doublequote=False: the "" pairs are no longer collapsed, so extra quote characters
# remain in the parsed name (compare the output with the doublequote=True run).
df = pd.read_csv(
    StringIO(data),
    lineterminator="!",
    quotechar='"',
    quoting=csv.QUOTE_ALL,
    doublequote=False,
)
df

Unnamed: 0,Name,age,gpa
0,"Muhammad Abbas""khan""""""",20,3.0
1,"""Kausar""",22,3.5
2,Usman,21,3.6


### escapechar 
- str (length 1), optional
- Character used to escape other characters.

In [100]:
# The first name holds a quote character that is escaped with "|".
data = 'Name,age,gpa!"Muhammad|" Abbas",20,3.0! Kausar,22,3.5! Usman,21,3.6'

# escapechar="|" lets the '"' that follows it be read as data rather than as the
# end of the quoted field.
df = pd.read_csv(
    StringIO(data),
    lineterminator="!",
    quoting=csv.QUOTE_ALL,
    quotechar='"',
    escapechar="|",
)
df

Unnamed: 0,Name,age,gpa
0,"Muhammad"" Abbas",20,3.0
1,Kausar,22,3.5
2,Usman,21,3.6


### comment
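- str (length 1), optional
- Character indicating that the remainder of the line should not be parsed; a line that starts with it is skipped entirely.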

In [107]:
# The last record is preceded by "#", so it sits in the commented-out tail of the
# second data line.
data = 'Name,age,gpa!"Muhammad|" Abbas|"",20,3.0! Kausar,22,3.5 #Usman,21,3.6'

# comment="#" drops everything from "#" to the end of that line, and
# skipinitialspace=True discards the blank that follows each "!".
df = pd.read_csv(
    StringIO(data),
    lineterminator="!",
    quoting=csv.QUOTE_ALL,
    quotechar='"',
    escapechar="|",
    skipinitialspace=True,
    comment="#",
)
df

Unnamed: 0,Name,age,gpa
0,"Muhammad"" Abbas""",20,3.0
1,Kausar,22,3.5
