In [2]:
import pandas as pd

my_dict = { 'name' : ["a", "b", "c", "d", "e","f", "g"],
                   'age' : [20,27, 35, 55, 18, 21, 35],
                   'designation': ["VP", "CEO", "CFO", "VP", "VP", "CEO", "MD"]}
df = pd.DataFrame(my_dict)

In [3]:
df

Unnamed: 0,name,age,designation
0,a,20,VP
1,b,27,CEO
2,c,35,CFO
3,d,55,VP
4,e,18,VP
5,f,21,CEO
6,g,35,MD


# Persisting the DataFrame into a CSV file

In [5]:
df.to_csv('data/csv_example.csv')
# see new file appear!

In [8]:
# Let’s go ahead and load the CSV file and create a new DataFrame out of it
df_csv = pd.read_csv('data/csv_example.csv')
df_csv # notice how the index (row names) generated twice

Unnamed: 0.1,Unnamed: 0,name,age,designation
0,0,a,20,VP
1,1,b,27,CEO
2,2,c,35,CFO
3,3,d,55,VP
4,4,e,18,VP
5,5,f,21,CEO
6,6,g,35,MD


In [9]:
df.to_csv('data/csv_example.csv', index=False)
# adding the index=False prevents this as DataFrame automatically adds the index 

In [10]:
df_csv = pd.read_csv('data/csv_example.csv')
df_csv # all better

Unnamed: 0,name,age,designation
0,a,20,VP
1,b,27,CEO
2,c,35,CFO
3,d,55,VP
4,e,18,VP
5,f,21,CEO
6,g,35,MD


# Playing with Column Header

As we’ve seen that the first row is always considered as column headers, however, it’s possible to have more than one row as column headers by specifying a parameter called header=<integer> in read_csv(...) function.
    
By default, the value is specified as ‘0’, which means that the top row will be considered as header.

In [11]:
df_csv = pd.read_csv('data/csv_example.csv', header = 0)
df_csv

Unnamed: 0,name,age,designation
0,a,20,VP
1,b,27,CEO
2,c,35,CFO
3,d,55,VP
4,e,18,VP
5,f,21,CEO
6,g,35,MD


In [12]:
df_csv = pd.read_csv('data/csv_example.csv', header=[0,1,2])
df_csv

Unnamed: 0_level_0,name,age,designation
Unnamed: 0_level_1,a,20,VP
Unnamed: 0_level_2,b,27,CEO
0,c,35,CFO
1,d,55,VP
2,e,18,VP
3,f,21,CEO
4,g,35,MD


In [13]:
df_csv = pd.read_csv('data/csv_example.csv', header=5)
df_csv

Unnamed: 0,e,18,VP
0,f,21,CEO
1,g,35,MD


In [40]:
df_csv = pd.read_csv('csv_example', header=[1,2,5])
df_csv # even though the headers skip rows, the 'data' doesn't begin after the header is done i.e. row 6/f and the 3 & 4 rows are omitted

Unnamed: 0_level_0,a,20,VP
Unnamed: 0_level_1,b,27,CEO
Unnamed: 0_level_2,e,18,VP
0,f,21,CEO
1,g,35,MD


# Customizing Column Names


In [14]:
df_csv = pd.read_csv('data/csv_example.csv', names=['a', 'b', 'c'])
df_csv

Unnamed: 0,a,b,c
0,name,age,designation
1,a,20,VP
2,b,27,CEO
3,c,35,CFO
4,d,55,VP
5,e,18,VP
6,f,21,CEO
7,g,35,MD


However, even though we are successful in adding our own header, the top row still displays header which is a non desired one.

This can be avoided by using the header parameter in read_csv(…)to skip the row depicting the header. In this particular case, we know that first row, i.e. row 0 is header so we can skip it as

In [42]:
df_csv = pd.read_csv('data/csv_example.csv', names=['a', 'b', 'c'], header=1)
df_csv

Unnamed: 0,a,b,c
0,b,27,CEO
1,c,35,CFO
2,d,55,VP
3,e,18,VP
4,f,21,CEO
5,g,35,MD


In [43]:
df.to_csv('data/csv_example.csv', index=False, header = False)
df_csv # another way to skip the header row preset

Unnamed: 0,a,b,c
0,b,27,CEO
1,c,35,CFO
2,d,55,VP
3,e,18,VP
4,f,21,CEO
5,g,35,MD


In [44]:
df_csv = pd.read_csv('data/csv_example.csv', names=['NAME', 'AGE', 'DESIGNATION'])
df_csv

Unnamed: 0,NAME,AGE,DESIGNATION
0,a,20,VP
1,b,27,CEO
2,c,35,CFO
3,d,55,VP
4,e,18,VP
5,f,21,CEO
6,g,35,MD


# CSV to (Anything) Separated Value

## : COLON separated

In [45]:
df.to_csv('data/csv_example.csv', index=False, sep=":")

In [51]:
df_csv = pd.read_csv('data/csv_example.csv', sep=":")
df_csv

Unnamed: 0,name,age,designation
0,a,20,VP
1,b,27,CEO
2,c,35,CFO
3,d,55,VP
4,e,18,VP
5,f,21,CEO
6,g,35,MD


# Setting the Row Index


In [52]:
df_csv.set_index('name')
df_csv

Unnamed: 0,name,age,designation
0,a,20,VP
1,b,27,CEO
2,c,35,CFO
3,d,55,VP
4,e,18,VP
5,f,21,CEO
6,g,35,MD


In [53]:
df_csv = pd.read_csv('data/csv_example.csv', sep=":", index_col=1)
df_csv

Unnamed: 0_level_0,name,designation
age,Unnamed: 1_level_1,Unnamed: 2_level_1
20,a,VP
27,b,CEO
35,c,CFO
55,d,VP
18,e,VP
21,f,CEO
35,g,MD


In [54]:
df_csv = pd.read_csv('data/csv_example.csv', sep=":", index_col=[0,2])
df_csv # providing more than one row index naming acts the same way as columns

Unnamed: 0_level_0,Unnamed: 1_level_0,age
name,designation,Unnamed: 2_level_1
a,VP,20
b,CEO,27
c,CFO,35
d,VP,55
e,VP,18
f,CEO,21
g,MD,35


# If all rows are not required… Don’t load them

In [56]:
# Load Only 3 Rows
df_csv = pd.read_csv('data/csv_example.csv', sep=":", nrows=3)
df_csv

Unnamed: 0,name,age,designation
0,a,20,VP
1,b,27,CEO
2,c,35,CFO


# Skipping Empty Lines in CSV files


By default, ```read_csv(...)``` function skips blank line, i.e it will ignore blank lines while loading the file and constructing the DataFrame.

In [59]:
df_csv = pd.read_csv('data/csv_example.csv', skip_blank_lines=False, sep=":") # mark skipping as False and any blanks in data will remain
df_csv

Unnamed: 0,name,age,designation
0,a,20,VP
1,b,27,CEO
2,c,35,CFO
3,d,55,VP
4,e,18,VP
5,f,21,CEO
6,g,35,MD
