# Reading CSV files with parameters

In [5]:
# importing pandas

import pandas as pd
import numpy as np

## Importing data from csv file

Pandas library supports data reading from various type of files including excel, pickle, csv, etc.

In [25]:
df = pd.read_csv("mercedesbenz.csv")

In [26]:
# printing the first five rows using .head() function

df.head()

Unnamed: 0,ID,y,X0,X1,X2,X3,X4,X5,X6,X8,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,0,130.81,k,v,at,a,d,u,j,o,...,0,0,1,0,0,0,0,0,0,0
1,6,88.53,k,t,av,e,d,y,l,o,...,1,0,0,0,0,0,0,0,0,0
2,7,76.26,az,w,n,c,d,x,j,x,...,0,0,0,0,0,0,1,0,0,0
3,9,80.62,az,t,n,f,d,x,l,e,...,0,0,0,0,0,0,0,0,0,0
4,13,78.02,az,v,n,f,d,h,d,n,...,0,0,0,0,0,0,0,0,0,0


#### info() function

Displays the dataset details

In [21]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4209 entries, 0 to 4208
Columns: 378 entries, ID to X385
dtypes: float64(1), int64(369), object(8)
memory usage: 12.1+ MB


#### describe() function

Returns the count, mean, standard deviation, min, percentiles (25%, 50%, 75%) etc. of the given dataset.

**Note**: **It will only take integer and float value while calculating the above values.** Remember, the categorical features will be skipped when this function is used.

### Reading data from csv file where comma replaced with semi-colon

In [23]:
# Creating a test.csv file and storing it in the curent location.

# creating the dataframe
test_df = pd.DataFrame(np.arange(1,21).reshape(4,5), index=['Row1', 'Row2','Row3','Row4'],columns=['Col1','Col2','Col3','Col4','Col5'])

# converting the dataframe into a csv file
test_df.to_csv("test.csv")

Opened the `test.csv` file created, and converted the commas into semi-colons and Saved the file

#### Reading the edited file replaced with semi-colons using `sep` parameter

In [14]:
test_df = pd.read_csv("test.csv", sep=";")

In [15]:
test_df

Unnamed: 0,Col1,Col2,Col3,Col4,Col5
Row1,1,2,3,4,5
Row2,6,7,8,9,10
Row3,11,12,13,14,15
Row4,16,17,18,19,20


#### Using value_counts() on larger dataset

In [28]:
# get the count of occurrences

df['X2'].value_counts()

X2
as    1659
ae     496
ai     415
m      367
ak     265
r      153
n      137
s       94
f       87
e       81
aq      63
ay      54
a       47
t       29
k       25
i       25
b       21
ao      20
ag      19
z       19
d       18
ac      13
g       12
ap      11
y       11
x       10
aw       8
at       6
h        6
al       5
an       5
q        5
av       4
ah       4
p        4
au       3
am       1
j        1
af       1
l        1
aa       1
c        1
o        1
ar       1
Name: count, dtype: int64

In [33]:
# displays only records where the y values is greater than 100

df[df['y']>100]

Unnamed: 0,ID,y,X0,X1,X2,X3,X4,X5,X6,X8,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,0,130.81,k,v,at,a,d,u,j,o,...,0,0,1,0,0,0,0,0,0,0
6,24,128.76,al,r,e,f,d,f,h,s,...,0,0,0,0,0,0,0,0,0,0
8,27,108.67,w,s,as,e,d,f,i,h,...,1,0,0,0,0,0,0,0,0,0
9,30,126.99,j,b,aq,c,d,f,a,e,...,0,0,1,0,0,0,0,0,0,0
10,31,102.09,h,r,r,f,d,f,h,p,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4202,8402,123.34,ap,l,s,c,d,aa,d,r,...,0,0,0,0,0,0,0,0,0,0
4204,8405,107.39,ak,s,as,c,d,aa,d,q,...,1,0,0,0,0,0,0,0,0,0
4205,8406,108.77,j,o,t,d,d,aa,h,h,...,0,1,0,0,0,0,0,0,0,0
4206,8412,109.22,ak,v,r,a,d,aa,g,e,...,0,0,1,0,0,0,0,0,0,0


## StringIO and BytesIO

In [1]:
# importing stringIO and BytesIO

from io import StringIO, BytesIO

In [2]:
# creating a dummy data
data = ('col1, col2, col3\n'
        '1,2,3\n'
        'a,b,c\n'
        'c,d,3\n')


In [3]:
type(data)

str

In [7]:
# Using StringIO to convert data to csv file

pd.read_csv(StringIO(data))

Unnamed: 0,col1,col2,col3
0,1,2,3
1,a,b,c
2,c,d,3


#### Specifically load data from desired columns

In [11]:
# We can also specifically load data from desired columns

pd.read_csv(StringIO(data),usecols=['col1'])

Unnamed: 0,col1
0,1
1,a
2,c


#### Specifying the datatype while loading

In [13]:
# Here its a object

pd.read_csv(StringIO(data),dtype=object)

Unnamed: 0,col1,col2,col3
0,1,2,3
1,a,b,c
2,c,d,3


#### Specify the datatype for each column while loading

In [21]:
df = pd.read_csv(StringIO(data), dtype={'col1' : object, 'col2':object,'col3': object})

In [22]:
df.dtypes

col1     object
 col2    object
 col3    object
dtype: object

#### Specify the index columns

By default, in the pandas dataframe the index column will be considered as 0, 1, 2, ... 

To avoid this and get it replaced by our own values, we need to specify the col name in the data.

In [32]:
data = ('index, col1,col2,col3\n'
        'row1,apple,ball,cat\n'
        'row2, donkey,carrot,onion\n')

In [33]:
# reading data 

pd.read_csv(StringIO(data))

Unnamed: 0,index,col1,col2,col3
0,row1,apple,ball,cat
1,row2,donkey,carrot,onion


Making index column as the "index" col instead of 0 and 1


In [35]:
# passing the required index value which needs to be considered as column

pd.read_csv(StringIO(data), index_col=0)

Unnamed: 0_level_0,col1,col2,col3
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
row1,apple,ball,cat
row2,donkey,carrot,onion


#### Using `index_col = False` parameter

In [39]:
data = ('col1,col2,col3\n'
        'row1,apple,ball,\n'
        'row2, donkey,carrot,\n')

In [40]:
pd.read_csv(StringIO(data))

Unnamed: 0,col1,col2,col3
row1,apple,ball,
row2,donkey,carrot,


To avoid providing `NaN` as output, we can provide the `index_col = False` as parameter

In [41]:
pd.read_csv(StringIO(data),index_col=False)

Unnamed: 0,col1,col2,col3
0,row1,apple,ball
1,row2,donkey,carrot


### Quoting and Escape Characters 

**Mostly useful in NLP**

In [42]:
data = 'a,b\n"hello, \\"Bob=b\\",nice to meet you",5'

In [43]:
pd.read_csv(StringIO(data), escapechar="\\")

Unnamed: 0,a,b
0,"hello, ""Bob=b"",nice to meet you",5
