In [67]:
# Reading different data sources with the help of pandas


# Understanding the StringIO concept:
    # The `data` variable defined below is a multi-line string that mimics the contents of a CSV file.
    # Each line represents a row in the CSV, with columns separated by commas.
    # The \n denotes a newline, so the string represents a table with columns col1, col2, col3 and three rows of data.

    # StringIO is a class in Python's io module that wraps a string in an in-memory, file-like object.
    # This is useful because pd.read_csv treats a plain string argument as a file path, so the CSV text must be wrapped in a file-like object before pandas can parse it.


In [65]:
import pandas as pd
from io import StringIO, BytesIO

In [13]:
# CSV-like text: one header line followed by three data rows.
data = "\n".join([
    "col1,col2,col3",
    "x,y,1",
    "a,b,2",
    "c,d,3",
])
# Confirm that `data` is a plain Python string, not a file object.
type(data)

str

In [15]:
# Parse the CSV text into a pandas DataFrame.
# Every row supplies exactly three values (e.g. x, y, 1) for the three header
# columns (col1, col2, col3), so pandas assigns the default integer row
# index (0, 1, ...) itself.
csv_buffer = StringIO(data)
df = pd.read_csv(csv_buffer)
df

Unnamed: 0,col1,col2,col3
0,x,y,1
1,a,b,2
2,c,d,3


In [23]:
# Load only a subset of the columns: col1 and col3 are kept, col2 is skipped.
df = pd.read_csv(
    StringIO(data),
    usecols=["col1", "col3"],
)
df

Unnamed: 0,col1,col3
0,x,1
1,a,2
2,c,3


In [24]:
# Write the current DataFrame to the file "eg2.csv" (path relative to the
# working directory).
# NOTE(review): with the default arguments the row index is written as well,
# as a leading unnamed column — pass index=False if that is not wanted.
df.to_csv(path_or_buf="eg2.csv")

In [39]:
# Numeric sample with four header columns; the last row has only three
# values, so it carries no entry for column d.
data = "\n".join([
    "a,b,c,d",
    "1,2,3,4",
    "5,6,7,8",
    "9,10,11",
])
# print() renders the embedded newlines, showing the text as CSV rows.
print(data)

a,b,c,d
1,2,3,4
5,6,7,8
9,10,11


In [33]:
# Apply a single dtype to every column: all values are parsed as float.
# The last row of `data` has no value for column d, so that cell is missing
# in the result.
df = pd.read_csv(filepath_or_buffer=StringIO(data), dtype=float)
df

Unnamed: 0,a,b,c,d
0,1.0,2.0,3.0,4.0
1,5.0,6.0,7.0,8.0
2,9.0,10.0,11.0,


In [43]:
# Per-column dtypes: a is read as str, b as int and c as float.
# Column d is not listed, so pandas infers its dtype.
column_types = {'a': str, 'b': int, 'c': float}
df = pd.read_csv(StringIO(data), dtype=column_types)
df

Unnamed: 0,a,b,c,d
0,1,2,3.0,4.0
1,5,6,7.0,8.0
2,9,10,11.0,


In [48]:
# Show the dtype of every column in df.
df.dtypes

a     object
b      int64
c    float64
d    float64
dtype: object

In [49]:
# Index columns and trailing delimiters

In [55]:
# Sample CSV whose first column is literally named "index".
data = "\n".join([
    "index,a,b,c",
    "4,apple,bat,5.7",
    "8,orange,cow,10",
])
# No index_col is given, so "index" stays an ordinary data column and the
# rows get the default 0, 1, ... labels.
pd.read_csv(StringIO(data))

Unnamed: 0,index,a,b,c
0,4,apple,bat,5.7
1,8,orange,cow,10.0


In [57]:
# Use a CSV column as the row labels instead of the default 0, 1, ... index.
# index_col=0 selects the first value of each line (index, 4, 8); any other
# column position (1, 2, ...) can be given instead.
df = pd.read_csv(
    StringIO(data),
    index_col=0,
)
df

Unnamed: 0_level_0,a,b,c
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
4,apple,bat,5.7
8,orange,cow,10.0


In [62]:
# Trailing delimiters: every data row ends with an extra ",", so each row has
# one more field than the header. By default pandas would then use the first
# field of each row (4, 8) as the row index and shift the remaining values
# under a, b, c. index_col=False forces pandas NOT to use the first column as
# the index: a, b, c stay aligned with the header and the empty trailing
# field is discarded.
data = ('a,b,c\n'
        '4,apple,bat,\n'
        '8,orange,cow,')
df_no_index = pd.read_csv(StringIO(data), index_col=False)
df_no_index

Unnamed: 0,a,b,c
0,4,apple,bat
1,8,orange,cow


In [79]:
## Quoting and escape characters (very useful in NLP, where text fields
## contain commas and quotes).

# The quoted field holds both the delimiter "," and double quotes that are
# escaped with a backslash.
data = 'a,b\n"hello, \\"Bob\\", nice to see you",5'

# escapechar='\\' tells the parser that a backslash escapes the character
# that follows it.
quoted_buffer = StringIO(data)
pd.read_csv(quoted_buffer, escapechar='\\')



Unnamed: 0,a,b
0,"hello, ""Bob"", nice to see you",5
