## Pandas read_csv() [importing file]

In [20]:
import pandas as pd

### 1. CSV data from a local file:

In [21]:
# Load the CSV that sits next to the notebook, relying on read_csv's default options.
df1 = pd.read_csv('StudentsPerformance.csv')

In [22]:
# Spot-check two randomly sampled rows; no random_state is set, so the rows vary per run.
df1.sample(2)

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
724,male,group B,some college,standard,none,47,43,41
562,male,group C,bachelor's degree,standard,completed,96,90,92


### 2. `sep` (Separator) parameter:

In [23]:
# The file is ';'-separated, but no `sep` is passed here, so the default
# separator is used and the fields are not split (see the output below).
df2 = pd.read_csv('different_sep_data.csv')

In [26]:
# Every field of a row is combined into a single column.
df2.head(2)

Unnamed: 0,"gender;""race/ethnicity"";""parental level of education"";""lunch"";""test preparation course"";""math score"";""reading score"";""writing score"""
0,"female;""group B"";""bachelor's degree"";""standard..."
1,"female;""group C"";""some college"";""standard"";""co..."


In [27]:
# A single column confirms that the rows were not split on ';'.
df2.shape

(1000, 1)

In [28]:
# Fix: tell pandas that the delimiter is ';' so each field lands in its own column.

df2 = pd.read_csv(
    'different_sep_data.csv',
    sep=';',
)

In [29]:
# The fields are now split into separate, properly named columns.
df2.head(2)

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88


In [30]:
# Expect 8 columns now instead of the single combined one.
df2.shape

(1000, 8)

### 3. `index_col` parameter:

In [32]:
# Default read: the file's leading unnamed column shows up as a regular
# 'Unnamed: 0' data column next to the automatically generated index.

df3 = pd.read_csv('data_with_index.csv')
df3.head(2)

Unnamed: 0.1,Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,0,female,group B,bachelor's degree,standard,none,72,72,74
1,1,female,group C,some college,standard,completed,69,90,88


In [33]:
# Use the 'Unnamed: 0' column as the row index instead of keeping it as data.
df3 = pd.read_csv(
    'data_with_index.csv',
    index_col='Unnamed: 0',
)
df3.head(2)

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88


### 4. `header` parameter:

In [35]:
# Default read of a file whose column names are not in the topmost row:
# the columns get 'Unnamed: N' labels and the real names end up in data row 0.

df4 = pd.read_csv('header_col.csv')
df4.head(2)

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7
0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
1,female,group B,bachelor's degree,standard,none,72,72,74


In [36]:
# Point `header` at the row that actually holds the column names
# (row 1, counting from 0).

df4 = pd.read_csv(
    'header_col.csv',
    header=1,
)
df4.head(2)

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88


### 5. `usecols` parameter:

In [37]:
# `usecols` restricts the read to the listed columns; all others are skipped.

df5 = pd.read_csv(
    'StudentsPerformance.csv',
    usecols=[
        "race/ethnicity",
        "parental level of education",
        "math score",
    ],
)
df5.head(2)

Unnamed: 0,race/ethnicity,parental level of education,math score
0,group B,bachelor's degree,72
1,group C,some college,69


### 6. `nrows` parameter:

In [38]:
# `nrows` specifies how many data rows to read from the file (here the first 100).

df6 = pd.read_csv(
    'StudentsPerformance.csv',
    nrows=100,
)
df6.shape

(100, 8)

### 7. `encoding` parameter:

In [None]:
# Reading this file without an explicit encoding is expected to fail.
# Catch the decoding error and display it, so the demonstration stays visible
# while "Restart Kernel -> Run All" can still continue to the fix in the next cell.
try:
    df7 = pd.read_csv('Different_encoding_data.csv')
except UnicodeDecodeError as err:
    print(f'{type(err).__name__}: {err}')

In [41]:
# Declaring the file's encoding lets pandas decode it successfully.
df7 = pd.read_csv(
    'Different_encoding_data.csv',
    encoding='Latin-1',
)
df7.head(3)

Unnamed: 0,Restaurant ID,Restaurant Name,Country Code,City,Address,Locality,Locality Verbose,Longitude,Latitude,Cuisines,...,Currency,Has Table booking,Has Online delivery,Is delivering now,Switch to order menu,Price range,Aggregate rating,Rating color,Rating text,Votes
0,6317637,Le Petit Souffle,162,Makati City,"Third Floor, Century City Mall, Kalayaan Avenu...","Century City Mall, Poblacion, Makati City","Century City Mall, Poblacion, Makati City, Mak...",121.027535,14.565443,"French, Japanese, Desserts",...,Botswana Pula(P),Yes,No,No,No,3,4.8,Dark Green,Excellent,314
1,6304287,Izakaya Kikufuji,162,Makati City,"Little Tokyo, 2277 Chino Roces Avenue, Legaspi...","Little Tokyo, Legaspi Village, Makati City","Little Tokyo, Legaspi Village, Makati City, Ma...",121.014101,14.553708,Japanese,...,Botswana Pula(P),Yes,No,No,No,3,4.5,Dark Green,Excellent,591
2,6300002,Heat - Edsa Shangri-La,162,Mandaluyong City,"Edsa Shangri-La, 1 Garden Way, Ortigas, Mandal...","Edsa Shangri-La, Ortigas, Mandaluyong City","Edsa Shangri-La, Ortigas, Mandaluyong City, Ma...",121.056831,14.581404,"Seafood, Asian, Filipino, Indian",...,Botswana Pula(P),Yes,No,No,No,4,4.4,Green,Very Good,270


### 8. `parse_dates` parameter:

In [44]:
# `parse_dates` converts the listed columns to datetime values while reading.
df8 = pd.read_csv(
    'date_column_ipl_data.csv',
    parse_dates=['date'],
)

In [45]:
# Confirm that the 'date' column was parsed to a datetime dtype.
df8['date'].info()

<class 'pandas.core.series.Series'>
RangeIndex: 499 entries, 0 to 498
Series name: date
Non-Null Count  Dtype         
--------------  -----         
499 non-null    datetime64[ns]
dtypes: datetime64[ns](1)
memory usage: 4.0 KB


### 9. `na_values` parameter:

In [52]:
# Treat the string 'none' as a missing value (NaN) wherever it appears, in any column.

df9 = pd.read_csv(
    'StudentsPerformance.csv',
    na_values=['none'],
)
df9.sample(3)

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
783,female,group C,associate's degree,standard,completed,52,59,62
857,female,group C,bachelor's degree,standard,,65,79,81
298,male,group C,high school,free/reduced,completed,40,46,50


### 10. `converters` parameter:

In [54]:
# Converter function used to shorten the values of a column while reading:

def change_name(x):
    """Return the one-letter code for 'group A', 'group B' or 'group C'.

    Any other value is handed back unchanged.
    """
    short_labels = (
        ('group A', 'A'),
        ('group B', 'B'),
        ('group C', 'C'),
    )
    for label, code in short_labels:
        if x == label:
            return code
    return x

In [57]:
# Quick sanity check of the converter: 'group A' should come back as 'A'.
change_name('group A')

'A'

In [59]:
# `converters` maps a column name to a function that is applied to that column's values.
df10 = pd.read_csv(
    'StudentsPerformance.csv',
    converters={'race/ethnicity': change_name},
)

In [60]:
# The race/ethnicity column now shows the shortened codes (e.g. 'B', 'C').
df10.head(2)

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,B,bachelor's degree,standard,none,72,72,74
1,female,C,some college,standard,completed,69,90,88


### 11. Loading huge dataset:

In [61]:
# Read the file in chunks of 2000 rows instead of loading it all at once.
# The file name uses the same capitalisation as in the `encoding` section, so
# the read also works on case-sensitive filesystems.
df11 = pd.read_csv(
    'Different_encoding_data.csv',
    encoding='Latin-1',
    chunksize=2000,
)

In [62]:
# With `chunksize` set, read_csv returns a TextFileReader object rather than a DataFrame.
df11

<pandas.io.parsers.readers.TextFileReader at 0x17751be4560>

In [63]:
# Each iteration yields the next chunk of at most 2000 rows; print its shape.
for chunk in df11:
    print(chunk.shape)

(2000, 21)
(2000, 21)
(2000, 21)
(2000, 21)
(1551, 21)
