# Use read_csv and skiprows with condition based on values in Pandas

https://datascientyst.com/read_csv-skip-rows-condition-values-pandas/

In [1]:
import pandas as pd
# Location of the sample dataset, relative to the notebook's directory.
# Later cells reuse this name, so it stays a module-level variable.
csv_file = '../data/StudentsPerformance.csv'

# Baseline: load every row without any filtering, then report (rows, columns)
# so the filtered results in the following steps can be compared against it.
df = pd.read_csv(filepath_or_buffer=csv_file)
print(df.shape)
df

(1000, 8)


Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75
...,...,...,...,...,...,...,...,...
995,female,group E,master's degree,standard,completed,88,99,95
996,male,group C,high school,free/reduced,none,62,55,55
997,female,group C,high school,free/reduced,completed,59,71,65
998,female,group D,some college,standard,completed,68,78,77


## Step 1: Read CSV file and skip rows with a query condition

In [2]:
# Stream the file chunk by chunk and keep only the rows whose lunch is
# 'standard'; rows failing the condition are dropped before concatenation.
# chunksize is the maximum number of rows per chunk: the 1000-row sample file
# fits into a single chunk, so lower the value for files larger than memory.
# The context manager (pandas >= 1.2) closes the underlying file handle even
# when query() or concat() raises part-way through the iteration.
with pd.read_csv(csv_file, chunksize=10000000) as reader:
    df = pd.concat(
        (chunk.query("lunch == 'standard'") for chunk in reader),
        ignore_index=True,  # renumber rows 0..n-1 across all chunks
    )
df

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group C,some college,standard,none,76,78,75
4,female,group B,associate's degree,standard,none,71,83,78
...,...,...,...,...,...,...,...,...
640,male,group E,some high school,standard,completed,81,75,76
641,female,group B,some high school,standard,completed,65,82,78
642,male,group A,high school,standard,none,63,63,62
643,female,group E,master's degree,standard,completed,88,99,95


## Step 2: Read CSV file with a condition on values at or above a threshold

In [3]:
# Pin the dtype of the compared column up front instead of relying on
# per-chunk type inference.
schema = {
    "math score": int,
}

# Stream the file chunk by chunk and keep only rows with a math score of at
# least 75. Backticks let query() reference a column name containing a space.
# The context manager (pandas >= 1.2) closes the underlying file handle even
# when query() or concat() raises part-way through the iteration.
with pd.read_csv(csv_file, dtype=schema, chunksize=10000000) as reader:
    df = pd.concat(
        (chunk.query("`math score` >= 75") for chunk in reader),
        ignore_index=True,  # renumber rows 0..n-1 across all chunks
    )
df

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,master's degree,standard,none,90,95,93
1,male,group C,some college,standard,none,76,78,75
2,female,group B,some college,standard,completed,88,95,92
3,male,group A,some college,standard,completed,78,72,70
4,male,group C,high school,standard,none,88,89,86
...,...,...,...,...,...,...,...,...
290,female,group A,some college,standard,completed,78,87,91
291,male,group E,some high school,standard,completed,81,75,76
292,male,group E,high school,free/reduced,completed,86,81,75
293,female,group E,master's degree,standard,completed,88,99,95


## Step 3: Read CSV file and filter rows afterwards (post-filter condition)

In [4]:
# Load the complete file first; the filtering happens afterwards in memory.
df = pd.read_csv(csv_file)

# Keep every row whose lunch is anything other than 'standard'.
# The surviving rows keep their original index labels (no renumbering).
df = df.loc[df['lunch'].ne('standard')]
df

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
3,male,group A,associate's degree,free/reduced,none,47,57,44
7,male,group B,some college,free/reduced,none,40,43,39
8,male,group D,high school,free/reduced,completed,64,64,67
9,female,group B,high school,free/reduced,none,38,60,50
17,female,group B,some high school,free/reduced,none,18,32,28
...,...,...,...,...,...,...,...,...
992,female,group D,associate's degree,free/reduced,none,55,76,76
993,female,group D,bachelor's degree,free/reduced,none,62,72,74
996,male,group C,high school,free/reduced,none,62,55,55
997,female,group C,high school,free/reduced,completed,59,71,65


## Step 4: Read CSV with conditional filtering using Dask

In [5]:
import dask.dataframe as dd

# Open the CSV as a Dask dataframe; the result is only turned into a pandas
# dataframe by the compute() call below.
df = dd.read_csv(csv_file)

# Keep rows whose reading score lies in the inclusive range [55, 75], then
# materialise the filtered rows. Original index labels are preserved.
df = df[df['reading score'].between(55, 75)].compute()
df

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
3,male,group A,associate's degree,free/reduced,none,47,57,44
8,male,group D,high school,free/reduced,completed,64,64,67
9,female,group B,high school,free/reduced,none,38,60,50
13,male,group A,some college,standard,completed,78,72,70
...,...,...,...,...,...,...,...,...
987,male,group E,some high school,standard,completed,81,75,76
993,female,group D,bachelor's degree,free/reduced,none,62,72,74
994,male,group A,high school,standard,none,63,63,62
996,male,group C,high school,free/reduced,none,62,55,55
