In [1]:
import pandas as pd
import polars as pl

In [4]:
# Pandas
panda_df = pd.read_csv('eda_ds.csv')
panda_df.head()

Unnamed: 0,tu_fico_9,tradeline_count,inquiry_count,collection_count,revolving_utilization,delinquent_secured,delinquent_unsecured,bankruptcy_count,stated_indiv_income,stated_additional_income,tu_income_estimate,tu_monthly_payments,tu_mortgage_payments,riskview,principal,cpb,days_delinquent,payment_history
0,498,1,0,1,0,0,0,0,24000,0,35,46,0,,,,,open;2019-05-10;2021-04-03;1543;0;0;1543;indiv...
1,612,7,5,1,63,0,0,0,30000,0,29,126,0,706.0,,,,installment;2019-10-05;2019-11-03;0;352;0;0;in...
2,617,21,8,0,32,0,0,0,60000,0,39,1186,1075,779.0,,,,installment;2013-06-11;2016-10-11;0;5000;0;0;i...
3,647,10,1,0,64,0,0,0,400000,0,68,649,0,724.0,,,,installment;2009-09-18;2013-06-24;0;35599;0;0;...
4,647,16,7,4,56,0,0,0,65000,0,55,1074,0,660.0,,,,installment;2007-10-19;2011-09-26;0;14162;0;0;...


In [5]:
# Polars
polar_df = pl.read_csv('eda_ds.csv')
polar_df.head()

tu_fico_9,tradeline_count,inquiry_count,collection_count,revolving_utilization,delinquent_secured,delinquent_unsecured,bankruptcy_count,stated_indiv_income,stated_additional_income,tu_income_estimate,tu_monthly_payments,tu_mortgage_payments,riskview,principal,cpb,days_delinquent,payment_history
i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,f64,i64,str
498,1,0,1,0,0,0,0,24000,0,35,46,0,,,,,"""open;2019-05-1…"
612,7,5,1,63,0,0,0,30000,0,29,126,0,706.0,,,,"""installment;20…"
617,21,8,0,32,0,0,0,60000,0,39,1186,1075,779.0,,,,"""installment;20…"
647,10,1,0,64,0,0,0,400000,0,68,649,0,724.0,,,,"""installment;20…"
647,16,7,4,56,0,0,0,65000,0,55,1074,0,660.0,,,,"""installment;20…"


## Selecting and Filtering Data
The first major difference between Pandas and Polars is that Polars does not use an index [1]. Instead, each row is indexed by its integer position in the DataFrame [1].

In [6]:
# Pandas
panda_df[['tu_fico_9', 'stated_indiv_income']] 

Unnamed: 0,tu_fico_9,stated_indiv_income
0,498,24000
1,612,30000
2,617,60000
3,647,400000
4,647,65000
...,...,...
94,776,30000
95,723,35000
96,575,25000
97,711,32000


In [7]:
# The above code will run with Polars as well, 
# but the correct way in Polars is:
polar_df.select(pl.col(['tu_fico_9', 'stated_indiv_income'])) 

tu_fico_9,stated_indiv_income
i64,i64
498,24000
612,30000
617,60000
647,400000
647,65000
796,70000
806,27000
471,70000
762,81000
709,107000


While you would use the .query() method in Pandas to filter data, you need to use the .filter() method in Polars.

In [8]:
# Pandas
panda_df.query('stated_indiv_income > 20000')

Unnamed: 0,tu_fico_9,tradeline_count,inquiry_count,collection_count,revolving_utilization,delinquent_secured,delinquent_unsecured,bankruptcy_count,stated_indiv_income,stated_additional_income,tu_income_estimate,tu_monthly_payments,tu_mortgage_payments,riskview,principal,cpb,days_delinquent,payment_history
0,498,1,0,1,0,0,0,0,24000,0,35,46,0,,,,,open;2019-05-10;2021-04-03;1543;0;0;1543;indiv...
1,612,7,5,1,63,0,0,0,30000,0,29,126,0,706.0,,,,installment;2019-10-05;2019-11-03;0;352;0;0;in...
2,617,21,8,0,32,0,0,0,60000,0,39,1186,1075,779.0,,,,installment;2013-06-11;2016-10-11;0;5000;0;0;i...
3,647,10,1,0,64,0,0,0,400000,0,68,649,0,724.0,,,,installment;2009-09-18;2013-06-24;0;35599;0;0;...
4,647,16,7,4,56,0,0,0,65000,0,55,1074,0,660.0,,,,installment;2007-10-19;2011-09-26;0;14162;0;0;...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94,776,15,0,0,1,0,0,0,30000,0,67,175,0,702.0,2517.0,,,revolving;2000-01-27;2021-04-25;78;5674;11500;...
95,723,18,1,0,10,0,0,0,35000,0,56,193,0,722.0,,,,installment;2004-08-20;2021-03-31;3612;12462;0...
96,575,1,16,2,0,0,0,0,25000,0,34,292,0,,,,,installment;2020-03-04;2021-03-31;9115;9716;0;...
97,711,5,4,0,71,0,0,0,32000,80000,42,173,0,755.0,19795.0,19423.22,0.0,installment;2017-02-27;2019-09-04;0;18506;0;0;...


In [9]:
# Polars
polar_df.filter(pl.col('stated_indiv_income') > 20000)

tu_fico_9,tradeline_count,inquiry_count,collection_count,revolving_utilization,delinquent_secured,delinquent_unsecured,bankruptcy_count,stated_indiv_income,stated_additional_income,tu_income_estimate,tu_monthly_payments,tu_mortgage_payments,riskview,principal,cpb,days_delinquent,payment_history
i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,f64,i64,str
498,1,0,1,0,0,0,0,24000,0,35,46,0,,,,,"""open;2019-05-1…"
612,7,5,1,63,0,0,0,30000,0,29,126,0,706,,,,"""installment;20…"
617,21,8,0,32,0,0,0,60000,0,39,1186,1075,779,,,,"""installment;20…"
647,10,1,0,64,0,0,0,400000,0,68,649,0,724,,,,"""installment;20…"
647,16,7,4,56,0,0,0,65000,0,55,1074,0,660,,,,"""installment;20…"
796,10,1,0,13,0,0,0,70000,0,57,101,0,703,,,,"""installment;20…"
806,9,0,0,0,0,0,0,27000,0,63,13,0,682,,,,"""installment;20…"
471,3,0,0,98,0,0,0,70000,0,40,52,0,,,,,"""installment;20…"
762,15,12,0,3,0,0,0,81000,0,61,1645,0,656,22500,,,"""installment;20…"
709,20,2,0,41,0,0,0,107000,0,95,1849,0,771,,,,"""installment;20…"


In speed comparison we get about same time for Pandas and Polars.But, in contrast to Pandas, Polars can run operations in .select() and .filter() in parallel.

## Creating New Columns
In Polars, you need to use the .with_column() or the .with_columns() method depending on how many columns you want to create.

In [11]:
# Pandas
panda_df["updated_income"] = panda_df["stated_indiv_income"] * 1.20
panda_df.head()

Unnamed: 0,tu_fico_9,tradeline_count,inquiry_count,collection_count,revolving_utilization,delinquent_secured,delinquent_unsecured,bankruptcy_count,stated_indiv_income,stated_additional_income,tu_income_estimate,tu_monthly_payments,tu_mortgage_payments,riskview,principal,cpb,days_delinquent,payment_history,updated_income
0,498,1,0,1,0,0,0,0,24000,0,35,46,0,,,,,open;2019-05-10;2021-04-03;1543;0;0;1543;indiv...,28800.0
1,612,7,5,1,63,0,0,0,30000,0,29,126,0,706.0,,,,installment;2019-10-05;2019-11-03;0;352;0;0;in...,36000.0
2,617,21,8,0,32,0,0,0,60000,0,39,1186,1075,779.0,,,,installment;2013-06-11;2016-10-11;0;5000;0;0;i...,72000.0
3,647,10,1,0,64,0,0,0,400000,0,68,649,0,724.0,,,,installment;2009-09-18;2013-06-24;0;35599;0;0;...,480000.0
4,647,16,7,4,56,0,0,0,65000,0,55,1074,0,660.0,,,,installment;2007-10-19;2011-09-26;0;14162;0;0;...,78000.0


In [13]:
# Polars
polar_df = polar_df.with_columns([(pl.col("stated_indiv_income") * 1.20).alias("updated_income")])
polar_df.head()

# Polars for multiple columns
# df.with_columns([(pl.col("col") * 10).alias("new_col"), ...])

tu_fico_9,tradeline_count,inquiry_count,collection_count,revolving_utilization,delinquent_secured,delinquent_unsecured,bankruptcy_count,stated_indiv_income,stated_additional_income,tu_income_estimate,tu_monthly_payments,tu_mortgage_payments,riskview,principal,cpb,days_delinquent,payment_history,updated_income
i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,f64,i64,str,f64
498,1,0,1,0,0,0,0,24000,0,35,46,0,,,,,"""open;2019-05-1…",28800.0
612,7,5,1,63,0,0,0,30000,0,29,126,0,706.0,,,,"""installment;20…",36000.0
617,21,8,0,32,0,0,0,60000,0,39,1186,1075,779.0,,,,"""installment;20…",72000.0
647,10,1,0,64,0,0,0,400000,0,68,649,0,724.0,,,,"""installment;20…",480000.0
647,16,7,4,56,0,0,0,65000,0,55,1074,0,660.0,,,,"""installment;20…",78000.0


## Grouping and Aggregation
Grouping and aggregation are slightly different between Pandas and Polars syntax-wise, but both use the .groupby() and .agg() methods.

In [14]:
# Pandas
df = panda_df.groupby('tradeline_count')['stated_indiv_income'].agg('mean')
df.head()

tradeline_count
1    27346.000000
2    32160.000000
3    83666.666667
4    35000.000000
5    45000.000000
Name: stated_indiv_income, dtype: float64

In [16]:
# Polars
# df.groupby('col1').agg([pl.col('col2').mean()]) # As suggested in Polars docs
df = polar_df.groupby('tradeline_count').agg([pl.mean('stated_indiv_income')]) # Shorter
df.head()

tradeline_count,stated_indiv_income
i64,f64
22,98400.0
56,145000.0
13,64333.333333
9,46422.857143
30,31275.0


## Missing Data
Another major difference between Pandas and Polars is that Pandas uses NaN values to indicate missing values, while Polars uses null [1].
Thus, instead of the .fillna() method in Pandas, you should use the .fill_null() method in Polars.

In [18]:
# Pandas
df = panda_df['revolving_utilization'].fillna(-999)
df.head()

0     0
1    63
2    32
3    64
4    56
Name: revolving_utilization, dtype: int64

In [21]:
# Polars
# df_pd.with_column(pl.col('col2').fill_null(pl.lit(-999))) # As suggested in Polars docs
df = polar_df.with_columns(pl.col('revolving_utilization').fill_null(-999)) # Shorter
df.head()

tu_fico_9,tradeline_count,inquiry_count,collection_count,revolving_utilization,delinquent_secured,delinquent_unsecured,bankruptcy_count,stated_indiv_income,stated_additional_income,tu_income_estimate,tu_monthly_payments,tu_mortgage_payments,riskview,principal,cpb,days_delinquent,payment_history,updated_income
i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,f64,i64,str,f64
498,1,0,1,0,0,0,0,24000,0,35,46,0,,,,,"""open;2019-05-1…",28800.0
612,7,5,1,63,0,0,0,30000,0,29,126,0,706.0,,,,"""installment;20…",36000.0
617,21,8,0,32,0,0,0,60000,0,39,1186,1075,779.0,,,,"""installment;20…",72000.0
647,10,1,0,64,0,0,0,400000,0,68,649,0,724.0,,,,"""installment;20…",480000.0
647,16,7,4,56,0,0,0,65000,0,55,1074,0,660.0,,,,"""installment;20…",78000.0
