# [Polars vs Pandas Processing Time and RAM comparison](https://www.kaggle.com/najeebz/polaris-vs-pandas-performance-comparison)
## Dataset: [Child Mind Institute - Detect Sleep States](https://www.kaggle.com/competitions/child-mind-institute-detect-sleep-states/overview)
_______________________________________________________________________ 
# Author Details:
- Name: Najeeb Haider Zaidi
- Email: zaidi.nh@gmail.com
- Profiles: [Github](https://github.com/snajeebz)  [LinkedIn](https://www.linkedin.com/in/najeebz) [Kaggle](https://www.kaggle.com/najeebz)
- Copyrights 2023-2024 (c) are reserved only by the author: Najeeb Haider Zaidi
- Opensource; anyone can copy use and reuse the code and results.
________________________________________________________________________
# Attributions:
The Dataset has been provided by Child Mind Institute. in [Kaggle Competition](https://www.kaggle.com/competitions/child-mind-institute-detect-sleep-states/overview).
________________________________________________________________________
# Ways to Access/Execute:
- [Execute it in Kaggle](https://www.kaggle.com/najeebz/polars-vs-pandas-performance-comparison)
- [Github Repository for this and other Experimental Evaluations](https://github.com/snajeebz/Experiments-Practice)
_________________________________________________________________________
# Objective:

> We need to compare Pandas and Polars, we will run different operations through Pandas and Polars and generate a comparison report of the both at the end.

# Installing and Importing and Initializing Libraries.

In [1]:
import numpy as np # linear algebra
import pandas as pd# data processing, CSV file I/O (e.g. pd.read_csv)
import polars as pl
from datetime import datetime as dt
from tqdm import tqdm
import tracemalloc
tqdm.pandas()
pd.set_option('display.max_row', 500)
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import warnings
warnings.filterwarnings("ignore")
pd.set_option('display.max_colwidth', None)
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/child-mind-institute-detect-sleep-states/train_series.parquet
/kaggle/input/child-mind-institute-detect-sleep-states/sample_submission.csv
/kaggle/input/child-mind-institute-detect-sleep-states/train_events.csv
/kaggle/input/child-mind-institute-detect-sleep-states/test_series.parquet


# Creating a dataframe to store performance statistics

In [2]:
dict = {'function':['Reading','Windowing', 'Rolling', 'Merging', 'Clustering','Scaling'],
        'pandas_mem':[0.0,0,0,0,0,0.0],
        'pandas_duration':[0.0,0,0,0,0,0.0],
        'polars_mem': [0.0,0,0,0,0,0.0],
        'polars_duration':[0.0,0,0,0,0,0.0]
       }

stats = pd.DataFrame(dict)
stats


Unnamed: 0,function,pandas_mem,pandas_duration,polars_mem,polars_duration
0,Reading,0.0,0.0,0.0,0.0
1,Windowing,0.0,0.0,0.0,0.0
2,Rolling,0.0,0.0,0.0,0.0
3,Merging,0.0,0.0,0.0,0.0
4,Clustering,0.0,0.0,0.0,0.0
5,Scaling,0.0,0.0,0.0,0.0


# Functions using Pandas

# Windowing the training data
### As the target is to detect an event, 
### So, extracting timesteps before and after the events. 

In [3]:
def window(df, win_size):
    tracemalloc.start()
    global stats
    ind=df.index[df['event'].isna()==False]
    c=0
    df['window']=np.nan
    for i in tqdm(ind):
        a=i-win_size
        b=i+win_size
        df['window'].loc[a:i]=int(c)
        c=c+1
        df['window'].loc[i:b]=int(c)
        c=c+1
    df['window'].dropna(inplace=True)
    print( 'Memory Used: ',tracemalloc.get_traced_memory()[1]/1024/1024/1024)
    stats['pandas_mem'][stats['function']=='Windowing']=tracemalloc.get_traced_memory()[1]/1024/1024/1024
    tracemalloc.stop()
    return df[df['window'].isna()==False]

# Removing the steps where change in anglez is insignificant and enmo is zero (device is supposed not to be worn)

In [4]:
def inactive_periods(df):
    print("shape before application: ",df.shape)
    df['diff_anglez']=df['anglez'].diff()
    df=df[(df['enmo']!=0.0) | (df['diff_anglez']!=0.0)]
    print("shape after application: ",df.shape)
    df.drop('diff_anglez', inplace=True, axis=1)
    print("shape after completion: ",df.shape)
    print("removed ")
    return df


# Clustering the Enmo and Anglez

In [5]:
def clustering(df):
    tracemalloc.start()
    global stats
    from sklearn.cluster import KMeans
    from sklearn.preprocessing import StandardScaler    
    X=df[['anglez','enmo']]
#Scalling the data
    scaler=StandardScaler()
    X_scaled=scaler.fit_transform(X)
#perform clustering
    model=KMeans(n_clusters=4,algorithm="elkan" )
    model.fit(X_scaled)
    print( 'Memory Used GB: ',tracemalloc.get_traced_memory()[1]/1024/1024/1024)
    stats['pandas_mem'][stats['function']=='Clustering']=tracemalloc.get_traced_memory()[1]/1024/1024/1024
    tracemalloc.stop()
    return model.labels_


# Adding Rolling Standard Deviations

- As per observation enmo and anglez varies frequently and more than normal for few timesteps before and after the event.
- In order to consider the variations into the modelling, using the method.

In [6]:
 def rollingstd(series_df):
    tracemalloc.start()
    global stats
  # Creating columns with nans
    series_df['sd_enmo_1']=np.nan    # 1 min rolling std: enmo
    series_df['sd_anglez_1']=np.nan  # 1 min rolling std: anglez
    series_df['m_enmo_2']=np.nan     # 2 min rolling mean: enmo
    series_df['m_anglez_2']=np.nan   # 2 min rolling std: anglez 
    print('anglez rolling std 12')
    series_df['sd_anglez_1'] = (series_df.groupby('series_id')['anglez']
                      .rolling(12)
                      .std()
                      .reset_index(level=0, drop=True))
    print('anglez rolling std 2')
    series_df['sd_anglez_1'][series_df['sd_anglez_1'].isna()==True] = (series_df.groupby('series_id')['anglez']
                      .rolling(2)
                      .std()
                      .reset_index(level=0, drop=True))
    print('enmo rolling std 12')
    series_df['sd_enmo_1'] = (series_df.groupby('series_id')['enmo']
                      .rolling(12)
                      .std()
                      .reset_index(level=0, drop=True))
    print('enmo rolling mean 24')
    series_df['m_enmo_2'] = (series_df.groupby('series_id')['enmo']
                      .rolling(24)
                      .mean()
                      .reset_index(level=0, drop=True))
    print('anglez rolling mean 24')
    series_df['m_anglez_2'] = (series_df.groupby('series_id')['anglez']
                      .rolling(24)
                      .mean()
                      .reset_index(level=0, drop=True))
    print('enmo rolling std 2')
    print('Nans in sd_emno_1: ',series_df['sd_enmo_1'].isnull().sum())
    series_df['sd_enmo_1'][series_df['sd_enmo_1'].isna()==True] = (series_df.groupby('series_id')['enmo']
                      .rolling(2)
                      .std()
                      .reset_index(level=0, drop=True))
    print('enmo rolling mean 2')
    series_df['m_enmo_2'][series_df['m_enmo_2'].isna()==True] = (series_df.groupby('series_id')['enmo']
                      .rolling(2)
                      .mean()
                      .reset_index(level=0, drop=True))
    print('anglez rolling mean 2')
    series_df['m_anglez_2'][series_df['m_anglez_2'].isna()==True] = (series_df.groupby('series_id')['anglez']
                      .rolling(2)
                      .mean()
                      .reset_index(level=0, drop=True))
#Series wise rolling std and mean
# filling rest of nans
    print('Nans in sd_emno_1: ',series_df['sd_enmo_1'].isnull().sum())
    series_df['sd_enmo_1'].fillna(0.0, inplace=True)
    series_df['sd_anglez_1'].fillna(0.0, inplace=True)
    series_df['m_enmo_2'].fillna(0.0, inplace=True)
    series_df['m_anglez_2'].fillna(0.0, inplace=True)
    print('Nans after removal: ',series_df['sd_enmo_1'].isnull().sum())
    print( 'Memory Used GB: ',tracemalloc.get_traced_memory()[1]/1024/1024/1024)
    stats['pandas_mem'][stats['function']=='Rolling']=tracemalloc.get_traced_memory()[1]/1024/1024/1024
    tracemalloc.stop()
    return(series_df)


# Scaling the data

In [7]:
def scale(X):
    tracemalloc.start()
    global stats
    from sklearn import preprocessing
    scaler=preprocessing.StandardScaler().fit(X)
    print( 'Memory Used GB: ',tracemalloc.get_traced_memory()[1]/1024/1024/1024)
    stats['pandas_mem'][stats['function']=='Scaling']=tracemalloc.get_traced_memory()[1]/1024/1024/1024
    tracemalloc.stop()
    return (scaler.transform(X))


# Testing Pandas

In [8]:
# Importing the datasets
print('Importing Training Datasets')
tnow=dt.now()
tracemalloc.start()
df_series=pd.read_parquet(path="/kaggle/input/child-mind-institute-detect-sleep-states/train_series.parquet", engine='auto')
df_events=pd.read_csv("/kaggle/input/child-mind-institute-detect-sleep-states/train_events.csv")
print('Datasets Imported...')
print('Time Taken: ',dt.now()-tnow)
stats['pandas_duration'][stats['function']=='Reading']=dt.now()-tnow
stats['pandas_mem'][stats['function']=='Reading']=tracemalloc.get_traced_memory()[1]/1024/1024/1024
print( 'Memory Used GB: ',tracemalloc.get_traced_memory()[1]/1024/1024/1024)

tracemalloc.stop()
print('______________________________________')

# Merging the datasets
print('Merging the training datasets...')
tnow=dt.now()
tracemalloc.start()
events=df_events[['series_id', 'step','event']]
series_df=pd.merge(df_series,events,on=["step","series_id"],how='left')
df_series=[]
df_event=[]
series_df['sleep']=np.nan
series_df.loc[series_df["event"]=="onset", "sleep"] = 1
series_df.loc[series_df["event"]=="wakeup", "sleep"] = 0
series_df['sleep'].fillna(method='ffill', inplace=True)
series_df['sleep'].fillna(value=0, inplace=True)
print('Datasets Merged...')
print('Time Taken: ',dt.now()-tnow)
print( 'Memory Used: ',tracemalloc.get_traced_memory()[1]/1024/1024/1024)
stats['pandas_duration'][stats['function']=='Merging']=dt.now()-tnow
stats['pandas_mem'][stats['function']=='Merging']=tracemalloc.get_traced_memory()[1]/1024/1024/1024
tracemalloc.stop()
print('______________________________________')

# Removing the periods of inactivity
print('Removing the periods of Inactivity...')
series_df=inactive_periods(series_df)
print('______________________________________')
# Forming Windows
win_size=720  #60mins
tnow=dt.now()
print('Creating Windows each size: ',win_size)
series_df=window(series_df,win_size)
print('Windows formed...')
print('Time Taken: ',dt.now()-tnow)
stats['pandas_duration'][stats['function']=='Windowing']=dt.now()-tnow

print('______________________________________')

# Adding the columns of Standard Deviation (1 min)
tnow=dt.now()
print('Adding columns to account for deviation in enmo and anglez 1 min rolling...')
series_df=rollingstd(series_df)
series_df['sd_anglez_1']=pd.to_numeric(series_df['sd_anglez_1'])
series_df['sd_enmo_1']=pd.to_numeric(series_df['sd_enmo_1'])
series_df['m_anglez_2']=pd.to_numeric(series_df['m_anglez_2'])
series_df['m_enmo_2']=pd.to_numeric(series_df['m_enmo_2'])
print('Std columns added...')
print('Time Taken: ',dt.now()-tnow)
stats['pandas_duration'][stats['function']=='Rolling']=dt.now()-tnow
print('______________________________________')

# Clustering the Data
tnow=dt.now()
print('Clustering the data based on enmo and anglez...')
series_df['cluster']=(clustering(series_df)+1)/4
print('Time Taken: ',dt.now()-tnow)
stats['pandas_duration'][stats['function']=='Clustering']=dt.now()-tnow
print('Added clusters...')
print('______________________________________')
stats
# Creating dataframes for training
X=series_df[['sd_anglez_1','sd_enmo_1','anglez','m_anglez_2','m_enmo_2','enmo','cluster']]
y=series_df[['sleep']]
X=scale(X)
print('Time Taken: ',dt.now()-tnow)
stats['pandas_duration'][stats['function']=='Scaling']=dt.now()-tnow


Importing Training Datasets
Datasets Imported...
Time Taken:  0:01:12.574773
Memory Used GB:  2.737819992005825
______________________________________
Merging the training datasets...
Datasets Merged...
Time Taken:  0:02:11.070029
Memory Used:  10.486951594240963
______________________________________
Removing the periods of Inactivity...
shape before application:  (127946340, 7)
shape after application:  (111766109, 8)
shape after completion:  (111766109, 7)
removed 
______________________________________
Creating Windows each size:  720


100%|██████████| 6928/6928 [00:16<00:00, 415.80it/s]


Memory Used:  1.0660394178703427
Windows formed...
Time Taken:  0:00:24.805480
______________________________________
Adding columns to account for deviation in enmo and anglez 1 min rolling...
anglez rolling std 12
anglez rolling std 2
enmo rolling std 12
enmo rolling mean 24
anglez rolling mean 24
enmo rolling std 2
Nans in sd_emno_1:  2959
enmo rolling mean 2
anglez rolling mean 2
Nans in sd_emno_1:  269
Nans after removal:  0
Memory Used GB:  1.268339148722589
Std columns added...
Time Taken:  0:01:04.363566
______________________________________
Clustering the data based on enmo and anglez...
Memory Used GB:  0.5817822804674506
Time Taken:  0:01:40.868780
Added clusters...
______________________________________
Memory Used GB:  0.9498797813430429
Time Taken:  0:01:42.322946


In [9]:
stats.head()

Unnamed: 0,function,pandas_mem,pandas_duration,polars_mem,polars_duration
0,Reading,2.73782,0:01:12.574996,0.0,0.0
1,Windowing,1.066039,0:00:24.805578,0.0,0.0
2,Rolling,1.268339,0:01:04.363659,0.0,0.0
3,Merging,10.486952,0:02:11.070406,0.0,0.0
4,Clustering,0.581782,0:01:40.868976,0.0,0.0
5,Scaling,0.94988,0:01:42.323198,0.0,0.0


In [31]:
stats['pandas_duration']=stats['pandas_duration'].dt.total_seconds()


In [34]:
stats

Unnamed: 0,function,pandas_mem,pandas_duration,polars_mem,polars_duration
0,Reading,2.73782,72.574996,0.0,0.0
1,Windowing,1.066039,24.805578,0.0,0.0
2,Rolling,1.268339,64.363659,0.0,0.0
3,Merging,10.486952,131.070406,0.0,0.0
4,Clustering,0.581782,100.868976,0.0,0.0
5,Scaling,0.94988,102.323198,0.0,0.0


# Polars Functions

In [35]:
def window_polars(df, win_size):
    tracemalloc.stop()
    ind=df.index[df['event'].isna()==False]
    c=0
    df['window']=pl.nan
    for i in tqdm(ind):
        a=i-win_size
        b=i+win_size
        df['window'].loc[a:i]=int(c)
        c=c+1
        df['window'].loc[i:b]=int(c)
        c=c+1
    df['window'].dropna(inplace=True)
    print( 'Memory Used GB: ',tracemalloc.get_traced_memory()[1]/1024/1024/1024)
    stats['pandas_mem'][stats['function']=='Clustering']=tracemalloc.get_traced_memory()[1]/1024/1024/1024
    tracemalloc.stop()
    return df[df['window'].is_null()==False]

In [None]:
def clustering(df):
    tracemalloc.start()
    global stats
    from sklearn.cluster import KMeans
    from sklearn.preprocessing import StandardScaler    
    X=df[['anglez','enmo']]
#Scalling the data
    scaler=StandardScaler()
    X_scaled=scaler.fit_transform(X)
#perform clustering
    model=KMeans(n_clusters=4,algorithm="elkan" )
    model.fit(X_scaled)
    print( 'Memory Used GB: ',tracemalloc.get_traced_memory()[1]/1024/1024/1024)
    stats['pandas_mem'][stats['function']=='Clustering']=tracemalloc.get_traced_memory()[1]/1024/1024/1024
    tracemalloc.stop()
    return model.labels_