<a href="https://colab.research.google.com/github/stephenindia1/Python-Faker/blob/main/Colab_Polars_Notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Polars:**
Polars is a DataFrame library written in Rust, which allows for highly optimized and efficient execution, particularly on large datasets. It leverages multi-threading and optimized query plans for faster data processing.

In [None]:
import pandas as pd
import polars as pl
import numpy as np
import timeit
import os

# --- 1. Create a sample Parquet file ---
# NOTE(review): this path looks like a mounted Google Drive folder; the write
# below will fail if that directory does not exist — confirm Drive is mounted.
file_path = '/content/drive/MyDrive/Colab Notebooks/daily_weather.parquet'
if not os.path.exists(file_path):
    # Synthetic data is only generated when nothing exists at file_path;
    # an already-present file is left untouched and benchmarked as-is.
    n_rows = 1_000_000  # large enough for a meaningful read-time comparison
    rng = np.random.default_rng(42)  # fixed seed: a fresh run regenerates the same file
    data = {
        'id': np.arange(n_rows),
        'value': rng.random(n_rows),
        'category': rng.choice(['A', 'B', 'C'], size=n_rows)
    }
    df_create = pd.DataFrame(data)
    df_create.to_parquet(file_path, index=False)
    print(f"Created a sample parquet file: {file_path}\n")

# --- 2. Define functions for timeit ---
def read_pandas():
    """Load the parquet file at ``file_path`` into a pandas DataFrame.

    The pyarrow engine is requested explicitly instead of relying on
    pandas' default engine selection.
    """
    return pd.read_parquet(file_path, engine='pyarrow')

def read_polars_eager():
    """Load the parquet file at ``file_path`` eagerly with Polars and return the DataFrame."""
    return pl.read_parquet(file_path)

def read_polars_lazy():
    """Scan the parquet file at ``file_path`` lazily with Polars, then collect it into a DataFrame."""
    lazy_frame = pl.scan_parquet(file_path)
    return lazy_frame.collect()

# --- 3. Time the operations ---
# timeit.timeit(func, number=N) returns the TOTAL elapsed time of N calls, not
# an average, so each total is divided by N to get the per-read average that
# the printed messages report.
n_runs = 10

# Time Pandas read_parquet
pandas_time = timeit.timeit(read_pandas, number=n_runs) / n_runs
print(f"Pandas read_parquet time (avg over {n_runs} runs): {pandas_time:.4f} seconds")

# Time Polars eager read_parquet
polars_eager_time = timeit.timeit(read_polars_eager, number=n_runs) / n_runs
print(f"Polars eager read_parquet time (avg over {n_runs} runs): {polars_eager_time:.4f} seconds")

# Time Polars lazy scan_parquet and collect
polars_lazy_time = timeit.timeit(read_polars_lazy, number=n_runs) / n_runs
print(f"Polars lazy scan_parquet time (avg over {n_runs} runs): {polars_lazy_time:.4f} seconds")

# --- 4. Clean up (optional) ---
# os.remove(file_path)
# print(f"\nRemoved the sample file: {file_path}")


Pandas read_parquet time (avg over 10 runs): 66.4953 seconds
Polars eager read_parquet time (avg over 10 runs): 33.3602 seconds
Polars lazy scan_parquet time (avg over 10 runs): 32.3216 seconds


**14 of the most used Polars functions** - explained with examples

In [None]:
import polars as pl
import numpy as np

# Sample data: five records, stored as one list per column.
# `data` and `df` are reused by the example cells that follow.
data = dict(
    name=["Alice", "Bob", "Charlie", "David", "Eve"],
    age=[25, 30, 35, 40, 45],
    city=["New York", "Paris", "London", "New York", "London"],
    score=[85.5, 72.0, 91.2, 88.0, 76.5],
    id=[1, 2, 3, 4, 5],
)
df = pl.DataFrame(data)
print("Original DataFrame:", df, sep="\n")
print("-" * 30)

Original DataFrame:
shape: (5, 5)
┌─────────┬─────┬──────────┬───────┬─────┐
│ name    ┆ age ┆ city     ┆ score ┆ id  │
│ ---     ┆ --- ┆ ---      ┆ ---   ┆ --- │
│ str     ┆ i64 ┆ str      ┆ f64   ┆ i64 │
╞═════════╪═════╪══════════╪═══════╪═════╡
│ Alice   ┆ 25  ┆ New York ┆ 85.5  ┆ 1   │
│ Bob     ┆ 30  ┆ Paris    ┆ 72.0  ┆ 2   │
│ Charlie ┆ 35  ┆ London   ┆ 91.2  ┆ 3   │
│ David   ┆ 40  ┆ New York ┆ 88.0  ┆ 4   │
│ Eve     ┆ 45  ┆ London   ┆ 76.5  ┆ 5   │
└─────────┴─────┴──────────┴───────┴─────┘
------------------------------


In [None]:
# pl.DataFrame() - constructs a Polars DataFrame from various data structures such as dictionaries, lists, or NumPy arrays
# Here `data` is the column-name -> list-of-values dict built in the sample-data cell.

df = pl.DataFrame(data)

In [None]:
# head(n) - returns the first n rows; handy for a quick look at the top of a DataFrame
# Example: take the first 3 rows
top_rows = df.head(3)
print("df.head(3):", top_rows, sep="\n")
print("-" * 30)

df.head(3):
shape: (3, 5)
┌─────────┬─────┬──────────┬───────┬─────┐
│ name    ┆ age ┆ city     ┆ score ┆ id  │
│ ---     ┆ --- ┆ ---      ┆ ---   ┆ --- │
│ str     ┆ i64 ┆ str      ┆ f64   ┆ i64 │
╞═════════╪═════╪══════════╪═══════╪═════╡
│ Alice   ┆ 25  ┆ New York ┆ 85.5  ┆ 1   │
│ Bob     ┆ 30  ┆ Paris    ┆ 72.0  ┆ 2   │
│ Charlie ┆ 35  ┆ London   ┆ 91.2  ┆ 3   │
└─────────┴─────┴──────────┴───────┴─────┘
------------------------------


In [None]:
# filter() - keeps only the rows for which a boolean (predicate) expression is true

# Example: keep people older than 30
is_over_30 = pl.col("age") > 30
filtered_df = df.filter(is_over_30)
print("df.filter(pl.col('age') > 30):", filtered_df, sep="\n")
print("-" * 30)

df.filter(pl.col('age') > 30):
shape: (3, 5)
┌─────────┬─────┬──────────┬───────┬─────┐
│ name    ┆ age ┆ city     ┆ score ┆ id  │
│ ---     ┆ --- ┆ ---      ┆ ---   ┆ --- │
│ str     ┆ i64 ┆ str      ┆ f64   ┆ i64 │
╞═════════╪═════╪══════════╪═══════╪═════╡
│ Charlie ┆ 35  ┆ London   ┆ 91.2  ┆ 3   │
│ David   ┆ 40  ┆ New York ┆ 88.0  ┆ 4   │
│ Eve     ┆ 45  ┆ London   ┆ 76.5  ┆ 5   │
└─────────┴─────┴──────────┴───────┴─────┘
------------------------------


In [None]:
# select() - returns a DataFrame containing only the requested columns

# Example: keep just the "name" and "city" columns
wanted_columns = ["name", "city"]
selected_cols = df.select(wanted_columns)
print("df.select(['name', 'city']):", selected_cols, sep="\n")
print("-" * 30)

df.select(['name', 'city']):
shape: (5, 2)
┌─────────┬──────────┐
│ name    ┆ city     │
│ ---     ┆ ---      │
│ str     ┆ str      │
╞═════════╪══════════╡
│ Alice   ┆ New York │
│ Bob     ┆ Paris    │
│ Charlie ┆ London   │
│ David   ┆ New York │
│ Eve     ┆ London   │
└─────────┴──────────┘
------------------------------


In [None]:
# with_columns() - adds new columns, or replaces existing ones, from expressions

# Example: derive "age_plus_10" from the "age" column
age_plus_10_expr = (pl.col("age") + 10).alias("age_plus_10")
df_with_new_col = df.with_columns(age_plus_10_expr)
print("df.with_columns(age_plus_10):", df_with_new_col, sep="\n")
print("-" * 30)

df.with_columns(age_plus_10):
shape: (5, 6)
┌─────────┬─────┬──────────┬───────┬─────┬─────────────┐
│ name    ┆ age ┆ city     ┆ score ┆ id  ┆ age_plus_10 │
│ ---     ┆ --- ┆ ---      ┆ ---   ┆ --- ┆ ---         │
│ str     ┆ i64 ┆ str      ┆ f64   ┆ i64 ┆ i64         │
╞═════════╪═════╪══════════╪═══════╪═════╪═════════════╡
│ Alice   ┆ 25  ┆ New York ┆ 85.5  ┆ 1   ┆ 35          │
│ Bob     ┆ 30  ┆ Paris    ┆ 72.0  ┆ 2   ┆ 40          │
│ Charlie ┆ 35  ┆ London   ┆ 91.2  ┆ 3   ┆ 45          │
│ David   ┆ 40  ┆ New York ┆ 88.0  ┆ 4   ┆ 50          │
│ Eve     ┆ 45  ┆ London   ┆ 76.5  ┆ 5   ┆ 55          │
└─────────┴─────┴──────────┴───────┴─────┴─────────────┘
------------------------------


In [None]:
# group_by() - Groups data by one or more columns to perform aggregations like sum, mean, etc

# Example: Calculate the mean score by city
# maintain_order=True keeps the groups in order of first appearance; without it
# Polars gives no guarantee about row order, so the printed table could differ
# from one run to the next.
grouped_df = df.group_by("city", maintain_order=True).agg(
    pl.col("score").mean().alias("average_score")
)
print("df.group_by('city').agg(pl.col('score').mean()):")
print(grouped_df)
print("-" * 30)

df.group_by('city').agg(pl.col('score').mean()):
shape: (3, 2)
┌──────────┬───────────────┐
│ city     ┆ average_score │
│ ---      ┆ ---           │
│ str      ┆ f64           │
╞══════════╪═══════════════╡
│ New York ┆ 86.75         │
│ London   ┆ 83.85         │
│ Paris    ┆ 72.0          │
└──────────┴───────────────┘
------------------------------


In [None]:
# sort() - orders the rows by the values of one or more columns

# Example: oldest first (sort on "age", descending)
sorted_df = df.sort(by="age", descending=True)
print("df.sort('age', descending=True):", sorted_df, sep="\n")
print("-" * 30)

df.sort('age', descending=True):
shape: (5, 5)
┌─────────┬─────┬──────────┬───────┬─────┐
│ name    ┆ age ┆ city     ┆ score ┆ id  │
│ ---     ┆ --- ┆ ---      ┆ ---   ┆ --- │
│ str     ┆ i64 ┆ str      ┆ f64   ┆ i64 │
╞═════════╪═════╪══════════╪═══════╪═════╡
│ Eve     ┆ 45  ┆ London   ┆ 76.5  ┆ 5   │
│ David   ┆ 40  ┆ New York ┆ 88.0  ┆ 4   │
│ Charlie ┆ 35  ┆ London   ┆ 91.2  ┆ 3   │
│ Bob     ┆ 30  ┆ Paris    ┆ 72.0  ┆ 2   │
│ Alice   ┆ 25  ┆ New York ┆ 85.5  ┆ 1   │
└─────────┴─────┴──────────┴───────┴─────┘
------------------------------


In [None]:
# join() - merges two DataFrames on a shared key column, much like a SQL JOIN

# Right-hand DataFrame: one status per id (the status for id 5 is an empty string)
data2 = dict(
    status=["Active", "Active", "Inactive", "Active", ""],
    id=[1, 2, 3, 4, 5],
)
df2 = pl.DataFrame(data2)

# Example: inner join on the "id" column
joined_df = df.join(df2, on="id", how="inner")
print("df.join(df2, on='id', how='inner'):", joined_df, sep="\n")
print("-" * 30)

df.join(df2, on='id', how='inner'):
shape: (5, 6)
┌─────────┬─────┬──────────┬───────┬─────┬──────────┐
│ name    ┆ age ┆ city     ┆ score ┆ id  ┆ status   │
│ ---     ┆ --- ┆ ---      ┆ ---   ┆ --- ┆ ---      │
│ str     ┆ i64 ┆ str      ┆ f64   ┆ i64 ┆ str      │
╞═════════╪═════╪══════════╪═══════╪═════╪══════════╡
│ Alice   ┆ 25  ┆ New York ┆ 85.5  ┆ 1   ┆ Active   │
│ Bob     ┆ 30  ┆ Paris    ┆ 72.0  ┆ 2   ┆ Active   │
│ Charlie ┆ 35  ┆ London   ┆ 91.2  ┆ 3   ┆ Inactive │
│ David   ┆ 40  ┆ New York ┆ 88.0  ┆ 4   ┆ Active   │
│ Eve     ┆ 45  ┆ London   ┆ 76.5  ┆ 5   ┆          │
└─────────┴─────┴──────────┴───────┴─────┴──────────┘
------------------------------


In [None]:
# fill_null() - replaces missing (null) values with a given value or via a fill strategy (e.g., forward, backward, mean, zero)

# Single-column DataFrame with one null; it is reused by the drop_nulls() example
df_nulls = pl.DataFrame({"A": [1, 2, None, 4]})

# Example: replace every null with 0
df_filled = df_nulls.fill_null(value=0)
print("df_nulls.fill_null(0):", df_filled, sep="\n")
print("-" * 30)

df_nulls.fill_null(0):
shape: (4, 1)
┌─────┐
│ A   │
│ --- │
│ i64 │
╞═════╡
│ 1   │
│ 2   │
│ 0   │
│ 4   │
└─────┘
------------------------------


In [None]:
# drop_nulls() - discards rows that hold a null in any column (or only in the columns you specify)

# Example: drop every row containing a null (uses df_nulls from the fill_null() cell)
df_dropped = df_nulls.drop_nulls()
print("df_nulls.drop_nulls():", df_dropped, sep="\n")
print("-" * 30)

df_nulls.drop_nulls():
shape: (3, 1)
┌─────┐
│ A   │
│ --- │
│ i64 │
╞═════╡
│ 1   │
│ 2   │
│ 4   │
└─────┘
------------------------------


In [None]:
# unique() - Removes duplicate rows from the DataFrame
# (the Polars counterpart of pandas' drop_duplicates())

# Example data with duplicates: rows 2 and 3 are identical,
# rows 4 and 5 share 'a' and 'b' but differ in 'c'
df_dups = pl.DataFrame({
    "a": [1, 2, 2, 3, 3, 4],
    "b": ["x", "y", "y", "z", "z", "w"],
    "c": [10, 20, 20, 30, 40, 50]
})

# maintain_order=True keeps the original row order; without it the order of
# the result is not guaranteed and the printed table can change between runs.
df_dups_unique = df_dups.unique(maintain_order=True)
print("DataFrame after removing duplicates based on all columns:")
print(df_dups_unique)

# Remove duplicates based on columns 'a' and 'b'
# keep="first" pins WHICH duplicate survives (c=30 rather than c=40 for a=3);
# the default keep="any" makes no promise about that.
df_unique_subset = df_dups.unique(subset=["a", "b"], keep="first", maintain_order=True)
print("\nDataFrame after removing duplicates based on 'a' and 'b':")
print(df_unique_subset)

DataFrame after removing duplicates based on all columns:
shape: (5, 3)
┌─────┬─────┬─────┐
│ a   ┆ b   ┆ c   │
│ --- ┆ --- ┆ --- │
│ i64 ┆ str ┆ i64 │
╞═════╪═════╪═════╡
│ 3   ┆ z   ┆ 30  │
│ 2   ┆ y   ┆ 20  │
│ 3   ┆ z   ┆ 40  │
│ 4   ┆ w   ┆ 50  │
│ 1   ┆ x   ┆ 10  │
└─────┴─────┴─────┘

DataFrame after removing duplicates based on 'a' and 'b':
shape: (4, 3)
┌─────┬─────┬─────┐
│ a   ┆ b   ┆ c   │
│ --- ┆ --- ┆ --- │
│ i64 ┆ str ┆ i64 │
╞═════╪═════╪═════╡
│ 1   ┆ x   ┆ 10  │
│ 2   ┆ y   ┆ 20  │
│ 3   ┆ z   ┆ 30  │
│ 4   ┆ w   ┆ 50  │
└─────┴─────┴─────┘


In [None]:
# cast() - converts a column to another compatible data type (dtype)

# Example: turn "age" from Int64 into Float64
age_as_float = pl.col("age").cast(pl.Float64)
df_cast = df.with_columns(age_as_float)
print("df.with_columns(pl.col('age').cast(pl.Float64)):", df_cast.dtypes, sep="\n")
print("-" * 30)

# Show the original frame next to the cast copy
print(df, df_cast, sep="\n")

df.with_columns(pl.col('age').cast(pl.Float64)):
[String, Float64, String, Float64, Int64]
------------------------------
shape: (5, 5)
┌─────────┬─────┬──────────┬───────┬─────┐
│ name    ┆ age ┆ city     ┆ score ┆ id  │
│ ---     ┆ --- ┆ ---      ┆ ---   ┆ --- │
│ str     ┆ i64 ┆ str      ┆ f64   ┆ i64 │
╞═════════╪═════╪══════════╪═══════╪═════╡
│ Alice   ┆ 25  ┆ New York ┆ 85.5  ┆ 1   │
│ Bob     ┆ 30  ┆ Paris    ┆ 72.0  ┆ 2   │
│ Charlie ┆ 35  ┆ London   ┆ 91.2  ┆ 3   │
│ David   ┆ 40  ┆ New York ┆ 88.0  ┆ 4   │
│ Eve     ┆ 45  ┆ London   ┆ 76.5  ┆ 5   │
└─────────┴─────┴──────────┴───────┴─────┘
shape: (5, 5)
┌─────────┬──────┬──────────┬───────┬─────┐
│ name    ┆ age  ┆ city     ┆ score ┆ id  │
│ ---     ┆ ---  ┆ ---      ┆ ---   ┆ --- │
│ str     ┆ f64  ┆ str      ┆ f64   ┆ i64 │
╞═════════╪══════╪══════════╪═══════╪═════╡
│ Alice   ┆ 25.0 ┆ New York ┆ 85.5  ┆ 1   │
│ Bob     ┆ 30.0 ┆ Paris    ┆ 72.0  ┆ 2   │
│ Charlie ┆ 35.0 ┆ London   ┆ 91.2  ┆ 3   │
│ David   ┆ 40.0 ┆ New Yo

In [None]:
# lazy() - turns an eager DataFrame into a LazyFrame so Polars can optimize the whole query plan before running it, which helps performance on large datasets

# Example: build a lazy filter + sort query, then execute it with collect()
lazy_df = (
    df.lazy()
    .filter(pl.col("age") < 40)
    .sort("age")
)
result_df = lazy_df.collect()
print("Lazy filtered and sorted DataFrame:", result_df, sep="\n")
print("-" * 30)

Lazy filtered and sorted DataFrame:
shape: (3, 5)
┌─────────┬─────┬──────────┬───────┬─────┐
│ name    ┆ age ┆ city     ┆ score ┆ id  │
│ ---     ┆ --- ┆ ---      ┆ ---   ┆ --- │
│ str     ┆ i64 ┆ str      ┆ f64   ┆ i64 │
╞═════════╪═════╪══════════╪═══════╪═════╡
│ Alice   ┆ 25  ┆ New York ┆ 85.5  ┆ 1   │
│ Bob     ┆ 30  ┆ Paris    ┆ 72.0  ┆ 2   │
│ Charlie ┆ 35  ┆ London   ┆ 91.2  ┆ 3   │
└─────────┴─────┴──────────┴───────┴─────┘
------------------------------


In [None]:
# value_counts() - counts how many times each distinct value occurs in a Series/column

# Example: number of rows per city (the result is a single struct column)
city_count_expr = pl.col("city").value_counts()
city_counts = df.select(city_count_expr)
print("pl.col('city').value_counts():", city_counts, sep="\n")

pl.col('city').value_counts():
shape: (3, 1)
┌────────────────┐
│ city           │
│ ---            │
│ struct[2]      │
╞════════════════╡
│ {"New York",2} │
│ {"Paris",1}    │
│ {"London",2}   │
└────────────────┘
