In [1]:
import polars as pl

In [6]:
#Read a CSV file and an Excel file into two separate dataframes
df1 = pl.read_csv("../data/data.csv")
df2 = pl.read_excel("../data/data.xlsx")

In [10]:
#View variable names and data types
print(df1.schema)

Schema({'Name': String, 'Age': Int64, 'Category': String, 'Value': Int64, 'Count': Int64})


In [12]:
#View variable names
print(df1.columns)

['Name', 'Age', 'Category', 'Value', 'Count']


In [16]:
#View first and last rows
print(df1.head())  # Default is 5
print(df1.head(10))
print(df2.tail(10))

shape: (5, 5)
┌──────────────────┬─────┬──────────┬───────┬───────┐
│ Name             ┆ Age ┆ Category ┆ Value ┆ Count │
│ ---              ┆ --- ┆ ---      ┆ ---   ┆ ---   │
│ str              ┆ i64 ┆ str      ┆ i64   ┆ i64   │
╞══════════════════╪═════╪══════════╪═══════╪═══════╡
│ Nicole Allen     ┆ 25  ┆ C        ┆ 54    ┆ 5     │
│ Anthony Martinez ┆ 54  ┆ A        ┆ 49    ┆ 8     │
│ Shawn Gilbert    ┆ 39  ┆ B        ┆ 64    ┆ 3     │
│ Eric Ellis       ┆ 35  ┆ A        ┆ 35    ┆ 4     │
│ Chelsea Flores   ┆ 24  ┆ C        ┆ 6     ┆ 3     │
└──────────────────┴─────┴──────────┴───────┴───────┘
shape: (10, 5)
┌──────────────────┬──────┬──────────┬───────┬───────┐
│ Name             ┆ Age  ┆ Category ┆ Value ┆ Count │
│ ---              ┆ ---  ┆ ---      ┆ ---   ┆ ---   │
│ str              ┆ i64  ┆ str      ┆ i64   ┆ i64   │
╞══════════════════╪══════╪══════════╪═══════╪═══════╡
│ Nicole Allen     ┆ 25   ┆ C        ┆ 54    ┆ 5     │
│ Anthony Martinez ┆ 54   ┆ A        ┆ 49    ┆ 

In [18]:
#Summary statistics
print(df1.describe())

shape: (9, 6)
┌────────────┬─────────────────┬───────────┬──────────┬───────────┬──────────┐
│ statistic  ┆ Name            ┆ Age       ┆ Category ┆ Value     ┆ Count    │
│ ---        ┆ ---             ┆ ---       ┆ ---      ┆ ---       ┆ ---      │
│ str        ┆ str             ┆ f64       ┆ str      ┆ f64       ┆ f64      │
╞════════════╪═════════════════╪═══════════╪══════════╪═══════════╪══════════╡
│ count      ┆ 100             ┆ 99.0      ┆ 99       ┆ 98.0      ┆ 99.0     │
│ null_count ┆ 0               ┆ 1.0       ┆ 1        ┆ 2.0       ┆ 1.0      │
│ mean       ┆ null            ┆ 40.474747 ┆ null     ┆ 45.530612 ┆ 5.232323 │
│ std        ┆ null            ┆ 13.965222 ┆ null     ┆ 28.023629 ┆ 2.79149  │
│ min        ┆ Alexandra Green ┆ 18.0      ┆ A        ┆ 1.0       ┆ 1.0      │
│ 25%        ┆ null            ┆ 29.0      ┆ null     ┆ 21.0      ┆ 3.0      │
│ 50%        ┆ null            ┆ 41.0      ┆ null     ┆ 44.0      ┆ 5.0      │
│ 75%        ┆ null            ┆ 53.0 

In [20]:
#Count duplicate rows
duplicate_count = df1.duplicate().shape[0]
print(duplicate_count)

AttributeError: 'DataFrame' object has no attribute 'duplicate'

In [22]:
#Display which rows have duplicates
duplicates = df1.is_duplicated()
print(duplicates)

shape: (100,)
Series: '' [bool]
[
	false
	false
	false
	false
	false
	…
	false
	false
	false
	false
	false
]


In [24]:
#View instances of the duplicated rows
duplicated_rows = df1.filter(duplicates)
print(duplicated_rows)

shape: (4, 5)
┌─────────────────┬─────┬──────────┬───────┬───────┐
│ Name            ┆ Age ┆ Category ┆ Value ┆ Count │
│ ---             ┆ --- ┆ ---      ┆ ---   ┆ ---   │
│ str             ┆ i64 ┆ str      ┆ i64   ┆ i64   │
╞═════════════════╪═════╪══════════╪═══════╪═══════╡
│ Alexandra Green ┆ 21  ┆ C        ┆ 7     ┆ 9     │
│ Alexandra Green ┆ 21  ┆ C        ┆ 7     ┆ 9     │
│ Mark Hester     ┆ 35  ┆ C        ┆ 44    ┆ 4     │
│ Mark Hester     ┆ 35  ┆ C        ┆ 44    ┆ 4     │
└─────────────────┴─────┴──────────┴───────┴───────┘


In [26]:
#Checking for duplicates using a subset of columns
duplicates_subset = df1.is_duplicated(subset=["Name", "Age"])
print(df1.filter(duplicates_subset))

TypeError: DataFrame.is_duplicated() got an unexpected keyword argument 'subset'

In [28]:
#Drop duplicate rows
df1 = df1.unique()

In [30]:
#Count missing values
print(df1.null_count())

shape: (1, 5)
┌──────┬─────┬──────────┬───────┬───────┐
│ Name ┆ Age ┆ Category ┆ Value ┆ Count │
│ ---  ┆ --- ┆ ---      ┆ ---   ┆ ---   │
│ u32  ┆ u32 ┆ u32      ┆ u32   ┆ u32   │
╞══════╪═════╪══════════╪═══════╪═══════╡
│ 0    ┆ 1   ┆ 1        ┆ 2     ┆ 1     │
└──────┴─────┴──────────┴───────┴───────┘


In [32]:
#Display rows with missing values
missing_data = df1.filter(df1.null_count().sum_horizontal() > 0)
print(missing_data)

shape: (98, 5)
┌─────────────────────┬─────┬──────────┬───────┬───────┐
│ Name                ┆ Age ┆ Category ┆ Value ┆ Count │
│ ---                 ┆ --- ┆ ---      ┆ ---   ┆ ---   │
│ str                 ┆ i64 ┆ str      ┆ i64   ┆ i64   │
╞═════════════════════╪═════╪══════════╪═══════╪═══════╡
│ Angela Oneill DDS   ┆ 64  ┆ C        ┆ 55    ┆ 3     │
│ Lisa Chen           ┆ 61  ┆ B        ┆ 41    ┆ 10    │
│ Renee Austin        ┆ 59  ┆ B        ┆ 78    ┆ 10    │
│ Jamie Knight        ┆ 52  ┆ B        ┆ 34    ┆ 10    │
│ Linda Dalton        ┆ 61  ┆ A        ┆ 61    ┆ 3     │
│ …                   ┆ …   ┆ …        ┆ …     ┆ …     │
│ Eric Myers          ┆ 30  ┆ B        ┆ 4     ┆ 2     │
│ Jacob Martinez      ┆ 22  ┆ A        ┆ 1     ┆ 9     │
│ Eric Moore          ┆ 37  ┆ C        ┆ 18    ┆ 1     │
│ Jonathan Coleman    ┆ 53  ┆ A        ┆ null  ┆ null  │
│ Stephanie Henderson ┆ 33  ┆ C        ┆ 39    ┆ 5     │
└─────────────────────┴─────┴──────────┴───────┴───────┘


In [34]:
#Drop rows with missing values
df1 = df1.drop_nulls()

In [36]:
#Dropping specific rows
df1 = df1.filter(~df1.index().is_in([3, 8]))

AttributeError: 'DataFrame' object has no attribute 'index'

In [38]:
#Adding a row
new_row = pl.DataFrame({"A": [4], "B": ["d"]})
df1 = pl.concat([df1, new_row], how="vertical")

ShapeError: unable to append to a DataFrame of width 5 with a DataFrame of width 2

In [40]:
#Mapping values with a dictionary
mapping = {"A": "Awesome", "B": "Sub-Awesome", "C": "Sub-Awesome"}
df1 = df1.with_columns(pl.col("Category").map_dict(mapping).alias("Category2"))

AttributeError: 'Expr' object has no attribute 'map_dict'