In [None]:
## READING SPSS FILES
import pyreadstat
# Load only the first 10 rows; read_sav returns the data frame plus a metadata object.
# NOTE(review): `filename` is not defined in this cell - presumably set elsewhere; confirm.
df, meta = pyreadstat.read_sav(filename, row_limit=10)
# print codebook (the value labels stored in the file's metadata)
print(meta.variable_value_labels)
# To read only specific variables, pass usecols=["var1","var2"] to read_sav.
# formats_as_category=False disables the conversion to categorical. It is much
# faster, but missing values (and other coded values) then show up as raw codes
# such as -9, so be careful when using it.

In [None]:
## WORKING WITH POLARS instead of pandas (10x speed-up)
import polars as pl

# Read a CSV file. The arguments resemble pandas.read_csv but some have
# different names, e.g. n_rows (pandas: nrows) and separator (pandas: sep;
# older polars releases also called it sep).
csv_df = pl.read_csv('filename.csv')

# Read a Feather file. Feather v2 is the Arrow IPC file format, and polars
# exposes its reader as read_ipc (there is no pl.read_feather).
feather_df = pl.read_ipc('filename.feather')

# Convert a pandas DataFrame (e.g. the one returned by pyreadstat) to polars
pdf = pl.from_pandas(df)
# ... and convert the polars frame back to pandas when needed
pandas_df = pdf.to_pandas()

## Select variables (columns)
# pandas equivalent: selected_df_pandas = df[["var1", "var2"]]
subset_pdf = pdf.select([pl.col("var1"), pl.col("var2")])

## Select rows
# pandas equivalent: subset_df_pandas = df.loc[df["var1"] == 3]
# The condition is written as a polars expression on the column.
subset_df_polars = pdf.filter(pl.col("var1") == 3)

## Merge
# pandas equivalent (left join):
#   merged_df_pandas = pd.merge(df1, df2, on=["var1", "var2"], how="left")
# pdf1 / pdf2 stand for any two polars DataFrames sharing the key columns.
merged_pdf_polars = pdf1.join(
    pdf2,
    on=["var1", "var2"],
    how="left",
)


## Lazy read to filter
# NOTE: this rebinds `pdf` to the result of the lazy query.
pdf = (
    # scan_csv only builds a lazy query; nothing is read yet
    pl.scan_csv("my_long_file.csv")
    # keep just these 2 columns (the other columns will not be read)
    .select([pl.col("a"), pl.col("c")])
    # the filter is pushed down into the scan, so less data is read into memory
    .filter(pl.col("a") > 10)
    # run the query and materialise the data
    .collect()
)