In [1]:
import numpy as np
import polars as pl

###                                                              Series
```go
        1. Unlike 'Pandas', polars can't hold Heterogeneous values in the Same Column! The datatype of ALL THE VALUES of A Column must be only ONE, either 'string' or 'integer' or 'float' bla bla, can't be mixed of [string, integer, float, bool.....].

        2. Inside polars, 'Series is a class'! So 'a = pl.Series(..)' creating a Series object and 'a' is that Series Object!
        3. Pandas have indexing in 'Series' but in polars we don't have indexing even in 'Series'. That's why, in 'pandas' you can pass 'dict' but you need to pass 1D ARRAY / LIST / SET..... in polars.Series(..) but need to be 1D must.
        4. Polars allows NaN(np.nan) values for 'float columns'. These NaN values are considered to be a type of 'floating point data' rather than 'missing data'. 
        5. You can define a `missing value` with the python `none` value.
        6. NEVER USE 'np.int8' or bla bla.. ALWAYS USE pl.Uint8 i.e. 'use datatypes from pl itself' because assume Series a has np.int8 values and Series b has np.uint8 values. When we will do `a + b`, polars need to add them and create a SINGLE DATATYPE but it will fail as it can't recognize np.int8 as pl's own datatype.

In [19]:
names = [f"Maria{i}" for i in range(8)]
marks = np.random.choice(range(1, 10), size=(8,), replace=False)
mixed = [marks[i] if i&1 else names[i] for i in range(8)]

series_names = pl.Series("Maria's Names", names) # (Name_of_the_series, values)
series_marks = pl.Series("Maria's Marks", marks)
series_mixed = pl.Series("Maria's Name and Mark", mixed, strict=False)
# hover on pl.Series and you can see going in details of what strict=False means.

print(f"{series_names}\n\n{series_marks}\n\n{series_mixed}\n\n")
print(f"{series_mixed[1]} inside \"Maria's Name and Mark\" = {type(series_mixed[1])}")

shape: (8,)
Series: 'Maria's Names' [str]
[
	"Maria0"
	"Maria1"
	"Maria2"
	"Maria3"
	"Maria4"
	"Maria5"
	"Maria6"
	"Maria7"
]

shape: (8,)
Series: 'Maria's Marks' [i32]
[
	8
	1
	9
	5
	4
	7
	2
	3
]

shape: (8,)
Series: 'Maria's Name and Mark' [str]
[
	"Maria0"
	"1"
	"Maria2"
	"5"
	"Maria4"
	"7"
	"Maria6"
	"3"
]


1 inside "Maria's Name and Mark" = <class 'str'>


In [5]:
a = pl.Series('Casting to float', [1, 23, 8.8, 2], strict=False)
print(a, '\n') # so strict=False turns ['str', int] to ['str'] and from [float, int] to [float]

shape: (4,)
Series: 'Casting to float' [f64]
[
	1.0
	23.0
	8.8
	2.0
] 



In [2]:
dictionary = {
    'Name'   : 'Maria',
    'Hobby'  : 'Kick',
    'Weight' : 80,
    'Age'    : 21.8
}

series_info = pl.Series("Maria's Info", dictionary.values(), strict=False)
print(series_info)                      #----Must be 1D----

shape: (4,)
Series: 'Maria's Info' [str]
[
	"Maria"
	"Kick"
	"80"
	"21.8"
]


#                                               Series from real world dataset
```js
        By default 'pl.read_csv(..)' reads the file in 'DataFrame' format! To convert it into a 'Series' => 'DataFrame'.to_series().
        By default polars set the FIRST COLUMN as Series when you convert the DataFrame to Series. So write the Column Name in 'columns=[..]' which you want to import from the csv file and then turn the DataFrame to Series now with your desired series. 

In [2]:
extra_info = lambda extra='': f"        --> {extra}\n-----------------------------------------------------------\n"
srs:pl.Series = pl.read_csv("D:\\datasets\\2ColumnExcel.csv", columns=['Age']).to_series()
#srs1:pl.Series = pl.read_csv("D:\\datasets\\2ColumnExcel.csv", columns=['Names']).to_series()
srs

Age
i64
25
25
27
22
29
…
20
26
24
26


In [13]:
# purely numpy indexing

print(srs[-2], extra_info('srs[-2]'))
print(srs[2:-2], extra_info('srs[2:-2]'))
print(srs[[1, 3, -1, -3]], extra_info('srs[[1, 3, -1, -3]]'))

26  --> srs[-2]
-------------------------

shape: (453,)
Series: 'Age' [i64]
[
	27
	22
	29
	29
	21
	…
	28
	26
	20
	26
	24
]  --> srs[2:-2]
-------------------------

shape: (4,)
Series: 'Age' [i64]
[
	25
	22
	26
	24
]  --> srs[[1, 3, -1, -3]]
-------------------------



#                                                        Series Attributes

In [37]:
print(srs)

shape: (457,)
Series: 'Age' [i64]
[
	25
	25
	27
	22
	29
	…
	20
	26
	24
	26
	26
]


In [55]:
print(srs.dtype, extra_info('dtype'))
print(srs.shape, extra_info('shape'))
print(srs.name,  extra_info('column name'))
print(srs.__class__, extra_info('class'))

# There's more 2-3 attributes but not that important right now.. Seems polars has functions more than attributes.

Int64  --> dtype
-------------------------

(457,)  --> shape
-------------------------

Age  --> column name
-------------------------

<class 'polars.series.series.Series'>  --> class
-------------------------



#                                                       Seried Methods

In [28]:
print(srs.is_empty(), extra_info('is_empty()'))
print(srs.has_nulls(), extra_info('has_nulls()'))
print(srs.is_unique().all(), extra_info('is_unique()')) # polar's "is_unique()" returns a Series of Boolean Values i.e. Boolean Mask. ".all" returns True if ALL THE VALUES ARE TRUE, else FALSE.
print(srs.is_sorted(), extra_info('is_sorted()'))
print(srs.is_duplicated(), extra_info('is_duplicated()')) # returns a Boolean Series / Mask, True = Duplicated i.e. Not Unique
print(srs.head(4), extra_info('head(4)'))
print(srs.tail(4), extra_info('tail(4)'))
print(srs.sample(4), extra_info('sample(4)')) # randomly picks 4 different values.
print(srs.len(), extra_info('len()')) # count EVERY VALUE including NaN values, equivalent to pandas .size
print(srs.count(), extra_info('count()')) # count only NON NAN values.

False  --> is_empty()
-------------------------

False  --> has_nulls()
-------------------------

False  --> is_unique()
-------------------------

False  --> is_sorted()
-------------------------

shape: (457,)
Series: 'Age' [bool]
[
	true
	true
	true
	true
	true
	…
	true
	true
	true
	true
	true
]  --> is_duplicated()
-------------------------

shape: (4,)
Series: 'Age' [i64]
[
	25
	25
	27
	22
]  --> head(4)
-------------------------

shape: (4,)
Series: 'Age' [i64]
[
	26
	24
	26
	26
]  --> tail(4)
-------------------------

shape: (4,)
Series: 'Age' [i64]
[
	39
	26
	34
	37
]  --> sample(4)
-------------------------

457  --> len()
-------------------------

457  --> count()
-------------------------



In [38]:
print(srs.value_counts(), extra_info('value_counts(), In Random Order')) # returns (Original Column Uniquely, Frquencies) in RANDOM ORDER i.e. NOT IN THE ORDER THE (UNIQUE) VALUES are in that column.
print(srs.value_counts(sort=True), extra_info('value_counts(sort=True), In Descending Order')) # In Descending Order
print(type(srs.value_counts()), extra_info('type(value_counts())'))

shape: (22, 2)
┌─────┬───────┐
│ Age ┆ count │
│ --- ┆ ---   │
│ i64 ┆ u32   │
╞═════╪═══════╡
│ 22  ┆ 26    │
│ 19  ┆ 2     │
│ 23  ┆ 41    │
│ 37  ┆ 4     │
│ 40  ┆ 3     │
│ …   ┆ …     │
│ 27  ┆ 41    │
│ 31  ┆ 22    │
│ 26  ┆ 36    │
│ 24  ┆ 47    │
│ 36  ┆ 10    │
└─────┴───────┘  --> value_counts(), In Random Order
-------------------------

shape: (22, 2)
┌─────┬───────┐
│ Age ┆ count │
│ --- ┆ ---   │
│ i64 ┆ u32   │
╞═════╪═══════╡
│ 24  ┆ 47    │
│ 25  ┆ 45    │
│ 27  ┆ 41    │
│ 23  ┆ 41    │
│ 26  ┆ 36    │
│ …   ┆ …     │
│ 37  ┆ 4     │
│ 38  ┆ 4     │
│ 40  ┆ 3     │
│ 39  ┆ 2     │
│ 19  ┆ 2     │
└─────┴───────┘  --> value_counts(sort=True), In Descending Order
-------------------------

<class 'polars.dataframe.frame.DataFrame'>  --> type(value_counts())
-------------------------



In [5]:
print(srs.sort(), extra_info('sort(), Ascending Order')) # in Ascending Order
print(srs.sort(descending=True, nulls_last=True), extra_info('sort(descending=True, nulls_last=True)'))

srs_copy = srs.__copy__()
srs_copy.sort(nulls_last=True, in_place=True)
print(srs_copy, extra_info('sort(nulls_last=True, in_place=True)'))

shape: (457,)
Series: 'Age' [i64]
[
	19
	19
	20
	20
	20
	…
	39
	39
	40
	40
	40
]  --> sort(), Ascending Order
-------------------------

shape: (457,)
Series: 'Age' [i64]
[
	40
	40
	40
	39
	39
	…
	20
	20
	20
	19
	19
]  --> sort(descending=True, nulls_last=True)
-------------------------

shape: (457,)
Series: 'Age' [i64]
[
	19
	19
	20
	20
	20
	…
	39
	39
	40
	40
	40
]  --> sort(nulls_last=True, in_place=True)
-------------------------



In [69]:
temp_srs = pl.Series('Random Series', [1, 5, 2, None, 4, None, 8, 5, 1], dtype=pl.UInt8)
# Nan is floating point data where None has no datatype. Since 1, 5, 2, 4, 8 are integer, this Series.dtype will be integer also.
print(temp_srs.is_null(),         extra_info("is_null()")) # returns A Boolean Series.
print(temp_srs.is_not_null(),     extra_info("is_not_null()")) # returns A Boolean Series.
print(temp_srs.drop_nulls(),      extra_info(".drop_nulls()")) # returns the Series after removing nulls.
print(temp_srs.fill_null('FAIL'), extra_info(".fill_null('FAIL)")) # returns A new Series after filling the nulls with 'FAIL'.
# In polars all the values in Series has THE SAME DATATYPE, since 'FAIL' is string, all the integers will be STRING NOW.

print(temp_srs.is_between(3, 6), extra_info(".is_between(3, 6)")) # both are INCLUSIVE.
print(temp_srs.is_in([8, 1]), extra_info("is_in([8, 1])")) # nulls are like False, so we can also do .is_in([8, 1]).sum().
print(temp_srs.clip(3, 6), extra_info("clip(3, 6)")) # same as np.clip(). nulls will be ignored to clip.
print(temp_srs.cast(pl.String), extra_info("cast(pl.String)")) # Well null will be null lol. Its same as pandas astype(..)

shape: (9,)
Series: 'Random Series' [bool]
[
	false
	false
	false
	true
	false
	true
	false
	false
	false
]         --> is_null()
-----------------------------------------------------------

shape: (9,)
Series: 'Random Series' [bool]
[
	true
	true
	true
	false
	true
	false
	true
	true
	true
]         --> is_not_null()
-----------------------------------------------------------

shape: (7,)
Series: 'Random Series' [u8]
[
	1
	5
	2
	4
	8
	5
	1
]         --> .drop_nulls()
-----------------------------------------------------------

shape: (9,)
Series: 'Random Series' [str]
[
	"1"
	"5"
	"2"
	"FAIL"
	"4"
	"FAIL"
	"8"
	"5"
	"1"
]         --> .fill_null('FAIL)
-----------------------------------------------------------

shape: (9,)
Series: 'Random Series' [bool]
[
	false
	true
	false
	null
	true
	null
	false
	true
	false
]         --> .is_between(3, 6)
-----------------------------------------------------------

shape: (9,)
Series: 'Random Series' [bool]
[
	true
	false
	false
	null
	false
	nul

In [78]:
#                       drop_dupliactes(keep='first' or 'last' or 'any') in Pandas, in polars its unique(keep=...)
#           The Series.unique() doens't have the parameter 'keep'. 'maintain_order = True' is quite expensive.
#           Keep 'maintain_order = False' which is default unless you need the Order Strictly.

temp_srs = pl.Series('Random Series', [1, 5, 2, None, 4, None, 8, 5, 1], dtype=pl.UInt8)
print(temp_srs.unique(maintain_order=True), extra_info("drop duplicates in Series doesn't have the parameter 'keep'"))

temp_DataFrame = pl.DataFrame({'numbers' : [1, 5, 2, None, 4, None, 8, 5, 1]})
print(temp_DataFrame.unique(keep='last', maintain_order=True), extra_info("drop duplicates in DataFrame has 'keep'"))

temp_LazyFrame = pl.LazyFrame({'numbers' : [1, 5, 2, None, 4, None, 8, 5, 1]})
print(temp_LazyFrame.unique(keep='last', maintain_order=True).collect(), extra_info("drop duplicates in LazyFrame has 'keep'"))

shape: (6,)
Series: 'Random Series' [u8]
[
	1
	5
	2
	null
	4
	8
]         --> drop duplicates in Series doesn't have the parameter 'keep'
-----------------------------------------------------------

shape: (6, 1)
┌─────────┐
│ numbers │
│ ---     │
│ i64     │
╞═════════╡
│ 2       │
│ 4       │
│ null    │
│ 8       │
│ 5       │
│ 1       │
└─────────┘         --> drop duplicates in DataFrame has 'keep'
-----------------------------------------------------------

shape: (6, 1)
┌─────────┐
│ numbers │
│ ---     │
│ i64     │
╞═════════╡
│ 2       │
│ 4       │
│ null    │
│ 8       │
│ 5       │
│ 1       │
└─────────┘         --> drop duplicates in LazyFrame has 'keep'
-----------------------------------------------------------



In [54]:
print(srs.sum(), extra_info('sum()'))
print(srs.cum_sum(), extra_info('cum_sum()')) # There's cum_count(), cum_max(), cum_min(), cum_prod().
print(srs.cum_max(), extra_info('cum_max()')) # The Current Max from me(Current Index) to me Above/Previous Values.
print(srs.product(), extra_info('product()'))
print(srs.max(), extra_info('max()')) # There's also min()
print(srs.arg_max(), extra_info('arg_max()')) # The First Index of Max Value. # There's also arg_min()
print(srs.mean(), extra_info('mean()')) # There's also mode(), median(), std(), var()
print(srs.describe(), extra_info('decribe()'))

12311  --> sum()
-------------------------

shape: (457,)
Series: 'Age' [i64]
[
	25
	50
	77
	99
	128
	…
	12209
	12235
	12259
	12285
	12311
]  --> cum_sum()
-------------------------

shape: (457,)
Series: 'Age' [i64]
[
	25
	25
	27
	27
	29
	…
	40
	40
	40
	40
	40
]  --> cum_max()
-------------------------

0  --> product()
-------------------------

40  --> max()
-------------------------

298  --> arg_max()
-------------------------

26.938730853391686  --> mean()
-------------------------

shape: (9, 2)
┌────────────┬───────────┐
│ statistic  ┆ value     │
│ ---        ┆ ---       │
│ str        ┆ f64       │
╞════════════╪═══════════╡
│ count      ┆ 457.0     │
│ null_count ┆ 0.0       │
│ mean       ┆ 26.938731 │
│ std        ┆ 4.404016  │
│ min        ┆ 19.0      │
│ 25%        ┆ 24.0      │
│ 50%        ┆ 26.0      │
│ 75%        ┆ 30.0      │
│ max        ┆ 40.0      │
└────────────┴───────────┘  --> decribe()
-------------------------



In [69]:
#                                               Writing data in a Series

testMark_srs:pl.Series = pl.Series([7, 2, 4, 3, 8, 4], dtype=pl.UInt8) # np.uint8 will give error, always use pl.datatype
#testMark_srs[6] = 12 # unlike pandas, polars will THROW ERROR if the index is out of bounds BECAUSE POLAR'S doesn't have any (CUSTOM) INDEX COLUMN, ITS TOTALLY 0 BASED INDEXING here.
testMark_srs.append(pl.Series([12], dtype=testMark_srs.dtype)) # 'append' appends A series, not a Value and it both RETUNS AND MODIFY.

print(testMark_srs)

shape: (7,)
Series: '' [u8]
[
	7
	2
	4
	3
	8
	4
	12
]


In [71]:
#                                               Broadcasting

print(10 - testMark_srs, extra_info('How much marks need to get 10/10')) # returns SERIES. The same way you can work with *, /, -, ** etc.
another_srs = pl.Series(np.arange(testMark_srs.len()))
print(testMark_srs + another_srs) # doesn't matter if both series has same datatype or not.

shape: (7,)
Series: '' [u8]
[
	3
	8
	6
	7
	2
	6
	254
]  --> How much marks need to get 10/10
-------------------------

shape: (7,)
Series: '' [i32]
[
	7
	3
	6
	6
	12
	9
	18
]


In [7]:
#                                                    Graph
