# Topic: Data Manipulation - with pandas

## Pandas - Series object:

In [3]:
import pandas as pd
import numpy as np

In [4]:
pd.Series()

Series([], dtype: object)

In [5]:
pd.Series([10,20,30,40,50])

0    10
1    20
2    30
3    40
4    50
dtype: int64

In [6]:
pd.Series([10,20,30,40,50], dtype='float')

0    10.0
1    20.0
2    30.0
3    40.0
4    50.0
dtype: float64

In [7]:
ice_creams = pd.Series(['Vanilla', 'Chocolate', 'Strawberry', 'Butterscotch'])
print(ice_creams)

0         Vanilla
1       Chocolate
2      Strawberry
3    Butterscotch
dtype: object


In [11]:
# Generate a pandas Series of 10 random integers in the range 1..100
# (np.random.randint excludes the upper bound, hence 101).
# NOTE(review): no random seed is set in the visible cells, so the values
# differ on every run and the recorded output will not be reproduced.

pd.Series(np.random.randint(1,101, size=10))


0    41
1    17
2     8
3     6
4    24
5    34
6    98
7    82
8    70
9    39
dtype: int64

### ATTRIBUTES / PROPERTIES of a Series object:

In [14]:
fav_ic=pd.Series(['Vanilla', 'Chocolate', 'Strawberry', 'Butterscotch'])

In [16]:
fav_ic.values

array(['Vanilla', 'Chocolate', 'Strawberry', 'Butterscotch'], dtype=object)

In [18]:
fav_ic.index

RangeIndex(start=0, stop=4, step=1)

In [20]:
fav_ic.dtype        # 'O' (capital letter O, not the digit zero) means object dtype, which holds the Python strings here

dtype('O')

In [22]:
fav_ic.size      # number of elements in the Series

4

In [24]:
fav_ic.shape     # a Series is one-dimensional, so shape is a 1-tuple (number of elements,) -- there is no column dimension

(4,)

In [28]:
nums = pd.Series([10,20,30,40,50])
nums.is_unique

True

In [None]:
nums.is_monotonic_increasing  # values are in increasing order, ascending

True

In [31]:
nums.is_monotonic_decreasing  # values are in decreasing order, descending.

False

In [None]:
# Quick tour of common Series methods on a small integer Series.
# NOTE: Jupyter renders only the value of the cell's final expression, so of
# everything below only the to_dict() result is displayed; the other results
# are computed and discarded. Run a line in its own cell (or wrap it in
# display(...)) to see its output.
# None of these calls is made in place: each returns a new object and
# num_series itself is left unchanged.
num_series = pd.Series([10,20,30,40,50,60,70,80,90,100])

num_series.sum()   # sum of all values
num_series.mean()   # arithmetic mean
num_series.median()   # middle value of the sorted data
num_series.std()   # standard deviation (pandas uses the sample form, ddof=1, by default)
num_series.min()   # smallest value
num_series.max()   # largest value
num_series.var()   # variance (sample form, ddof=1, by default)
num_series.describe()   # summary statistics: count, mean, std, min, quartiles, max
num_series.head(3)   # first 3 values
num_series.tail(4)   # last 4 values
num_series.sample(5)   # 5 randomly chosen values (different on each run unless random_state is given)
num_series.sort_values(ascending=False)   # sort in descending order
num_series.sort_values(ascending=True)    # sort in ascending order
num_series.nlargest(3)   # the 3 largest values
num_series.nsmallest(4)  # the 4 smallest values
num_series.unique()   # unique values in the series, as a numpy array
num_series.value_counts()   # count of each unique value in the series
num_series.apply(lambda x: x*2)   # multiply each value by 2
num_series.map(lambda x: x+5)   # add 5 to each value
num_series.replace(10, 999)   # replace 10 with 999
num_series.between(30, 70)   # boolean mask: True where 30 <= value <= 70 (both ends inclusive by default)
num_series.clip(lower=30, upper=70)   # clip values below 30 to 30 and above 70 to 70
num_series.isnull()   # boolean mask: True where the value is null
num_series.notnull()   # boolean mask: True where the value is not null
num_series.duplicated()   # boolean mask: True for repeats of an earlier value
num_series.drop_duplicates()   # drop duplicate values, keeping the first occurrence
num_series.reset_index(drop=True)   # renumber the index from 0 and discard the old index
num_series.astype('float')   # convert series to float type
num_series.astype('str')   # convert series to string type
num_series.tolist()   # convert series to list
num_series.to_numpy()   # convert series to numpy array
num_series.to_dict()   # convert series to dictionary ({index label: value})



{0: 10, 1: 20, 2: 30, 3: 40, 4: 50, 5: 60, 6: 70, 7: 80, 8: 90, 9: 100}

In [37]:
num_series

0     10
1     20
2     30
3     40
4     50
5     60
6     70
7     80
8     90
9    100
dtype: int64

In [None]:
num_series.head(n=6)  # first 6 values

0    10
1    20
2    30
3    40
4    50
5    60
dtype: int64

In [39]:
num_series.tail(n=3)  # last 3 values

7     80
8     90
9    100
dtype: int64

In [41]:
nums = pd.Series([1,2,3,np.nan,4,5])  # series with a NaN value
nums.count()  # count of non-null values, does not count NaN

np.int64(5)

#### By default, `nums.count()` counts only non-null values. To count all values, including nulls, use `nums.size` or `len(nums)` instead; `count()` itself has no option to include them.

In [42]:
nums.count()

np.int64(5)

In [43]:
nums.sum()

np.float64(15.0)

In [44]:
nums.sum(skipna=False)  # sum including NaN, will return NaN    

np.float64(nan)

In [46]:
nums.sum(min_count=3)  # returns NaN if fewer than 3 non-NA values are present (there are 5 here, so the sum is returned)

np.float64(15.0)

In [47]:
nums.sum(min_count=5)

np.float64(15.0)

In [50]:
print(nums.sum(min_count=6)) # returns NaN because only 5 non-NA values are present, fewer than the required 6

nan


In [52]:
nums.product() # product of non-null values

np.float64(120.0)

In [None]:
nums.product(skipna=False)    # product including NaN, will return NaN

np.float64(nan)

In [54]:
nums.product(min_count=3)

np.float64(120.0)

In [55]:
nums.product(min_count=5)   # returns NaN if fewer than 5 non-NA values are present (exactly 5 here, so the product is returned)

np.float64(120.0)

In [56]:
nums.product(min_count=6)

np.float64(nan)

In [57]:
nums.cumsum()  # cumulative sum

0     1.0
1     3.0
2     6.0
3     NaN
4    10.0
5    15.0
dtype: float64

In [61]:
nums.cumsum(skipna=False)  # cumulative sum including NaN, min_count does not apply here

0    1.0
1    3.0
2    6.0
3    NaN
4    NaN
5    NaN
dtype: float64

In [62]:
# Percentage change between consecutive elements, expressed in percent.
# pct_change() used to forward-fill NaNs implicitly; that default is deprecated
# and emits a FutureWarning. Forward-fill explicitly and turn the implicit fill
# off so the result is the same on every pandas version and no warning is raised.
nums.ffill().pct_change(fill_method=None) * 100

  nums.pct_change() * 100


0           NaN
1    100.000000
2     50.000000
3      0.000000
4     33.333333
5     25.000000
dtype: float64

### Forward fill: (pad)
ex: 1,2,3,np.nan,np.nan,np.nan,4,5,6
Here we have 3 missing values. If we use the ffill method,
the result will be: 1,2,3,3,3,3,4,5,6 (each missing value is replaced by the last valid value before it).

### Backward fill: (backfill)
ex: 1,2,3,np.nan,np.nan,np.nan,4,5,6
Here we have 3 missing values. If we use the bfill method,
the result will be: 1,2,3,4,4,4,4,5,6 (each missing value is replaced by the next valid value after it).

In [66]:
random_nums = pd.Series(np.random.randint(1,21, size=10))
random_nums

0     6
1     4
2     7
3    20
4     6
5    19
6     1
7    15
8    13
9    13
dtype: int64

In [67]:
# The previous cell generated 10 random integers between 1 and 20 (inclusive); duplicates are possible.

In [68]:
random_nums.mean()

np.float64(10.4)

In [69]:
random_nums.median()

np.float64(10.0)

In [70]:
random_nums.min()

np.int64(1)

In [71]:
random_nums.max()

np.int64(20)

In [73]:
random_nums.unique()

array([ 6,  4,  7, 20, 19,  1, 15, 13])

In [74]:
random_nums.nunique()

8

In [None]:
# Element-wise arithmetic with a scalar. Each line returns a new Series and
# leaves nums unchanged. Jupyter renders only the value of a cell's final
# expression, so run a line in its own cell to see its result.
# NOTE(review): this rebinds `nums`, which earlier cells used for a Series
# containing NaN; a distinct name would avoid depending on execution order.
nums = pd.Series([1,2,3,4,5])
nums + 10   # or nums.add(10)
nums.sub(3)   # same as nums - 3
nums.mul(2)   # same as nums * 2
nums.div(2)   # same as nums / 2 (true division, float result)
nums.floordiv(2)   # same as nums // 2
nums.mod(4)   # same as nums % 4
