# INTRODUCTION TO PANDAS

- *Pandas* is a Python third-party library to work with tabular data for data analysis
- Installing pandas, along with numpy:

```
pip install numpy pandas
```

- To use *pandas* we need to import it:
    - *import pandas as pd*

#### In pandas, we work with two types of objects:
    - Series    - 1D (column)
    - DataFrame - 2D (rows and columns)

In [1]:
import numpy as np
import pandas as pd

## The Series Object

In [2]:
# Creating an empty series object
pd.Series()

Series([], dtype: object)

In [3]:
ice_creams = ["chocolate", "vanilla", "strawberry", "butterscotch", "mint"]
pd.Series(ice_creams)

0       chocolate
1         vanilla
2      strawberry
3    butterscotch
4            mint
dtype: object

In [4]:
pd.Series(data=ice_creams)

0       chocolate
1         vanilla
2      strawberry
3    butterscotch
4            mint
dtype: object

In [5]:
ice_creams = ["chocolate", "vanilla", "strawberry", "butterscotch", "mint"]
weekdays = ["Mon", "Tue", "Wed", "Thu", "Fri"]
pd.Series(data=ice_creams, index=weekdays)

Mon       chocolate
Tue         vanilla
Wed      strawberry
Thu    butterscotch
Fri            mint
dtype: object

#### Examples of Series'

In [6]:
scores = [99, 85, 92, 88, 76]
pd.Series(scores)               # pd.Series(data=scores)

0    99
1    85
2    92
3    88
4    76
dtype: int64

In [7]:
bool_values = [True, False, True, True, False]
pd.Series(bool_values)         # pd.Series(data=bool_values)

0     True
1    False
2     True
3     True
4    False
dtype: bool

In [8]:
temps = [36.5, 37.0, 38.2, 39.1, 36.8]
pd.Series(temps)               # pd.Series(data=temps)

0    36.5
1    37.0
2    38.2
3    39.1
4    36.8
dtype: float64

In [9]:
cities = ["Hyd", "Blr", "Che", "Tri", "Amr"]
pd.Series(data=cities)

0    Hyd
1    Blr
2    Che
3    Tri
4    Amr
dtype: object

##### Creating a Series with missing values

In [10]:
temperatures = [36.5, 37.0, 38.2, np.nan, 36.8] # np.nan represents missing value
pd.Series(temperatures)

0    36.5
1    37.0
2    38.2
3     NaN
4    36.8
dtype: float64

#### Creating a Series from Python dictionary

In [11]:
# dictionary
students_pers = {
    "Manu": 85,
    "Ravi": 89,
    "Sujitha": 88,
    "Suryani": 90,
    "Anil": 87
}
pd.Series(students_pers)

Manu       85
Ravi       89
Sujitha    88
Suryani    90
Anil       87
dtype: int64

#### Creating series from a tuple

In [12]:
sales = (100, 200, 150, 300, 250)
pd.Series(sales)

0    100
1    200
2    150
3    300
4    250
dtype: int64

#### Creating a Series from a Set
- Throws an error
- Convert the set to list/tuple

In [13]:
my_set = {"January", "February", "March", "April", "May"}
# pd.Series(my_set)       # Throws an error because sets are unordered
pd.Series(list(my_set))  # Converting set to list before creating Series

0       March
1         May
2     January
3       April
4    February
dtype: object

#### A series with random numbers

In [14]:
random_data = np.random.randint(1, 100, 10)
pd.Series(random_data)

0    70
1    54
2    70
3    87
4    32
5    78
6    35
7    81
8    45
9    62
dtype: int32

### Series Attribute

In [15]:
ice_creams = ["chocolate", "vanilla", "strawberry", "butterscotch", "mint"]
weekdays = ["Mon", "Tue", "Wed", "Thu", "Fri"]
diet = pd.Series(data=ice_creams, index=weekdays)
diet

Mon       chocolate
Tue         vanilla
Wed      strawberry
Thu    butterscotch
Fri            mint
dtype: object

In [16]:
# values attribute
diet.values

array(['chocolate', 'vanilla', 'strawberry', 'butterscotch', 'mint'],
      dtype=object)

In [17]:
type(diet.values)

numpy.ndarray

In [18]:
# index attribute
diet.index

Index(['Mon', 'Tue', 'Wed', 'Thu', 'Fri'], dtype='object')

In [19]:
# dtype for data type of the series
diet.dtype

dtype('O')

In [20]:
print(diet.dtype)

object


In [21]:
# size attribute for number of elements in the series
diet.size

5

In [22]:
# shape attribute for dimensions of the series
diet.shape

(5,)

In [23]:
diet.values

array(['chocolate', 'vanilla', 'strawberry', 'butterscotch', 'mint'],
      dtype=object)

In [24]:
# is_unique attribute to check if all values are unique
diet.is_unique

True

In [25]:
# is_unique attribute to check if all values are unique
x = pd.Series([1, 2, 3, 4, 5, 1])
x.is_unique

False

In [26]:
mt = pd.Series([10, 20, 30, 40, 50])
# is_monotonic attribute to check if values are in increasing order
mt.is_monotonic_increasing

True

In [27]:
mt.is_monotonic_decreasing

False

### Retrieving the First and Last Rows

In [28]:
values = range(0, 500, 5)
nums = pd.Series(values)
nums

0       0
1       5
2      10
3      15
4      20
     ... 
95    475
96    480
97    485
98    490
99    495
Length: 100, dtype: int64

In [29]:
# head() / head(n) / head(n=5) method to get first n elements
nums.head()     # first 5 elements by default

0     0
1     5
2    10
3    15
4    20
dtype: int64

In [30]:
nums.head(12)   # first 12 elements

0      0
1      5
2     10
3     15
4     20
5     25
6     30
7     35
8     40
9     45
10    50
11    55
dtype: int64

In [31]:
nums.head(n=3)  # first 3 elements

0     0
1     5
2    10
dtype: int64

In [32]:
# tail() / tail(n) / tail(n=5) method to get last n elements
nums.tail()     # last 5 elements by default

95    475
96    480
97    485
98    490
99    495
dtype: int64

In [33]:
nums.tail(n=4)  # last 4 rows

96    480
97    485
98    490
99    495
dtype: int64

In [34]:
nums.tail(7)    # last 7 elements

93    465
94    470
95    475
96    480
97    485
98    490
99    495
dtype: int64

## Mathematical Operations

### Statistical Operations

In [35]:
numbers = pd.Series([1, 2, 3, np.nan, 4, 5])
numbers

0    1.0
1    2.0
2    3.0
3    NaN
4    4.0
5    5.0
dtype: float64

In [36]:
print(numbers.count())  # count of non-missing values

5


In [37]:
numbers.sum()    # sum of all elements

np.float64(15.0)

In [38]:
numbers.sum(skipna=False)

np.float64(nan)

In [39]:
numbers

0    1.0
1    2.0
2    3.0
3    NaN
4    4.0
5    5.0
dtype: float64

In [40]:
numbers.sum(min_count=3)    # sum only if at least 3 non-missing values are present

np.float64(15.0)

In [41]:
numbers.sum(min_count=6)  # returns NaN since not all values are present

np.float64(nan)

In [42]:
numbers

0    1.0
1    2.0
2    3.0
3    NaN
4    4.0
5    5.0
dtype: float64

In [43]:
numbers.product()  # product of all elements

np.float64(120.0)

In [44]:
numbers.product(skipna=False)

np.float64(nan)

In [45]:
numbers.product(min_count=3)

np.float64(120.0)

In [46]:
numbers

0    1.0
1    2.0
2    3.0
3    NaN
4    4.0
5    5.0
dtype: float64

In [47]:
# cumsum()  Cumulative sum of elements
numbers.cumsum()

0     1.0
1     3.0
2     6.0
3     NaN
4    10.0
5    15.0
dtype: float64

In [48]:
numbers.cumsum(skipna=False)

0    1.0
1    3.0
2    6.0
3    NaN
4    NaN
5    NaN
dtype: float64

In [49]:
numbers

0    1.0
1    2.0
2    3.0
3    NaN
4    4.0
5    5.0
dtype: float64

In [50]:
# pct_change() Percentage change between the current and a prior element
numbers.pct_change()

  numbers.pct_change()


0         NaN
1    1.000000
2    0.500000
3    0.000000
4    0.333333
5    0.250000
dtype: float64

In [51]:
# forward fill
# [1, 2, 3, NaN, NaN 4, 5, NaN, 6] -> [1, 2, 3, 3, 3, 4, 5, 5, 6]
# ----- IGNORE ---
# backward fill
# [1, 2, 3, NaN, NaN 4, 5, NaN, 6] -> [1, 2, 3, 4, 4, 4, 5, 6, 6]

In [52]:
# The following lines below are equivalent: they are forward fill statements
numbers.pct_change()
numbers.pct_change(fill_method="pad")
numbers.pct_change(fill_method="ffill")

  numbers.pct_change()
  numbers.pct_change(fill_method="pad")
  numbers.pct_change(fill_method="ffill")


0         NaN
1    1.000000
2    0.500000
3    0.000000
4    0.333333
5    0.250000
dtype: float64

In [53]:
# The following lines below are equivalent: they are backward fill 
numbers.pct_change(fill_method="bfill")
numbers.pct_change(fill_method="backfill")

  numbers.pct_change(fill_method="bfill")
  numbers.pct_change(fill_method="backfill")


0         NaN
1    1.000000
2    0.500000
3    0.333333
4    0.000000
5    0.250000
dtype: float64

In [54]:
# mean() Average of all elements
numbers.mean()

np.float64(3.0)

In [55]:
# median() Median of all elements
numbers.median()

np.float64(3.0)

In [56]:
# std() Standard deviation of all elements
numbers.std()

np.float64(1.5811388300841898)

In [57]:
numbers.max()

np.float64(5.0)

In [58]:
numbers.min()

np.float64(1.0)

In [59]:
numbers.describe()  # Summary statistics of the series

count    5.000000
mean     3.000000
std      1.581139
min      1.000000
25%      2.000000
50%      3.000000
75%      4.000000
max      5.000000
dtype: float64

In [60]:
# sample() Random sample of items from an axis of object
numbers.sample(3)   # Returns 3 random elements from the series

3    NaN
5    5.0
1    2.0
dtype: float64

In [61]:
books = pd.Series(["Attitude", "Caring", "Helping", "Learning", "Caring"])

In [62]:
# unique() Returns unique values in the series
books.unique()

array(['Attitude', 'Caring', 'Helping', 'Learning'], dtype=object)

In [63]:
# nunique() Returns number of unique values in the series
books.nunique()

4

### Arithmetic Operations

In [64]:
s1 = pd.Series(data=[5, np.nan, 15], index=["A", "B", "C"])
s1

A     5.0
B     NaN
C    15.0
dtype: float64

In [65]:
s1 + 3

A     8.0
B     NaN
C    18.0
dtype: float64

In [66]:
s1.add(3)

A     8.0
B     NaN
C    18.0
dtype: float64

In [67]:
# The three lines below are equivalent
s1 - 5
s1.sub(5)
s1.subtract(5)

A     0.0
B     NaN
C    10.0
dtype: float64

In [68]:
# The three lines below are equivalent
s1 * 3
s1.mul(3)
s1.multiply(3)

A    15.0
B     NaN
C    45.0
dtype: float64

In [69]:
# The Three lines below are equivalent
s1 / 2
s1.div(2)
s1.divide(2)

A    2.5
B    NaN
C    7.5
dtype: float64

In [70]:
# The two lines below are equivalent
s1 // 2
s1.floordiv(2)

A    2.0
B    NaN
C    7.0
dtype: float64

In [71]:
# The two lines below are equivalent
s1 % 3
s1.mod(3)

A    2.0
B    NaN
C    0.0
dtype: float64

### Broadcasting

In [74]:
s1 = pd.Series(data=[1, 2, 3], index=["A", "B", "C"])
s2 = pd.Series(data=[4, 5, 6], index=["B", "A", "C"])
s1 + s2

A    6
B    6
C    9
dtype: int64

In [75]:
s1 = pd.Series(data=[1, 2, np.nan, 3], index=["A", "B", "C", "D"])
s2 = pd.Series(data=[1, 5, np.nan, 3], index=["A", "B", "C", "D"])
s1 == s2

A     True
B    False
C    False
D     True
dtype: bool

In [76]:
s1 = pd.Series(data=[1, 2, 3], index=["A", "B", "C"])
s2 = pd.Series(data=[4, 5, 6, 7], index=["B", "C", "D", "E"])
s1 + s2

A    NaN
B    6.0
C    8.0
D    NaN
E    NaN
dtype: float64

### Functions of a Series object

In [79]:
cities = pd.Series(["Hyd", "Blr", "Che", "Tri", "Amr", np.nan])

In [80]:
len(cities)

6

In [82]:
print(dir(cities))

['T', '_AXIS_LEN', '_AXIS_ORDERS', '_AXIS_TO_AXIS_NUMBER', '_HANDLED_TYPES', '__abs__', '__add__', '__and__', '__annotations__', '__array__', '__array_priority__', '__array_ufunc__', '__bool__', '__class__', '__column_consortium_standard__', '__contains__', '__copy__', '__deepcopy__', '__delattr__', '__delitem__', '__dict__', '__dir__', '__divmod__', '__doc__', '__eq__', '__finalize__', '__firstlineno__', '__float__', '__floordiv__', '__format__', '__ge__', '__getattr__', '__getattribute__', '__getitem__', '__getstate__', '__gt__', '__hash__', '__iadd__', '__iand__', '__ifloordiv__', '__imod__', '__imul__', '__init__', '__init_subclass__', '__int__', '__invert__', '__ior__', '__ipow__', '__isub__', '__iter__', '__itruediv__', '__ixor__', '__le__', '__len__', '__lt__', '__matmul__', '__mod__', '__module__', '__mul__', '__ne__', '__neg__', '__new__', '__nonzero__', '__or__', '__pandas_priority__', '__pos__', '__pow__', '__radd__', '__rand__', '__rdivmod__', '__reduce__', '__reduce_ex__',