# Introducing Pandas

## Installlation


In [None]:
!pip install pandas

## Pandas Data Structure
### 1. Series
- Series is a one-dimensional labeled array capable of holding any data type (integers, strings, floating point numbers, Python objects, etc.). 
- The axis labels are collectively referred to as the index. 

### Series Creation
- using numpy array 
- dictionary 
- a scalar value (index must be provided)

In [1]:
import pandas as pd
import numpy as np

In [2]:
a = np.array([1, 2, 3, 4, 5])
s = pd.Series(data=a, index=["a", "b", "c", "d", "e"])
s

a    1
b    2
c    3
d    4
e    5
dtype: int32

In [6]:
s1 = pd.Series(data=a)
s1

0    1
1    2
2    3
3    4
4    5
dtype: int32

In [7]:
d = {'P0':0.1, 'P1':0.2, 'P3':0.7}
s = pd.Series(d)
s

P0    0.1
P1    0.2
P3    0.7
dtype: float64

In [9]:
labels = ["a", "b", "c", "d"]
labels = [i for i in range(10)]
print(labels)
pd.Series(0, index=labels)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]


0    0
1    0
2    0
3    0
4    0
5    0
6    0
7    0
8    0
9    0
dtype: int64

### Series are similar to numpy array
- accessing elements
- slicing 
- series comparison > < == etc
- operations, np.exp()
- dtype

In [13]:
print(s)
s[2]

P0    0.1
P1    0.2
P3    0.7
dtype: float64


0.7

In [21]:
ans = s >= 0.1
print(ans)
print(ans.dtype)

P0    True
P1    True
P3    True
dtype: bool
bool


In [22]:
s ** 2

P0    0.01
P1    0.04
P3    0.49
dtype: float64

### Series are dict-like
- accessing elements through index labels
- get()
- updating values

In [12]:
s['b']
s.get('k')

In [14]:
s['a'] = 10
s

a    10
b     2
c     3
d     4
e     5
dtype: int32

### Getting a numpy array from a Series
- to_numpy()


In [24]:
arr = s.to_numpy()
print(type(arr), arr)
s.values

<class 'numpy.ndarray'> [0.1 0.2 0.7]


array([0.1, 0.2, 0.7])

### Vectorized Operations 
- s+s, s*s
- Series automatically align the data based on label
    - ```s[1:] + s[:-1]```

In [3]:
s + s

a     2
b     4
c     6
d     8
e    10
dtype: int32

In [6]:
a = s[:-1] # 4
s + a

a    2.0
b    4.0
c    6.0
d    8.0
e    NaN
dtype: float64

## DataFrame
- DataFrame is a 2-dimensional labeled data structure with columns of potentially different types.
- it's like a spreadsheet or SQL table, or a dict of Series objects. 
- Most commonly used pandas object

### DataFrame Creation using
- Dict of 1D ndarrays, lists, dicts, or Series
- 2-D numpy.ndarray
- A Series
- Another DataFrame

In [15]:
a = np.arange(0, 20).reshape(5,-1)
print(a)

[[ 0  1  2  3]
 [ 4  5  6  7]
 [ 8  9 10 11]
 [12 13 14 15]
 [16 17 18 19]]


In [16]:
df = pd.DataFrame(a, columns=["A", "B", "C", "D"])
df

Unnamed: 0,A,B,C,D
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15
4,16,17,18,19


In [19]:
# from a dictionary
d = {
    'A': np.arange(0,5),
    'B': np.arange(5, 10),
    'C': np.arange(11, 16)
}
df1 = pd.DataFrame(d)
df1

Unnamed: 0,A,B,C
0,0,5,11
1,1,6,12
2,2,7,13
3,3,8,14
4,4,9,15


In [21]:
pd.DataFrame(s, columns=["A"])

Unnamed: 0,A
a,10
b,2
c,3
d,4
e,5


### Attributes
- df.index
- df.columns
- df.dtypes
- df.info
- df.values


In [25]:
df.index

RangeIndex(start=0, stop=5, step=1)

In [26]:
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [28]:
df.dtypes

A    int32
B    int32
C    int32
D    int32
dtype: object

In [31]:
df['A'][0] = "str" # not used often 
df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['A'][0] = "str"


Unnamed: 0,A,B,C,D
0,str,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15
4,16,17,18,19


In [32]:
df['A'][0][1:]

'tr'

In [34]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   A       5 non-null      object
 1   B       5 non-null      int32 
 2   C       5 non-null      int32 
 3   D       5 non-null      int32 
dtypes: int32(3), object(1)
memory usage: 228.0+ bytes


In [36]:
df['A'][0] = 0
df.values # numpy array

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['A'][0] = 0


array([[0, 1, 2, 3],
       [4, 5, 6, 7],
       [8, 9, 10, 11],
       [12, 13, 14, 15],
       [16, 17, 18, 19]], dtype=object)

### Column Selection, Addition, Deletion
```
- df[column_name]
- df[new_column_name] = df[old_column] * 2
- del df[column_to_be_deleted], df.pop(column_name)
```

In [40]:
print(type(df['B']))
df['A']

<class 'pandas.core.series.Series'>


0     0
1     4
2     8
3    12
4    16
Name: A, dtype: object

In [49]:
df["E"] = df['B']*2 # add a new column
df["F"] = df["A"]*3

In [43]:
del df['E']
df

Unnamed: 0,A,B,C,D
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15
4,16,17,18,19


In [46]:
column_e = df.pop('E')

0     2
1    10
2    18
3    26
4    34
Name: E, dtype: int32

In [47]:
df

Unnamed: 0,A,B,C,D
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15
4,16,17,18,19


In [52]:
df

Unnamed: 0,A,B,C,D,E,F
0,0,1,2,3,2,0
1,4,5,6,7,10,12
2,8,9,10,11,18,24
3,12,13,14,15,26,36
4,16,17,18,19,34,48


In [54]:
newdf = df.drop(["E", "B"], axis=1)
newdf

Unnamed: 0,A,C,D,F
0,0,2,3,0
1,4,6,7,12
2,8,10,11,24
3,12,14,15,36
4,16,18,19,48


In [56]:
newdf = df.drop([0, 4], axis=0)
newdf

Unnamed: 0,A,B,C,D,E,F
1,4,5,6,7,10,12
2,8,9,10,11,18,24
3,12,13,14,15,26,36


In [62]:
newdf.index = [str(key) for key in newdf.index]
newdf.index

Index(['1', '2', '3'], dtype='object')

### Inserting a column
- a scalar value
- a series not having same index
- df.insert() for inserting a column at a particular location 

In [66]:
df['G'] = df['A']
df

Unnamed: 0,A,B,C,D,E,F,G
0,0,1,2,3,2,0,0
1,4,5,6,7,10,12,4
2,8,9,10,11,18,24,8
3,12,13,14,15,26,36,12
4,16,17,18,19,34,48,16


In [68]:
s = pd.Series(np.arange(0,5), index=[0, 'b', 1, 'd', 'e'])

In [69]:
df['G'] = s
df

Unnamed: 0,A,B,C,D,E,F,G
0,0,1,2,3,2,0,0.0
1,4,5,6,7,10,12,2.0
2,8,9,10,11,18,24,
3,12,13,14,15,26,36,
4,16,17,18,19,34,48,


In [71]:
df.insert(1, "A2", np.arange(5, 10))
df

Unnamed: 0,A,A2,B,C,D,E,F,G
0,0,5,1,2,3,2,0,0.0
1,4,6,5,6,7,10,12,2.0
2,8,7,9,10,11,18,24,
3,12,8,13,14,15,26,36,
4,16,9,17,18,19,34,48,


### Reading a csv file
- pd.read_csv
- df.head()
- df.tail()
- saving a dataframe: df.to_csv

In [75]:
# save a dataframe
df.to_csv("./data.csv")

In [77]:
# to read a csv file
data = pd.read_csv("data.csv", index_col=0)
data

Unnamed: 0,A,A2,B,C,D,E,F,G
0,0,5,1,2,3,2,0,0.0
1,4,6,5,6,7,10,12,2.0
2,8,7,9,10,11,18,24,
3,12,8,13,14,15,26,36,
4,16,9,17,18,19,34,48,


In [78]:
iris = pd.read_csv("iris_data.csv")
iris

Unnamed: 0,sepalLength,sepalWidth,petalLength,petalWidth,species
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2
146,6.3,2.5,5.0,1.9,2
147,6.5,3.0,5.2,2.0,2
148,6.2,3.4,5.4,2.3,2


In [80]:
iris.head(10) 

Unnamed: 0,sepalLength,sepalWidth,petalLength,petalWidth,species
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0
5,5.4,3.9,1.7,0.4,0
6,4.6,3.4,1.4,0.3,0
7,5.0,3.4,1.5,0.2,0
8,4.4,2.9,1.4,0.2,0
9,4.9,3.1,1.5,0.1,0


In [81]:
iris.tail()

Unnamed: 0,sepalLength,sepalWidth,petalLength,petalWidth,species
145,6.7,3.0,5.2,2.3,2
146,6.3,2.5,5.0,1.9,2
147,6.5,3.0,5.2,2.0,2
148,6.2,3.4,5.4,2.3,2
149,5.9,3.0,5.1,1.8,2


## Indexing
- df.iloc - integer based indexing
- df.loc - Access a group of rows and columns by label(s) or a boolean array.
- df.where - Replace values where the condition is False.

In [88]:
iris.iloc[45, :]

sepalLength    4.8
sepalWidth     3.0
petalLength    1.4
petalWidth     0.3
species        0.0
Name: 45, dtype: float64

In [116]:
iris.iloc[20:30, :] # all the columns
iris.iloc[20:30, 0] # 0th column
iris.iloc[20:30, 0:3] # 0 to 3 column
iris.iloc[20:30, [0,1,-1]] # 0 to 3 column


Unnamed: 0,sepalLength,sepalWidth,species
20,5.4,3.4,0
21,5.1,3.7,0
22,4.6,3.6,0
23,5.1,3.3,0
24,4.8,3.4,0
25,5.0,3.0,0
26,5.0,3.4,0
27,5.2,3.5,0
28,5.2,3.4,0
29,4.7,3.2,0


In [105]:
temp = (iris['sepalLength'] > 5) & (iris['sepalWidth']>=4)
print(temp.shape)

newdf = iris.loc[temp] # boolean array based indexing in case of loc
newdf

(150,)


Unnamed: 0,sepalLength,sepalWidth,petalLength,petalWidth,species
14,5.8,4.0,1.2,0.2,0
15,5.7,4.4,1.5,0.4,0
32,5.2,4.1,1.5,0.1,0
33,5.5,4.2,1.4,0.2,0


In [110]:
iris.loc[14:20, "sepalLength":"petalLength"]
# iris[14:20, 0:2] # index will give error

Unnamed: 0,sepalLength,sepalWidth,petalLength
14,5.8,4.0,1.2
15,5.7,4.4,1.5
16,5.4,3.9,1.3
17,5.1,3.5,1.4
18,5.7,3.8,1.7
19,5.1,3.8,1.5
20,5.4,3.4,1.7


In [111]:
iris.loc[14:20, ["sepalLength","petalLength"]]

Unnamed: 0,sepalLength,petalLength
14,5.8,1.2
15,5.7,1.5
16,5.4,1.3
17,5.1,1.4
18,5.7,1.7
19,5.1,1.5
20,5.4,1.7


In [118]:
df


Unnamed: 0,A,A2,B,C,D,E,F,G
0,0,5,1,2,3,2,0,0.0
1,4,6,5,6,7,10,12,2.0
2,8,7,9,10,11,18,24,
3,12,8,13,14,15,26,36,
4,16,9,17,18,19,34,48,


In [131]:
newdf = df.where(df>5, 100) 
newdf

Unnamed: 0,A,A2,B,C,D,E,F,G
0,100,100,100,100,100,100,100,100.0
1,100,6,100,6,7,10,12,100.0
2,8,7,9,10,11,18,24,100.0
3,12,8,13,14,15,26,36,100.0
4,16,9,17,18,19,34,48,100.0


In [130]:
# df.isna()
df.fillna(0)

Unnamed: 0,A,A2,B,C,D,E,F,G
0,0,5,1,2,3,2,0,0.0
1,4,6,5,6,7,10,12,2.0
2,8,7,9,10,11,18,24,0.0
3,12,8,13,14,15,26,36,0.0
4,16,9,17,18,19,34,48,0.0


In [134]:
## Error in video code
ig = pd.Series(np.arange(1, 5), index=[0,1, 2, 3])
ig

0    1
1    2
2    3
3    4
dtype: int32

In [135]:
s2 = pd.Series(np.arange(1, 5), index=[5,6, 7, 8])
s2

5    1
6    2
7    3
8    4
dtype: int32

In [136]:
ig > s2

ValueError: Can only compare identically-labeled Series objects