<a href="https://colab.research.google.com/github/sks95/python_basics/blob/main/pandas_pep.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import numpy as np
import pandas as pd

# Series and DataFrame

A Series is a one-dimensional labelled array: a single list of values with an index. A DataFrame is made up of one or more Series that share an index; in other words, a DataFrame is a collection of Series that can be used to analyse tabular data.

In [6]:
# Build a Series from a plain list; with no index given, pandas labels the rows 0..n-1.
A = pd.Series(data=[1, 2, 3, 4])
A

0    1
1    2
2    3
3    4
dtype: int64

In [10]:
# Same constructor, this time with explicit string labels for each value.
A = pd.Series(
    data=['Apple', 'Banana', 'Orange', 'Kiwi'],
    index=list('abcd'),
)
A

a     Apple
b    Banana
c    Orange
d      Kiwi
dtype: object

In [14]:
# Show the Series' underlying data, followed by the type of the container holding it.
print(A.values, type(A.values), sep='\n')

['Apple' 'Banana' 'Orange' 'Kiwi']
<class 'numpy.ndarray'>


In [16]:
# Show the type of the index object first, then the labels it holds.
print(type(A.index), A.index, sep='\n')

<class 'pandas.core.indexes.base.Index'>
Index(['a', 'b', 'c', 'd'], dtype='object')


Slicing

Implicit (positional) - `iloc[start:end)` // end position excluded // positions are created automatically

Explicit (label-based) - `loc[start:end]` // both endpoints included // labels are the index values (often set manually)

In [18]:
# Re-display the labelled Series that the slicing examples below operate on.
A

a     Apple
b    Banana
c    Orange
d      Kiwi
dtype: object

In [25]:
# Positional slice: the stop position (3) is excluded, so positions 0, 1 and 2 are shown.
print(A.iloc[:3])
# Label slice: both endpoints are included, so 'a' through 'd' are shown.
print(A.loc['a':'d'])

a     Apple
b    Banana
c    Orange
dtype: object
a     Apple
b    Banana
c    Orange
d      Kiwi
dtype: object


In [28]:
# A Series whose integer labels are unsorted and contain a duplicate (label 2 appears twice).
C = pd.Series(data=[1, 2, 3, 4, 5], index=[1, 4, 2, 7, 2])
C

1    1
4    2
2    3
7    4
2    5
dtype: int64

In [31]:
# Label-based slicing needs an unambiguous end bound. Label 2 occurs twice in the
# unsorted index of C, so pandas cannot decide where the slice stops and raises
# KeyError. The error is caught and printed so the notebook still runs end to end.
try:
    C.loc[1:2]
except KeyError as err:
    print(f"KeyError: {err}")

Creating from Dictionary

In [46]:
# Plain dictionaries: the keys become the index labels, the values become the data.
weightd = dict(A=60, B=58, C=75)
heightd = dict(A=5.5, B=6, C=6.2)

# Wrap each dictionary in a Series, then display the weights.
weight, height = pd.Series(weightd), pd.Series(heightd)
weight

A    60
B    58
C    75
dtype: int64

In [35]:
# Positional slice of the first two entries (stop position 2 is excluded).
weight.iloc[:2]

A    60
B    58
dtype: int64

# DataFrame

Row indexing -> `df.index`

Column indexing -> `df.columns`

In [38]:
# Build a 3x3 DataFrame from a nested list; each inner list becomes one row.
df = pd.DataFrame(
    data=[
        [1, 2, 3],
        [4, 5, 6],
        [7, 8, 9],
    ]
)
df

Unnamed: 0,0,1,2
0,1,2,3
1,4,5,6
2,7,8,9


In [42]:
# Print the row labels, then the column labels, of the frame built without explicit labels.
print(df.index, df.columns, sep='\n')

RangeIndex(start=0, stop=3, step=1)
RangeIndex(start=0, stop=3, step=1)


In [49]:
# Assemble a DataFrame from the two existing Series; each keyword names one column.
df = pd.DataFrame(dict(Height=height, Weight=weight))
df

Unnamed: 0,Height,Weight
A,5.5,60
B,6.0,58
C,6.2,75


In [53]:
# Row labels, column labels, then the 'Height' column pulled out as a Series.
print(df.index, df.columns, df['Height'], sep='\n')

Index(['A', 'B', 'C'], dtype='object')
Index(['Height', 'Weight'], dtype='object')
A    5.5
B    6.0
C    6.2
Name: Height, dtype: float64


In [56]:
# iloc[1] selects the second row by position and returns it as a Series.
# loc['B':'C'] slices rows by label and includes both endpoints.
print(df.iloc[1], df.loc['B':'C'], sep='\n')

Height     6.0
Weight    58.0
Name: B, dtype: float64
   Height  Weight
B     6.0      58
C     6.2      75


Indexing and Slicing

In [58]:
# Re-display the frame that the indexing examples below operate on.
df

Unnamed: 0,Height,Weight
A,5.5,60
B,6.0,58
C,6.2,75


In [62]:
# Three forms of [] selection on a DataFrame, printed one after another.
print(
    df['Height'],               # a single label -> that column, as a Series
    df[['Height', 'Weight']],   # a list of labels -> a DataFrame with those columns
    df[[True, False, True]],    # a boolean list -> keeps the rows marked True
    sep='\n',
)

A    5.5
B    6.0
C    6.2
Name: Height, dtype: float64
   Height  Weight
A     5.5      60
B     6.0      58
C     6.2      75
   Height  Weight
A     5.5      60
C     6.2      75


In [64]:
# Boolean filter: keep only the rows whose Height is strictly greater than 5.5.
df.loc[df['Height'] > 5.5]

Unnamed: 0,Height,Weight
B,6.0,58
C,6.2,75


# Column Operations

In [84]:
# Start from a fresh two-column frame built from the height and weight Series.
df = pd.DataFrame(dict(Height=height, Weight=weight))
df

Unnamed: 0,Height,Weight
A,5.5,60
B,6.0,58
C,6.2,75


In [86]:
# Derive a new column by scaling 'Height' by 30.48
# (presumably feet -> centimetres, 30.48 cm per foot; confirm the unit of 'Height').
df['Height_CM'] = 30.48 * df['Height']
df

Unnamed: 0,Height,Weight,Height_CM
A,5.5,60,167.64
B,6.0,58,182.88
C,6.2,75,188.976


In [87]:
# Alternative for removing just the column: del df['Height_CM']

# Remove the derived column and row 'A' in a single call. The result is rebound
# to `df` instead of using inplace=True, which keeps the step explicit and chainable.
df = df.drop(columns=['Height_CM'], index='A')
df

Unnamed: 0,Height,Weight
B,6.0,58
C,6.2,75


In [88]:
# Rebuild the two-column frame from the original height and weight Series.
df = pd.DataFrame(dict(Height=height, Weight=weight))
df

Unnamed: 0,Height,Weight
A,5.5,60
B,6.0,58
C,6.2,75


In [75]:
# Print a structural summary of the frame: index range, columns, non-null counts, dtypes, memory.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3 entries, A to C
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Height     3 non-null      float64
 1   Weight     3 non-null      int64  
 2   Height_CM  3 non-null      float64
dtypes: float64(2), int64(1)
memory usage: 204.0+ bytes


In [81]:
# head(2) shows the first two rows; tail(2) shows the last two rows.
print(df.head(2), df.tail(2), sep='\n')

   Height  Weight  Height_CM
A     5.5      60     167.64
B     6.0      58     182.88
   Height  Weight  Height_CM
B     6.0      58    182.880
C     6.2      75    188.976


In [80]:
# Summary statistics (count, mean, std, min, quartiles, max) for each numeric column.
df.describe()

Unnamed: 0,Height,Weight,Height_CM
count,3.0,3.0,3.0
mean,5.9,64.333333,179.832
std,0.360555,9.291573,10.98972
min,5.5,58.0,167.64
25%,5.75,59.0,175.26
50%,6.0,60.0,182.88
75%,6.1,67.5,185.928
max,6.2,75.0,188.976


In [None]:
# Alternative for removing just the column: del df['Height_CM']

# Remove the 'Height_CM' column and row 'A'. When the notebook is run top to bottom,
# `df` was last rebuilt from `height` and `weight` only, so 'Height_CM' may be absent;
# errors='ignore' skips missing labels instead of raising KeyError. The result is
# rebound rather than mutated in place, so re-running the cell is safe.
df = df.drop(columns=['Height_CM'], index='A', errors='ignore')