## **DataFrame**

> *A Pandas DataFrame is a two-dimensional table-like structure in Python where data is arranged in rows and columns.*

In [104]:
import pandas as pd

- **DF from list**

In [105]:
# Build a one-column DataFrame from a flat list of strings.
# No column name is supplied, so the single column gets the default label 0.
lst = [
    "Geeks",
    "For",
    "Geeks",
    "is",
    "portal",
    "for",
    "Geeks",
]

df = pd.DataFrame(data=lst)
df

Unnamed: 0,0
0,Geeks
1,For
2,Geeks
3,is
4,portal
5,for
6,Geeks


- **DF from dict / ndarray**

In [106]:
# Column-oriented input: every key becomes a column label and every list
# supplies that column's values (all lists have the same length, 5).
data = {
    "Name": ["Jai", "Princi", "Gaurav", "Anuj", "Biruj"],
    "Age": [27, 24, 22, 32, 27],
    "Address": ["Delhi", "Kanpur", "Delhi", "Kanpur", "Delhi"],
    "Qualification": ["Msc", "MA", "MCA", "Phd", "MCA"],
}

df = pd.DataFrame(data=data)
df

Unnamed: 0,Name,Age,Address,Qualification
0,Jai,27,Delhi,Msc
1,Princi,24,Kanpur,MA
2,Gaurav,22,Delhi,MCA
3,Anuj,32,Kanpur,Phd
4,Biruj,27,Delhi,MCA


- **Accessing columns**

In [107]:
# Select a subset of columns by passing a list of labels; the result is a
# new DataFrame holding only those columns, in the order listed.
req_col_filters = ["Name", "Qualification"]
req_cols = df.loc[:, req_col_filters]
req_cols

Unnamed: 0,Name,Qualification
0,Jai,Msc
1,Princi,MA
2,Gaurav,MCA
3,Anuj,Phd
4,Biruj,MCA


- **Accessing a row**
    - *Rows can be accessed by the labels of the index column;*
        `by default, the index is 0, 1, 2, ...`

In [108]:
# Use the "Address" column as the row index. set_index returns a new frame,
# so `df` itself keeps its default 0..4 index.
add_indexed_df = df.set_index(keys="Address")
add_indexed_df

Unnamed: 0_level_0,Name,Age,Qualification
Address,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Delhi,Jai,27,Msc
Kanpur,Princi,24,MA
Delhi,Gaurav,22,MCA
Kanpur,Anuj,32,Phd
Delhi,Biruj,27,MCA


In [109]:
# Label-based row lookup: "Delhi" appears several times in the index, so
# every matching row is returned (with all columns).
peeps_from_delhi = add_indexed_df.loc["Delhi", :]
peeps_from_delhi

Unnamed: 0_level_0,Name,Age,Qualification
Address,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Delhi,Jai,27,Msc
Delhi,Gaurav,22,MCA
Delhi,Biruj,27,MCA


- **Missing value checking**

In [110]:
# Boolean mask with the same shape as df: True where a value is missing.
# df has no missing values, so every cell is False.
df.isnull()

Unnamed: 0,Name,Age,Address,Qualification
0,False,False,False,False
1,False,False,False,False
2,False,False,False,False
3,False,False,False,False
4,False,False,False,False


In [111]:
# Inverse mask: True where a value is present (all True for this frame).
df.notnull()

Unnamed: 0,Name,Age,Address,Qualification
0,True,True,True,True
1,True,True,True,True
2,True,True,True,True
3,True,True,True,True
4,True,True,True,True


In [112]:
# isna() produces the same mask as isnull() above (pandas documents
# isnull as an alias of isna).
df.isna()

Unnamed: 0,Name,Age,Address,Qualification
0,False,False,False,False
1,False,False,False,False
2,False,False,False,False
3,False,False,False,False
4,False,False,False,False


- **Missing data handling**

In [113]:
import numpy as np

In [114]:
# Three score columns, each with exactly one missing entry (np.nan), used by
# the missing-data examples that follow.
# The source mapping is intentionally NOT named `dict`: binding that name
# would shadow the built-in dict type for the rest of the kernel session.
df2_scores = {
    'First Score': [100, 90, np.nan, 95],
    'Second Score': [30, 45, 56, np.nan],
    'Third Score': [np.nan, 40, 80, 98],
}
df2 = pd.DataFrame(df2_scores)
df2

Unnamed: 0,First Score,Second Score,Third Score
0,100.0,30.0,
1,90.0,45.0,40.0
2,,56.0,80.0
3,95.0,,98.0


In [115]:
# Replace every missing value with a constant (0 here). fillna returns a new
# frame; df2 itself still contains its NaNs afterwards.
df2.fillna(value=0)

Unnamed: 0,First Score,Second Score,Third Score
0,100.0,30.0,0.0
1,90.0,45.0,40.0
2,0.0,56.0,80.0
3,95.0,0.0,98.0


In [116]:
# Four score columns with missing entries spread so that only the last row
# (index 3) is complete -- used to demonstrate dropna below.
# The source mapping is intentionally NOT named `dict`: binding that name
# would shadow the built-in dict type for the rest of the kernel session.
df3_scores = {
    'First Score': [100, 90, np.nan, 95],
    'Second Score': [30, np.nan, 45, 56],
    'Third Score': [52, 40, 80, 98],
    'Fourth Score': [np.nan, np.nan, np.nan, 65],
}
df3 = pd.DataFrame(df3_scores)
df3

Unnamed: 0,First Score,Second Score,Third Score,Fourth Score
0,100.0,30.0,52,
1,90.0,,40,
2,,45.0,80,
3,95.0,56.0,98,65.0


In [126]:
# Drop every row that contains at least one missing value. Spelled out with
# the defaults made explicit: axis=0 (rows), how="any" (one NaN is enough).
# Returns a new frame; df3 is left untouched.
df3.dropna(axis=0, how="any")

Unnamed: 0,First Score,Second Score,Third Score,Fourth Score
3,95.0,56.0,98,65.0


- **Rows iteration**

In [118]:
# iterrows() yields (index label, row) pairs, where each row is a Series
# keyed by column name. The label is not used here.
# The loop variables are deliberately not called `id`: that would shadow the
# built-in id() and remain bound after the loop finishes.
for row_label, row in df.iterrows():
    print(row)

Name               Jai
Age                 27
Address          Delhi
Qualification      Msc
Name: 0, dtype: object
Name             Princi
Age                  24
Address          Kanpur
Qualification        MA
Name: 1, dtype: object
Name             Gaurav
Age                  22
Address           Delhi
Qualification       MCA
Name: 2, dtype: object
Name               Anuj
Age                  32
Address          Kanpur
Qualification       Phd
Name: 3, dtype: object
Name             Biruj
Age                 27
Address          Delhi
Qualification      MCA
Name: 4, dtype: object


- **Columns iteration**

In [119]:
# Column labels as a plain Python list (iterating a DataFrame, e.g. list(df),
# yields the same labels).
all_cls = df.columns.tolist()
all_cls

['Name', 'Age', 'Address', 'Qualification']

In [120]:
# Print each value of the row labelled 1, one column at a time.
# A single .loc[row, column] lookup replaces the chained df[cl][1] form:
# it is explicitly label-based and avoids building an intermediate Series.
for cl in all_cls:
    print(df.loc[1, cl])

Princi
24
Kanpur
MA


- **Some more methods**

In [121]:
# Insert a new "Experience" column at position 1 (right after "Name").
# DataFrame.insert mutates df in place and raises ValueError when the column
# already exists, so the guard keeps this cell safe to re-run.
if "Experience" not in df.columns:
    df.insert(loc=1, column="Experience", value=[4, 8, 10, 4, 1])

In [122]:
# Display df after the in-place insert above: "Experience" is now column 1.
df

Unnamed: 0,Name,Experience,Age,Address,Qualification
0,Jai,4,27,Delhi,Msc
1,Princi,8,24,Kanpur,MA
2,Gaurav,10,22,Delhi,MCA
3,Anuj,4,32,Kanpur,Phd
4,Biruj,1,27,Delhi,MCA


In [123]:
# Distinct values of one column, returned as a NumPy array in order of
# first appearance.
df.loc[:, "Address"].unique()

array(['Delhi', 'Kanpur'], dtype=object)

In [124]:
# Add a constant to every cell of df2. Missing values stay missing
# (NaN + 2 is still NaN); a new frame is returned.
df2.add(other=2)

Unnamed: 0,First Score,Second Score,Third Score
0,102.0,32.0,
1,92.0,47.0,42.0
2,,58.0,82.0
3,97.0,,100.0


In [127]:
# Add a constant to every cell, substituting a constant for missing values
# first: each NaN is treated as fill_value (5), so it becomes 5 + 1 = 6.
df2.add(other=1, fill_value=5)

Unnamed: 0,First Score,Second Score,Third Score
0,101.0,31.0,6.0
1,91.0,46.0,41.0
2,6.0,57.0,81.0
3,96.0,6.0,99.0


In [131]:
# Sort the columns by their labels: axis=1 orders the column labels, not the rows.
# df2's column labels are already in alphabetical order, so the displayed
# frame is unchanged. Use axis=0 to sort the rows by index label instead.
df2.sort_index(axis=1)


Unnamed: 0,First Score,Second Score,Third Score
0,100.0,30.0,
1,90.0,45.0,40.0
2,,56.0,80.0
3,95.0,,98.0
