In [None]:
# Creating a DataFrame

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Column-oriented input: each key becomes a column, each list holds that column's values.
data = dict(
    Name=['John', 'Anna', 'Peter', 'Linda'],
    Age=[28, 34, 29, 42],
    City=['New York', 'Paris', 'Berlin', 'London'],
    Salary=[65000, 70000, 62000, 85000],
)
df = pd.DataFrame(data)

In [3]:
# Display the frame built from the dict of column lists.
df

Unnamed: 0,Name,Age,City,Salary
0,John,28,New York,65000
1,Anna,34,Paris,70000
2,Peter,29,Berlin,62000
3,Linda,42,London,85000


In [5]:
# Row-oriented input: every inner list is one record, so the column
# labels have to be supplied separately.
columns = ["Name", "Age", "City", "Salary"]
data_list = [
    ['John', 28, 'New York', 65000],
    ['Anna', 34, 'Paris', 70000],
    ['Peter', 29, 'Berlin', 62000],
    ['Linda', 42, 'London', 85000],
]
df2 = pd.DataFrame(data=data_list, columns=columns)
df2

Unnamed: 0,Name,Age,City,Salary
0,John,28,New York,65000
1,Anna,34,Paris,70000
2,Peter,29,Berlin,62000
3,Linda,42,London,85000


In [None]:
# Selecting from a DataFrame

In [6]:
# Single brackets with one column label return that column as a Series.
df2['Name']

0     John
1     Anna
2    Peter
3    Linda
Name: Name, dtype: object

In [7]:
# A list of labels inside the brackets returns a DataFrame with just those columns.
df2[['Name', 'City']]

Unnamed: 0,Name,City
0,John,New York
1,Anna,Paris
2,Peter,Berlin
3,Linda,London


In [8]:
# Creating a column

In [9]:
# Assigning to a new column label adds the column to df2 in place;
# the list must supply one value per row.
df2['Designation'] = ['Doctor', 'Engineer', 'Teacher', 'Lawyer']

In [10]:
# df2 now carries the new 'Designation' column.
df2

Unnamed: 0,Name,Age,City,Salary,Designation
0,John,28,New York,65000,Doctor
1,Anna,34,Paris,70000,Engineer
2,Peter,29,Berlin,62000,Teacher
3,Linda,42,London,85000,Lawyer


In [12]:
# Removing Columns

In [13]:
# Returns a new frame without 'Designation'; the result is only displayed,
# so df2 itself is left untouched.
df2.drop(columns='Designation')

Unnamed: 0,Name,Age,City,Salary
0,John,28,New York,65000
1,Anna,34,Paris,70000
2,Peter,29,Berlin,62000
3,Linda,42,London,85000


In [14]:
# df2 still has 'Designation': the preceding drop() returned a new frame
# and its result was not assigned back.
df2

Unnamed: 0,Name,Age,City,Salary,Designation
0,John,28,New York,65000,Doctor
1,Anna,34,Paris,70000,Engineer
2,Peter,29,Berlin,62000,Teacher
3,Linda,42,London,85000,Lawyer


In [15]:
# Remove 'Designation' for good by rebinding df2 to the returned frame.
# Reassignment is preferred over inplace=True, and errors='ignore' keeps the
# cell re-runnable: a second run no longer raises KeyError for the missing column.
df2 = df2.drop(columns='Designation', errors='ignore')

In [16]:
# 'Designation' is now gone from df2.
df2

Unnamed: 0,Name,Age,City,Salary
0,John,28,New York,65000
1,Anna,34,Paris,70000
2,Peter,29,Berlin,62000
3,Linda,42,London,85000


In [17]:
# Several columns can be dropped at once by passing a list of labels.
# The result is only displayed, not assigned, so df2 keeps both columns.
df2.drop(columns=['City', 'Salary'])

Unnamed: 0,Name,Age
0,John,28
1,Anna,34
2,Peter,29
3,Linda,42


In [18]:
# df2 still has 'City' and 'Salary': the preceding drop() result was not assigned.
df2

Unnamed: 0,Name,Age,City,Salary
0,John,28,New York,65000
1,Anna,34,Paris,70000
2,Peter,29,Berlin,62000
3,Linda,42,London,85000


In [19]:
# Selecting the Rows

In [21]:
# .loc selects by index label; a single label returns that row as a Series.
df2.loc[0]

Name          John
Age             28
City      New York
Salary       65000
Name: 0, dtype: object

In [26]:
# A list of labels returns a DataFrame containing those rows.
df2.loc[[0,1]]

Unnamed: 0,Name,Age,City,Salary
0,John,28,New York,65000
1,Anna,34,Paris,70000


In [33]:
# .iloc selects by integer position (0-based), independent of the index labels.
df2.iloc[3]

Name       Linda
Age           42
City      London
Salary     85000
Name: 3, dtype: object

In [None]:
# Subset Selection

In [35]:
# Select rows 0-1 and the 'City'/'Salary' columns in one .loc call
# (rows first, then columns) instead of chaining two indexing operations,
# which would build an intermediate frame.
df2.loc[[0, 1], ['City', 'Salary']]

Unnamed: 0,City,Salary
0,New York,65000
1,Paris,70000


In [36]:
# Show the full frame again for reference before the next subset selection.
df2

Unnamed: 0,Name,Age,City,Salary
0,John,28,New York,65000
1,Anna,34,Paris,70000
2,Peter,29,Berlin,62000
3,Linda,42,London,85000


In [37]:
# Rows 2-3 restricted to 'Name' and 'Age', done as a single .loc[rows, columns]
# lookup rather than a chained row-then-column selection.
df2.loc[[2, 3], ['Name', 'Age']]

Unnamed: 0,Name,Age
2,Peter,29
3,Linda,42


In [38]:
# Conditional Selection

In [39]:
# Return the people whose age is above 30

In [40]:
# df2['Age'] > 30 yields a boolean Series; indexing with it keeps only the True rows.
df2[df2['Age'] > 30]

Unnamed: 0,Name,Age,City,Salary
1,Anna,34,Paris,70000
3,Linda,42,London,85000


In [42]:
# People older than 30 who live in Paris: build each condition as its own
# boolean mask, then combine them element-wise with & (not the `and` keyword).
older_than_30 = df2['Age'] > 30
lives_in_paris = df2['City'] == 'Paris'
df2[older_than_30 & lives_in_paris]

Unnamed: 0,Name,Age,City,Salary
1,Anna,34,Paris,70000


In [43]:
# Find Missing Data

In [44]:
# Small numeric frame with deliberately placed NaNs for the missing-data examples.
# Note: this rebinds `data` and `df` from the first section of the notebook.
data = dict(
    A=[1, 2, np.nan, 4, 5],
    B=[1, 2, 3, 4, 5],
    C=[1, 2, 3, np.nan, np.nan],
    D=[1, np.nan, np.nan, np.nan, 5],
)
df = pd.DataFrame(data)

In [45]:
# Display the frame; missing entries show up as NaN.
df

Unnamed: 0,A,B,C,D
0,1.0,1,1.0,1.0
1,2.0,2,2.0,
2,,3,3.0,
3,4.0,4,,
4,5.0,5,,5.0


In [46]:
df.isna()  # element-wise mask: True wherever a value is missing (NaN)

Unnamed: 0,A,B,C,D
0,False,False,False,False
1,False,False,False,True
2,True,False,False,True
3,False,False,True,True
4,False,False,True,False


In [47]:
# Summing the boolean mask counts the missing values in each column.
df.isna().sum()

A    1
B    0
C    2
D    3
dtype: int64

In [48]:
# True for every column that contains at least one missing value.
df.isna().any()

A     True
B    False
C     True
D     True
dtype: bool

In [49]:
# Removing the Missing Data

In [50]:
df.dropna()  # drops every row that contains at least one NaN; returns a new frame

Unnamed: 0,A,B,C,D
0,1.0,1,1.0,1.0


In [53]:
# thresh=4 keeps only rows with at least 4 non-NaN values; since the frame has
# 4 columns, this gives the same result as plain dropna() here.
df.dropna(thresh=4)

Unnamed: 0,A,B,C,D
0,1.0,1,1.0,1.0


In [54]:
# Fill the missing data

In [56]:
df.fillna(0)  # replace every NaN in the frame with 0; returns a new frame

Unnamed: 0,A,B,C,D
0,1.0,1,1.0,1.0
1,2.0,2,2.0,0.0
2,0.0,3,3.0,0.0
3,4.0,4,0.0,0.0
4,5.0,5,0.0,5.0


In [57]:
# Per-column fill values: a dict maps each column label to the replacement
# used for that column's NaNs. The entry for 'B' has no effect because
# 'B' contains no missing values.
values = dict(A=0, B=200, C=300, D=400)
df.fillna(values)

Unnamed: 0,A,B,C,D
0,1.0,1,1.0,1.0
1,2.0,2,2.0,400.0
2,0.0,3,3.0,400.0
3,4.0,4,300.0,400.0
4,5.0,5,300.0,5.0


In [58]:
# df still contains its NaNs: the fillna() calls above returned new frames
# and were not assigned back.
df

Unnamed: 0,A,B,C,D
0,1.0,1,1.0,1.0
1,2.0,2,2.0,
2,,3,3.0,
3,4.0,4,,
4,5.0,5,,5.0


In [59]:
# df.mean() gives one mean per column (NaNs are skipped by default);
# passing that Series to fillna() fills each column's NaNs with its own mean.
df.fillna(df.mean())

Unnamed: 0,A,B,C,D
0,1.0,1,1.0,1.0
1,2.0,2,2.0,3.0
2,3.0,3,3.0,3.0
3,4.0,4,2.0,3.0
4,5.0,5,2.0,5.0
