<a href="https://colab.research.google.com/github/shashankv05/Colab_Notebooks/blob/main/Cleaning_Data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd

# Figure out why the data is missing
# This is the point at which we get into the part of data science that I like to call "data intuition", by which I mean "really looking at your data and trying to figure out why it is the way it is and how that will affect your analysis". It can be a frustrating part of data science, especially if you're newer to the field and don't have a lot of experience. For dealing with missing values, you'll need to use your intuition to figure out why the value is missing. One of the most important questions you can ask yourself to help figure this out is this:

# Is this value missing because it wasn't recorded or because it doesn't exist?

# If a value is missing because it doesn't exist (like the height of the oldest child of someone who doesn't have any children) then it doesn't make sense to try and guess what it might be. These values you probably do want to keep as NaN. On the other hand, if a value is missing because it wasn't recorded, then you can try to guess what it might have been based on the other values in that column and row. This is called imputation, and we'll learn how to do it next! :)

In [None]:
type(np.nan), type(None)

(float, NoneType)

In [None]:
pd.isnull(np.nan) , pd.notnull(np.nan)

(True, False)

In [None]:
pd.isnull(None), pd.notnull(None)

(True, False)

In [None]:
pd.notnull(5)

True

In [None]:
pd.isnull(pd.Series([

    1, np.nan, 4, 5
])
)

0    False
1     True
2    False
3    False
dtype: bool

In [None]:
pd.notnull(pd.Series([
    1, 4, 5, np.nan, 7

])
)

0     True
1     True
2     True
3    False
4     True
dtype: bool

In [None]:
pd.isnull(pd.DataFrame({
    'Column_A': [1, np.nan, 7],
    'Column_B': [np.nan, 4, 5],
    'Column_C':[np.nan, 4, 1]

})
)

Unnamed: 0,Column_A,Column_B,Column_C
0,False,True,True
1,True,False,False
2,False,False,False


In [None]:
# Eight-element float Series with NaN at positions 2 and 6, used for the null-check demos.
series = pd.Series(
    [1.0, 2.0, np.nan, 45.0, 7.0, 1.0, np.nan, 87.0]
)

In [None]:
series

0     1.0
1     2.0
2     NaN
3    45.0
4     7.0
5     1.0
6     NaN
7    87.0
dtype: float64

In [None]:
pd.notnull(series).sum()  # Every True is 1 and False is 0 and that's how we can sum 

6

In [None]:
pd.isnull(series).sum()

2

In [None]:
pd.notnull(series), series[pd.notnull(series)]

(0     True
 1     True
 2    False
 3     True
 4     True
 5     True
 6    False
 7     True
 dtype: bool,
 0     1.0
 1     2.0
 3    45.0
 4     7.0
 5     1.0
 7    87.0
 dtype: float64)

In [None]:
series.isnull()

0    False
1    False
2     True
3    False
4    False
5    False
6     True
7    False
dtype: bool

In [None]:
series.notnull()

0     True
1     True
2    False
3     True
4     True
5     True
6    False
7     True
dtype: bool

In [None]:
series[series.notnull()]

0     1.0
1     2.0
3    45.0
4     7.0
5     1.0
7    87.0
dtype: float64

In [None]:
series.dropna()    # Dropping all the na values

0     1.0
1     2.0
3    45.0
4     7.0
5     1.0
7    87.0
dtype: float64

In [None]:
# Small frame with NaNs scattered across three of its four columns.
# Column_C is the only column with no missing values.
df = pd.DataFrame({
    'Column_A': [1, np.nan, 7, 5],
    'Column_B': [np.nan, np.nan, 1, 2],
    'Column_C': [9, 5, 1, 7],  # key has no trailing space, so df['Column_C'] works
    'Column_D': [1, 6, np.nan, 5],
})

In [None]:
# Boolean mask of the rows whose Column_A value is greater than 2.
index = df['Column_A'] > 2
# Pull Column_A for just those rows in a single .loc selection.
df.loc[index, 'Column_A']

2    7.0
3    5.0
Name: Column_A, dtype: float64

In [None]:
# Keep only the rows in which Column_A and Column_B are both non-null.
both_present = df[['Column_A', 'Column_B']].notnull().all(axis=1)
df[both_present]


Unnamed: 0,Column_A,Column_B,Column_C,Column_D
2,7.0,1.0,1,
3,5.0,2.0,7,5.0


In [None]:
# Per-column count of missing cells (each True counts as 1 when summed).
missing_values = df.isna().sum()
missing_values

Column_A     1
Column_B     2
Column_C     0
Column_D     1
dtype: int64

In [None]:
# Share of all cells in `df` that are missing, expressed as a percentage.
total_missing_values = missing_values.sum()  # per-column NaN counts added into one total

# DataFrame.size is rows * columns. It replaces np.product(df.shape), an alias
# that was deprecated in NumPy 1.25 and removed in NumPy 2.0.
total_cells = df.size

percent_missing_values = (total_missing_values / total_cells) * 100
print(percent_missing_values, "%")

25.0 %


In [None]:
df.notnull()

Unnamed: 0,Column_A,Column_B,Column_C,Column_D
0,True,False,True,True
1,False,False,True,True
2,True,True,True,False
3,True,True,True,True


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Column_A   3 non-null      float64
 1   Column_B   2 non-null      float64
 2   Column_C   4 non-null      int64  
 3   Column_D   3 non-null      float64
dtypes: float64(3), int64(1)
memory usage: 256.0 bytes


In [None]:
df.dropna()

Unnamed: 0,Column_A,Column_B,Column_C,Column_D
3,5.0,2.0,7,5.0


In [None]:
df.dropna(axis=1)

Unnamed: 0,Column_C
0,9
1,5
2,1
3,7


In [None]:
df.dropna(how='all')  # if all the column values are null for a row then drop that row

Unnamed: 0,Column_A,Column_B,Column_C,Column_D
0,1.0,,9,1.0
1,,,5,6.0
2,7.0,1.0,1,
3,5.0,2.0,7,5.0


In [None]:
df.dropna(how='any', axis=1) # if any of the column value is null drop that column

Unnamed: 0,Column_C
0,9
1,5
2,1
3,7


In [None]:
df.dropna(thresh = 1)  # thresh = 1 says , I need only 1 not-null value to have the row

Unnamed: 0,Column_A,Column_B,Column_C,Column_D
0,1.0,,9,1.0
1,,,5,6.0
2,7.0,1.0,1,
3,5.0,2.0,7,5.0


In [None]:
df.dropna(thresh= 4)  # thresh=4: keep only rows with at least 4 non-null values (here, every column must be filled)

Unnamed: 0,Column_A,Column_B,Column_C,Column_D
3,5.0,2.0,7,5.0


In [None]:
df.dropna(thresh = 3 , axis = 'rows')
# df.dropna(thresh = 3 , axis = 'columns')

Unnamed: 0,Column_A,Column_B,Column_C,Column_D
0,1.0,,9,1.0
2,7.0,1.0,1,
3,5.0,2.0,7,5.0


In [None]:
series  =  pd.Series([1,2, np.nan, 45, 7,  1, np.nan, 87])

In [None]:
# series.fillna(0, inplace= True)
series.fillna(0)

0     1.0
1     2.0
2     0.0
3    45.0
4     7.0
5     1.0
6     0.0
7    87.0
dtype: float64

In [None]:
series

0     1.0
1     2.0
2     NaN
3    45.0
4     7.0
5     1.0
6     NaN
7    87.0
dtype: float64

In [None]:
series.fillna(series.mean())

0     1.000000
1     2.000000
2    23.833333
3    45.000000
4     7.000000
5     1.000000
6    23.833333
7    87.000000
dtype: float64

In [None]:
series

0     1.0
1     2.0
2     NaN
3    45.0
4     7.0
5     1.0
6     NaN
7    87.0
dtype: float64

In [None]:
# Forward fill: each NaN takes the last valid value before it.
# .ffill() replaces fillna(method='ffill'), which is deprecated since pandas 2.1.
series.ffill()

0     1.0
1     2.0
2     2.0
3    45.0
4     7.0
5     1.0
6     1.0
7    87.0
dtype: float64

In [None]:
# Backward fill: each NaN takes the next valid value after it.
# .bfill() replaces fillna(method='bfill'), which is deprecated since pandas 2.1.
series.bfill()

0     1.0
1     2.0
2    45.0
3    45.0
4     7.0
5     1.0
6    87.0
7    87.0
dtype: float64

In [None]:
# A leading NaN has no earlier value to copy, so forward fill leaves it as NaN.
pd.Series([np.nan, 1, 4, 5, 7]).ffill()

0    NaN
1    1.0
2    4.0
3    5.0
4    7.0
dtype: float64

In [None]:
# A trailing NaN has no later value to copy, so backward fill leaves it as NaN.
pd.Series([1, 4, 5, 7, np.nan]).bfill()

0    1.0
1    4.0
2    5.0
3    7.0
4    NaN
dtype: float64

In [None]:
# df.loc[:,'Column_B':'Column_D']
df

Unnamed: 0,Column_A,Column_B,Column_C,Column_D
0,1.0,,9,1.0
1,,,5,6.0
2,7.0,1.0,1,
3,5.0,2.0,7,5.0


In [None]:
# axis=0 (the default): each NaN takes the next valid value below it in the same column.
# .bfill() replaces fillna(method='bfill'), which is deprecated since pandas 2.1.
df.bfill(axis=0)

Unnamed: 0,Column_A,Column_B,Column_C,Column_D
0,1.0,1.0,9,1.0
1,7.0,1.0,5,6.0
2,7.0,1.0,1,5.0
3,5.0,2.0,7,5.0


In [None]:
df

Unnamed: 0,Column_A,Column_B,Column_C,Column_D
0,1.0,,9,1.0
1,,,5,6.0
2,7.0,1.0,1,
3,5.0,2.0,7,5.0


In [None]:
# axis=1: each NaN takes the nearest valid value to its left in the same row.
# A NaN with nothing valid to its left stays NaN.
# .ffill() replaces fillna(method='ffill'), which is deprecated since pandas 2.1.
df.ffill(axis=1)

Unnamed: 0,Column_A,Column_B,Column_C,Column_D
0,1.0,1.0,9.0,1.0
1,,,5.0,6.0
2,7.0,1.0,1.0,1.0
3,5.0,2.0,7.0,5.0


In [None]:
df.fillna({'Column_A' : 0 , 'Column_B': 45, 'Column_D': df['Column_D'].mean()})

Unnamed: 0,Column_A,Column_B,Column_C,Column_D
0,1.0,45.0,9,1.0
1,0.0,45.0,5,6.0
2,7.0,1.0,1,4.0
3,5.0,2.0,7,5.0


In [None]:
series.dropna().count()   # After dropping the NaN values, count() will return the total elements

6

In [None]:
# Number of NaN entries in `series`: sum the boolean null mask directly.
missing_Values = series.isnull().sum()

In [None]:
missing_Values

2

In [None]:
pd.Series([True, False, True, False]).any()  # Return True if atleast 1 element in the Series is True

True

In [None]:
pd.Series([True,False]).all()

False

In [None]:
pd.Series([True,True]).all()

True

In [None]:
pd.Series([1,2]).isnull().any()

False

In [None]:
series.isnull().any()

True

In [None]:
# Toy frame with deliberately dirty entries: 'D' and '?' in Sex, 250 in Age.
sex_values = ['M', 'F', 'F', 'D', '?']
age_values = [21, 23, 21, 250, 24]
df = pd.DataFrame({'Sex': sex_values, 'Age': age_values})

In [None]:
df

Unnamed: 0,Sex,Age
0,M,21
1,F,23
2,F,21
3,D,250
4,?,24


In [None]:
# df.loc[3,['Sex']] = 'M'

In [None]:
df['Sex'].unique()

array(['M', 'F', 'D', '?'], dtype=object)

In [None]:
df['Sex'].value_counts()

F    2
M    1
D    1
?    1
Name: Sex, dtype: int64

In [None]:
df['Sex'].replace('D','F')

0    M
1    F
2    F
3    F
4    ?
Name: Sex, dtype: object

In [None]:
df['Sex'].replace({
    'D':'F',
    '?':'M'
})

0    M
1    F
2    F
3    F
4    M
Name: Sex, dtype: object

In [None]:
# Clean both columns in one call: the outer keys are column names and each
# inner dict maps an old value to its replacement.
df = df.replace({
    'Sex':{
        'D':'F',
        '?':'M'
    },
    'Age':{
        # NOTE(review): 'Age' holds 250, not 251, so this mapping matches nothing
        # and the outlier is still present after this cell. Confirm whether 250
        # was intended.
        251 : 25
    }

})

In [None]:
df['Age'] > 23

0    False
1    False
2    False
3     True
4     True
Name: Age, dtype: bool

In [None]:
df.loc[df['Age'] > 23]

Unnamed: 0,Sex,Age
3,F,250
4,M,24


In [None]:
df.loc[df['Age'] > 23, 'Sex']

3    F
4    M
Name: Sex, dtype: object

In [None]:
df.loc[df['Age'] > 100, 'Age'] 

3    250
Name: Age, dtype: int64

In [None]:
# Divide every age above 100 by 10 (250 -> 25.0); the mask is computed once and reused.
over_100 = df['Age'] > 100
df.loc[over_100, 'Age'] = df.loc[over_100, 'Age'] / 10

In [None]:
df = df.astype({'Age': int})

In [None]:
df['Age']

0    21
1    23
2    21
3    25
4    24
Name: Age, dtype: int32

In [None]:
# All rows, second column by position; the list selector keeps the result a DataFrame.
df.iloc[:, [1]]

Unnamed: 0,Age
0,21
1,23
2,21
3,25
4,24


In [None]:
# Country for each ambassador; the person's name is the index label.
ambassadors = pd.Series({
    'Shashank Verma': 'India',
    'Elon Musk': 'United States',
    'Michio Kaku': 'Japan',
    'Robert Jr.': 'United States',
    'Sarita Verma': 'India',
    'Angelina': 'United States',
    'Rajesh Kumar Verma': 'India',
})

In [None]:
ambassadors.duplicated()

Shashank Verma        False
Elon Musk             False
Michio Kaku           False
Robert Jr.             True
Sarita Verma           True
Angelina               True
Rajesh Kumar Verma     True
dtype: bool

In [None]:
ambassadors.duplicated(keep='last')

Shashank Verma         True
Elon Musk              True
Michio Kaku           False
Robert Jr.             True
Sarita Verma           True
Angelina              False
Rajesh Kumar Verma    False
dtype: bool

In [None]:
ambassadors.duplicated(keep=False)    # keep=False flags every occurrence of a repeated value; only values that appear once are False

Shashank Verma         True
Elon Musk              True
Michio Kaku           False
Robert Jr.             True
Sarita Verma           True
Angelina               True
Rajesh Kumar Verma     True
dtype: bool

In [None]:
ambassadors.drop_duplicates()

Shashank Verma            India
Elon Musk         United States
Michio Kaku               Japan
dtype: object

In [None]:
ambassadors.drop_duplicates(keep='last')

Michio Kaku                   Japan
Angelina              United States
Rajesh Kumar Verma            India
dtype: object

In [None]:
ambassadors.drop_duplicates(keep=False)

Michio Kaku    Japan
dtype: object

In [None]:
# One (player, role) record per row; the last row repeats row 3 exactly.
player_roles = [
    ('Shashank', 'Batsman'),
    ('Shreyank', 'Bowler'),
    ('MS Dhoni', 'Batsman/WK'),
    ('Shashank', 'Batsman/WK'),
    ('Yuvraj', 'All-Rounder'),
    ('Shashank', 'All-Rounder'),
    ('Shashank', 'Batsman/WK'),
]
df = pd.DataFrame(player_roles, columns=['Players', 'Role'])

In [None]:
df

Unnamed: 0,Players,Role
0,Shashank,Batsman
1,Shreyank,Bowler
2,MS Dhoni,Batsman/WK
3,Shashank,Batsman/WK
4,Yuvraj,All-Rounder
5,Shashank,All-Rounder
6,Shashank,Batsman/WK


In [None]:
df.duplicated()

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

In [None]:
df.duplicated(subset= ['Players'])

0    False
1    False
2    False
3     True
4    False
5     True
6     True
dtype: bool

In [None]:
df.duplicated(subset = ['Role'])

0    False
1    False
2    False
3     True
4    False
5     True
6     True
dtype: bool

In [None]:
# Underscore-delimited records: year, sex, country, number of children.
# Two years carry a stray '?' and one country code contains a space.
df = pd.Series(
    ['1993?_M_IND_2', '1997_F_IND_2', '1993_F_IND_1', '1996?_F_I ND_1'],
    name='Data',
).to_frame()

In [None]:
df

Unnamed: 0,Data
0,1993?_M_IND_2
1,1997_F_IND_2
2,1993_F_IND_1
3,1996?_F_I ND_1


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Data    4 non-null      object
dtypes: object(1)
memory usage: 160.0+ bytes


In [None]:
df['Data'].str.split('_')

0     [1993?, M, IND, 2]
1      [1997, F, IND, 2]
2      [1993, F, IND, 1]
3    [1996?, F, I ND, 1]
Name: Data, dtype: object

In [None]:
df = df['Data'].str.split('_', expand=True)

In [None]:
df.columns = ['Year', 'Sex', 'Country' , 'No of Children']

In [None]:
df

Unnamed: 0,Year,Sex,Country,No of Children
0,1993?,M,IND,2
1,1997,F,IND,2
2,1993,F,IND,1
3,1996?,F,I ND,1


In [None]:
# '?' is a regex metacharacter, so it is escaped. The raw string stops Python
# from treating '\?' as an (invalid) string escape sequence.
# On a fresh top-to-bottom run this is True for rows 0 and 3 ('1993?', '1996?').
df['Year'].str.contains(r'\?')

0    False
1    False
2    False
3    False
Name: Year, dtype: bool

In [None]:
df['Country'].str.contains('I')

0    True
1    True
2    True
3    True
Name: Country, dtype: bool

In [None]:
# Check how to use str.strip()

In [None]:
# Remove the stray space inside 'I ND'. str.strip() would not help because it only
# trims leading/trailing whitespace. regex=False treats ' ' as a literal, so the
# result does not depend on the pandas version's default for `regex`.
df['Country'] = df['Country'].str.replace(' ', '', regex=False)

In [None]:
df['Country']

0    IND
1    IND
2    IND
3    IND
Name: Country, dtype: object

In [None]:
# Drop the '?' that follows a four-digit year. (?P<year>...) is a *named* capture
# group, read back with m.group('year'). regex=True is required: since pandas 2.0
# str.replace defaults to literal matching and rejects a callable replacement.
df['Year'] = df['Year'].str.replace(
    r'(?P<year>\d{4})\?',
    lambda m: m.group('year'),
    regex=True,
)


In [None]:
df['Year']

0    1993
1    1997
2    1993
3    1996
Name: Year, dtype: object