**Data Preprocessing**

In [None]:
import numpy as np
import pandas as pd

In [None]:
# Numbers stored as text: the Series holds the strings '10', '20', '30', not integers
s = pd.Series(data=['10', '20', '30'])
s

0    10
1    20
2    30
dtype: object

In [None]:
# Cast the string values to integers; astype returns a new Series and leaves `s` as it was
s.astype(dtype=int)

0    10
1    20
2    30
dtype: int64

In [None]:
# Load data
# Each record is [country, age, salary]; np.nan marks a missing entry
data = [
    ['India', np.nan, 68000.0],
    ['France', 43.0, 45000.0],
    [np.nan, 30.0, 54000.0],
    ['France', 48.0, 65000.0],
    ['Germany', 40.0, np.nan],
    ['India', 35.0, 58000.0],
    ['Germany', np.nan, 53000.0],
    ['France', 49.0, 79000.0],
    ['India', 50.0, 88000.0],
    [np.nan, 37.0, np.nan],
]


In [None]:
# Echo the raw nested list to confirm it was built as intended
data

[['India', nan, 68000.0],
 ['France', 43.0, 45000.0],
 [nan, 30.0, 54000.0],
 ['France', 48.0, 65000.0],
 ['Germany', 40.0, nan],
 ['India', 35.0, 58000.0],
 ['Germany', nan, 53000.0],
 ['France', 49.0, 79000.0],
 ['India', 50.0, 88000.0],
 [nan, 37.0, nan]]

In [None]:
# Wrap the nested list in a DataFrame; no column names are supplied, so the columns are labelled 0, 1, 2
df = pd.DataFrame(data=data)
df

Unnamed: 0,0,1,2
0,India,,68000.0
1,France,43.0,45000.0
2,,30.0,54000.0
3,France,48.0,65000.0
4,Germany,40.0,
5,India,35.0,58000.0
6,Germany,,53000.0
7,France,49.0,79000.0
8,India,50.0,88000.0
9,,37.0,


In [None]:
# Preview the first five records
df.head(n=5)

Unnamed: 0,0,1,2
0,India,,68000.0
1,France,43.0,45000.0
2,,30.0,54000.0
3,France,48.0,65000.0
4,Germany,40.0,


In [None]:
# Write the DataFrame to disk in CSV format.
# NOTE(review): the target path has no '.csv' extension, and the row index is written
# as an extra first column because `index` is left at its default — confirm both are intended.
df.to_csv(path_or_buf='csv_file_conversion')

In [None]:
# Count the missing values in each column
df.isnull().sum()

0    2
1    2
2    2
dtype: int64

# Ways to handle missing data

In [None]:
# Replace every missing entry, in every column, with the placeholder string 'missing'.
# The filled frame is only displayed; `df` itself is not modified.
df.fillna(value='missing')

Unnamed: 0,0,1,2
0,India,missing,68000.0
1,France,43.0,45000.0
2,missing,30.0,54000.0
3,France,48.0,65000.0
4,Germany,40.0,missing
5,India,35.0,58000.0
6,Germany,missing,53000.0
7,France,49.0,79000.0
8,India,50.0,88000.0
9,missing,37.0,missing


In [None]:
# Fill missing entries in the first column only, using the label 'unknown'
df.iloc[:, 0].fillna(value='unknown')

0      India
1     France
2    unknown
3     France
4    Germany
5      India
6    Germany
7     France
8      India
9    unknown
Name: 0, dtype: object

In [None]:
# ffill/pad --> forward filling: each NaN takes the nearest valid value above it.
# Series.ffill() is used because the `method` argument of fillna is deprecated in recent pandas.
df.iloc[:, 0].ffill()

0      India
1     France
2     France
3     France
4    Germany
5      India
6    Germany
7     France
8      India
9      India
Name: 0, dtype: object

In [None]:
# Forward-fill the second column. A NaN in the very first row has no earlier value to copy,
# so it stays NaN. (.ffill() replaces the deprecated fillna(method='ffill').)
df.iloc[:, 1].ffill()

0     NaN
1    43.0
2    30.0
3    48.0
4    40.0
5    35.0
6    35.0
7    49.0
8    50.0
9    37.0
Name: 1, dtype: float64

In [None]:
# 'pad' was an alias of 'ffill' in fillna(method=...), so the result equals a plain forward fill.
# The `method` argument is deprecated in recent pandas; .ffill() is the supported spelling.
df.iloc[:, 1].ffill()

0     NaN
1    43.0
2    30.0
3    48.0
4    40.0
5    35.0
6    35.0
7    49.0
8    50.0
9    37.0
Name: 1, dtype: float64

In [None]:
# bfill/backfill --> backward filling: each NaN takes the nearest valid value below it.
# Series.bfill() is used because the `method` argument of fillna is deprecated in recent pandas.
df.iloc[:, 1].bfill()

0    43.0
1    43.0
2    30.0
3    48.0
4    40.0
5    35.0
6    49.0
7    49.0
8    50.0
9    37.0
Name: 1, dtype: float64

In [None]:
# Backward-fill the third column. A NaN in the last row has no later value to copy,
# so it stays NaN. (.bfill() replaces the deprecated fillna(method='bfill').)
df.iloc[:, 2].bfill()

0    68000.0
1    45000.0
2    54000.0
3    65000.0
4    58000.0
5    58000.0
6    53000.0
7    79000.0
8    88000.0
9        NaN
Name: 2, dtype: float64

In [None]:
# 'backfill' was an alias of 'bfill' in fillna(method=...), so the result equals a plain backward fill.
# The `method` argument is deprecated in recent pandas; .bfill() is the supported spelling.
df.iloc[:, 1].bfill()

0    43.0
1    43.0
2    30.0
3    48.0
4    40.0
5    35.0
6    49.0
7    49.0
8    50.0
9    37.0
Name: 1, dtype: float64

In [None]:
# Backward-fill the first column; a trailing NaN in the last row cannot be filled and remains.
# (.bfill() replaces the deprecated fillna(method='backfill').)
df.iloc[:, 0].bfill()

0      India
1     France
2     France
3     France
4    Germany
5      India
6    Germany
7     France
8      India
9        NaN
Name: 0, dtype: object

In [None]:
# Frequency of each value in the first column; missing entries are not counted
df.iloc[:, 0].value_counts(dropna=True)

India      3
France     3
Germany    2
Name: 0, dtype: int64

In [None]:
# Most frequent value(s) of the first column; a tie yields more than one row
df.iloc[:, 0].mode(dropna=True)

0    France
1     India
Name: 0, dtype: object

In [None]:
# Fill missing entries of the first column with 'India'.
# NOTE(review): the mode is a tie between 'France' and 'India'; 'India' is a manual choice.
df.iloc[:, 0].fillna(value='India')

0      India
1     France
2      India
3     France
4    Germany
5      India
6    Germany
7     France
8      India
9      India
Name: 0, dtype: object

## SimpleImputer

In [None]:
# Handle missing data by replacing each NaN with the mean of its column.
# 'mean' is the imputer's default strategy; it is spelled out here for the reader.
from sklearn.impute import SimpleImputer
si = SimpleImputer(strategy='mean')


In [None]:
# Keep only the numeric columns: the mean-based imputer cannot process text columns.
# Selecting with include='number' (rather than excluding 'object') also leaves out any other
# non-numeric dtype, such as a dedicated string, boolean or datetime column.
ndf = df.select_dtypes(include='number')
ndf

Unnamed: 0,1,2
0,,68000.0
1,43.0,45000.0
2,30.0,54000.0
3,48.0,65000.0
4,40.0,
5,35.0,58000.0
6,,53000.0
7,49.0,79000.0
8,50.0,88000.0
9,37.0,


In [None]:
# Learn each column's mean and return an array with the NaNs replaced, in a single call
si.fit_transform(ndf)

array([[4.150e+01, 6.800e+04],
       [4.300e+01, 4.500e+04],
       [3.000e+01, 5.400e+04],
       [4.800e+01, 6.500e+04],
       [4.000e+01, 6.375e+04],
       [3.500e+01, 5.800e+04],
       [4.150e+01, 5.300e+04],
       [4.900e+01, 7.900e+04],
       [5.000e+01, 8.800e+04],
       [3.700e+01, 6.375e+04]])

In [None]:
# Column means as computed by pandas, for comparison with the values the imputer filled in
ndf.mean(axis=0)

1       41.5
2    63750.0
dtype: float64

In [None]:
# Placeholder value the imputer treats as "missing"
si.missing_values

nan

In [None]:
# Number of feature columns the imputer saw during fit
si.n_features_in_

2

In [None]:
# Imputation strategy the imputer is configured with
si.strategy

'mean'

In [None]:
# Display ndf again: it still contains NaN, because transform returned a new array rather than modifying ndf
ndf

Unnamed: 0,1,2
0,,68000.0
1,43.0,45000.0
2,30.0,54000.0
3,48.0,65000.0
4,40.0,
5,35.0,58000.0
6,,53000.0
7,49.0,79000.0
8,50.0,88000.0
9,37.0,
