###Handling of Missing values in a dataset 

In [39]:
import numpy as np
import pandas as pd

In [40]:
my_mat = pd.DataFrame(
    np.array([5,7,8, np.NaN, np.NaN, np.NaN, -5,
              11,24,19,12,-14, np.NaN, 17, np.NaN,np.NaN,np.NaN,np.NaN,12,13,14])\
              .reshape((7,3)))
my_mat.columns = ['feature1', 'feature2', 'feature3']

In [41]:
my_mat

Unnamed: 0,feature1,feature2,feature3
0,5.0,7.0,8.0
1,,,
2,-5.0,11.0,24.0
3,19.0,12.0,-14.0
4,,17.0,
5,,,
6,12.0,13.0,14.0


In [42]:
my_mat.dropna(axis=0, thresh=1, inplace=True)

In [43]:
my_mat

Unnamed: 0,feature1,feature2,feature3
0,5.0,7.0,8.0
2,-5.0,11.0,24.0
3,19.0,12.0,-14.0
4,,17.0,
6,12.0,13.0,14.0


In [44]:
my_mat.reset_index(inplace=True)

In [45]:
my_mat

Unnamed: 0,index,feature1,feature2,feature3
0,0,5.0,7.0,8.0
1,2,-5.0,11.0,24.0
2,3,19.0,12.0,-14.0
3,4,,17.0,
4,6,12.0,13.0,14.0


In [46]:
my_mat.drop(['index'], axis=1, inplace=True)

In [47]:
my_mat

Unnamed: 0,feature1,feature2,feature3
0,5.0,7.0,8.0
1,-5.0,11.0,24.0
2,19.0,12.0,-14.0
3,,17.0,
4,12.0,13.0,14.0


In [48]:
from sklearn.impute import MissingIndicator
indicator = MissingIndicator(missing_values=np.NaN)
indicator = indicator.fit_transform(my_mat)
indicator = pd.DataFrame(indicator)

In [49]:
indicator

Unnamed: 0,0,1
0,False,False
1,False,False
2,False,False
3,True,True
4,False,False


In [50]:
my_mat.isnull().values.any()#for entire dataframe (Status:True/False)

True

In [51]:
check_nan_in_df = my_mat.isnull()
print (check_nan_in_df)

   feature1  feature2  feature3
0     False     False     False
1     False     False     False
2     False     False     False
3      True     False      True
4     False     False     False


For filling up missing values with common strategies, sklearn provides a SimpleImputer. The four main strategies are mean, median, most_frequent, and constant. In the example below we impute missing values for our dataframe my_mat with the feature’s mean.

In [52]:
from sklearn.impute import SimpleImputer
imp = SimpleImputer(missing_values=np.NaN, strategy='mean')
zc=imp.fit_transform(my_mat)
print(zc)

[[  5.     7.     8.  ]
 [ -5.    11.    24.  ]
 [ 19.    12.   -14.  ]
 [  7.75  17.     8.  ]
 [ 12.    13.    14.  ]]


In [53]:
(5-5+19+12)/4

7.75

In [54]:
(8+24-14+14)/4

8.0

In [None]:
from sklearn.impute import SimpleImputer
imp = SimpleImputer(missing_values=np.NaN, strategy='median')
z=imp.fit_transform(my_mat)
# http://www.alcula.com/calculators/statistics/median/

In [None]:
z

array([[  5. ,   7. ,   8. ],
       [ -5. ,  11. ,  24. ],
       [ 19. ,  12. , -14. ],
       [  8.5,  17. ,  11. ],
       [ 12. ,  13. ,  14. ]])

In [None]:
df_imp = pd.DataFrame(z, columns = ['f1','f2','f3'])

In [None]:
df_imp

Unnamed: 0,f1,f2,f3
0,5.0,7.0,8.0
1,-5.0,11.0,24.0
2,19.0,12.0,-14.0
3,8.5,17.0,11.0
4,12.0,13.0,14.0


##Polynomial features

In [55]:
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(degree=2, interaction_only=True)
polynomials = pd.DataFrame(poly.fit_transform(z))
print(polynomials)

     0     1     2     3      4      5      6
0  1.0   5.0   7.0   8.0   35.0   40.0   56.0
1  1.0  -5.0  11.0  24.0  -55.0 -120.0  264.0
2  1.0  19.0  12.0 -14.0  228.0 -266.0 -168.0
3  1.0   8.5  17.0  11.0  144.5   93.5  187.0
4  1.0  12.0  13.0  14.0  156.0  168.0  182.0


In [None]:
Total_mat_poly = pd.concat([df_imp, polynomials], axis=1)

In [None]:
Total_mat_poly

Unnamed: 0,f1,f2,f3,0,1,2,3,4,5,6
0,5.0,7.0,8.0,1.0,5.0,7.0,8.0,35.0,40.0,56.0
1,-5.0,11.0,24.0,1.0,-5.0,11.0,24.0,-55.0,-120.0,264.0
2,19.0,12.0,-14.0,1.0,19.0,12.0,-14.0,228.0,-266.0,-168.0
3,8.5,17.0,11.0,1.0,8.5,17.0,11.0,144.5,93.5,187.0
4,12.0,13.0,14.0,1.0,12.0,13.0,14.0,156.0,168.0,182.0


Another example

In [None]:
import numpy as np
from sklearn.preprocessing import PolynomialFeatures
X = np.arange(6).reshape(3, 2)
X

array([[0, 1],
       [2, 3],
       [4, 5]])

In [None]:
poly = PolynomialFeatures(2)
poly.fit_transform(X)

array([[ 1.,  0.,  1.,  0.,  0.,  1.],
       [ 1.,  2.,  3.,  4.,  6.,  9.],
       [ 1.,  4.,  5., 16., 20., 25.]])

In [None]:
poly = PolynomialFeatures(interaction_only=True)
poly.fit_transform(X)

array([[ 1.,  0.,  1.,  0.],
       [ 1.,  2.,  3.,  6.],
       [ 1.,  4.,  5., 20.]])

###Handling Categorical data

In [None]:
import pandas as pd
import numpy as np

In [None]:
X = pd.DataFrame(
    np.array(['M', 'O-', 'medium',
             'M', 'O-', 'high',
              'F', 'O+', 'high',
              'F', 'AB', 'low',
              'F', 'B+', 'low'])
              .reshape((5,3)))

In [None]:
X

Unnamed: 0,0,1,2
0,M,O-,medium
1,M,O-,high
2,F,O+,high
3,F,AB,low
4,F,B+,low


In [None]:
X.columns = ['gender', 'blood_group', 'inc_level']

In [None]:
X

Unnamed: 0,gender,blood_group,inc_level
0,M,O-,medium
1,M,O-,high
2,F,O+,high
3,F,AB,low
4,F,B+,low


In [None]:
from sklearn.preprocessing import OrdinalEncoder
encoder = OrdinalEncoder()
X.inc_level = encoder.fit_transform(X.inc_level.values.reshape(-1, 1))

In [None]:
X

Unnamed: 0,gender,blood_group,inc_level
0,M,O-,2.0
1,M,O-,0.0
2,F,O+,0.0
3,F,AB,1.0
4,F,B+,1.0


In [None]:
from sklearn.preprocessing import OneHotEncoder
onehot = OneHotEncoder(dtype=np.int, sparse=True)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  onehot = OneHotEncoder(dtype=np.int, sparse=True)


In [None]:
nominals = pd.DataFrame(
    onehot.fit_transform(X[['gender', 'blood_group']])\
    .toarray(),
    columns=['F', 'M', 'AB', 'B+','O+', 'O-'])



In [None]:
nominals['inc_level'] = X.inc_level

In [None]:
nominals

Unnamed: 0,F,M,AB,B+,O+,O-,inc_level
0,0,1,0,0,0,1,2.0
1,0,1,0,0,0,1,0.0
2,1,0,0,0,1,0,0.0
3,1,0,1,0,0,0,1.0
4,1,0,0,1,0,0,1.0


## let's experiment on a dataset 

In [None]:
# Import the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Import the dataset 
dataset = pd.read_csv('data.csv')
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, 3].values

FileNotFoundError: ignored

In [None]:
dataset

In [None]:
X

In [None]:
y

In [None]:
# Taking care of missing data
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.NaN, strategy='mean')

imputer=imputer.fit(X[:, 1:3])
X[:, 1:3] = imputer.transform(X[:, 1:3])

In [None]:
X

In [None]:
# Encoding categorical data
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [None]:
labelencoder_X = LabelEncoder()
X[:,0] = labelencoder_X.fit_transform(X[:,0])

In [None]:
X

In [None]:
labelencoder_y = LabelEncoder()
y = labelencoder_y.fit_transform(y)

In [None]:
y

In [None]:
# Splitting the dataset into the training set and test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y,\
                                                    test_size=0.2,random_state=42)

In [None]:
X_train

In [None]:
X_test

In [None]:
# Feature scaling
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)

In [None]:
X_train

In [None]:
X_test