# Data Preprocessing - Part 1

Hi Guys, Welcome to [Tirendaz Academy](https://youtube.com/c/tirendazacademy) 😀
<br>
In this notebook, I'm going to talk about data preprocessing.
<br>
Happy learning 🐱‍🏍 

## Handling Missing Data

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Toy 3x4 matrix; np.nan marks the two missing entries
# (row 1 / last column and row 2 / third column).
data = np.array(
    [
        [1, 2, 3, 4],
        [5, 6, 7, np.nan],
        [9, 10, np.nan, 11],
    ]
)

In [3]:
# Wrap the raw matrix in a DataFrame with one named column per feature.
df = pd.DataFrame(data=data, columns=list("ABCD"))

In [4]:
df

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,7.0,
2,9.0,10.0,,11.0


In [5]:
# Boolean mask: True wherever a value is missing (isna is the canonical name of isnull).
df.isna()

Unnamed: 0,A,B,C,D
0,False,False,False,False
1,False,False,False,True
2,False,False,True,False


In [6]:
# Number of missing values in each column.
df.isna().sum()

A    0
B    0
C    1
D    1
dtype: int64

In [7]:
# Drop every row that contains at least one missing value.
df.dropna(axis="index")

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0


In [8]:
# Drop every column that contains at least one missing value.
df.dropna(axis="columns")

Unnamed: 0,A,B
0,1.0,2.0
1,5.0,6.0
2,9.0,10.0


In [9]:
# Drop only rows in which *all* values are missing; no such row exists here,
# so the frame comes back unchanged.
df.dropna(axis="index", how="all")

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,7.0,
2,9.0,10.0,,11.0


In [10]:
# Keep only rows that have at least 4 non-missing values.
df.dropna(axis="index", thresh=4)

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0


In [11]:
# Drop rows only when the value in column "D" is missing.
df.dropna(axis="index", subset=["D"])

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
2,9.0,10.0,,11.0


In [12]:
from sklearn.impute import SimpleImputer

In [13]:
# Imputer that replaces each NaN with the mean of its column.
imp_mean = SimpleImputer(strategy="mean", missing_values=np.nan)

In [14]:
# Learn the per-column statistics from the data.
imp_mean = imp_mean.fit(df.to_numpy())

In [15]:
# Fill the missing entries using the statistics learned by the imputer.
imputed_data = imp_mean.transform(df.to_numpy())

In [16]:
imputed_data

array([[ 1. ,  2. ,  3. ,  4. ],
       [ 5. ,  6. ,  7. ,  7.5],
       [ 9. , 10. ,  5. , 11. ]])

In [17]:
# pandas-only alternative: fill each column's NaNs with that column's mean.
df.fillna(value=df.mean())

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,7.0,7.5
2,9.0,10.0,5.0,11.0


## Handling Categorical Data

In [17]:
# Small example frame: two string features (color, size), one numeric
# feature (age) and a string class label.
df = pd.DataFrame(
    data=[
        ["red", "L", 5.0, "class1"],
        ["blue", "XL", 2.0, "class2"],
        ["black", "M", 3.0, "class1"],
    ],
    columns=["color", "size", "age", "classlabel"],
)
df

Unnamed: 0,color,size,age,classlabel
0,red,L,5.0,class1
1,blue,XL,2.0,class2
2,black,M,3.0,class1


In [18]:
df.dtypes

color          object
size           object
age           float64
classlabel     object
dtype: object

In [19]:
# Ordinal encoding for the "size" feature: M < L < XL.
size_mapping = {"XL": 3, "L": 2, "M": 1}
# Look each value up in size_mapping but fall back to the value itself when it
# is not a key. A plain .map(size_mapping) turns every non-key into NaN, so
# re-running this cell on the already-encoded column would wipe it out; with
# the fallback a second run is a no-op.
df["size"] = df["size"].map(lambda size: size_mapping.get(size, size))
df

Unnamed: 0,color,size,age,classlabel
0,red,2,5.0,class1
1,blue,3,2.0,class2
2,black,1,3.0,class1


In [21]:
# Give every distinct class label an integer code, numbered in the sorted
# order returned by np.unique.
unique_labels = np.unique(df["classlabel"])
class_mapping = dict(zip(unique_labels, range(len(unique_labels))))

In [22]:
class_mapping

{'class1': 0, 'class2': 1}

In [23]:
# Replace each class label with its integer code from class_mapping. Values
# that are not keys of class_mapping are kept as they are, so re-running this
# cell after the column has already been encoded is a no-op rather than
# converting the whole column to NaN.
df["classlabel"] = df["classlabel"].map(lambda label: class_mapping.get(label, label))
df

Unnamed: 0,color,size,age,classlabel
0,red,2,5.0,0
1,blue,3,2.0,1
2,black,1,3.0,0


In [24]:
from sklearn.preprocessing import LabelEncoder
# Same label encoding done with scikit-learn instead of a hand-written mapping.
# NOTE(review): "classlabel" was already mapped to integers via class_mapping
# in the previous step, so the encoder is applied to integer labels here.
class_le = LabelEncoder()
y = class_le.fit_transform(df["classlabel"].values)
y

array([0, 1, 0], dtype=int64)

In [26]:
# Feature matrix as a NumPy array; the columns have mixed types, so the
# resulting array has dtype=object.
X = df[["color", "size", "age"]].to_numpy()

In [27]:
# Encode the color column (column 0 of X) as integer codes and write the
# result back into X in place; the displayed X shows red -> 2, blue -> 1,
# black -> 0.
# NOTE(review): these codes imply an order the colors do not have, which is
# presumably why one-hot encoding is shown next.
color_le = LabelEncoder()
X[:,0] = color_le.fit_transform(X[:,0])

In [28]:
X

array([[2, 2, 5.0],
       [1, 3, 2.0],
       [0, 1, 3.0]], dtype=object)

In [30]:
from sklearn.preprocessing import OneHotEncoder

In [31]:
# One-hot encoder with default settings for the color column.
color_ohe = OneHotEncoder()

In [33]:
# Color column as a 2-D array with a single column (one row per sample).
X[:, [0]]

array([[2],
       [1],
       [0]], dtype=object)

In [34]:
# One-hot encode the color column and densify the result for display.
color_ohe.fit_transform(X[:, [0]]).toarray()

array([[0., 0., 1.],
       [0., 1., 0.],
       [1., 0., 0.]])

In [35]:
from sklearn.compose import ColumnTransformer

In [36]:
# One-hot encode column 0 (color) and forward columns 1 and 2 (size, age)
# unchanged.
c_transf = ColumnTransformer(
    transformers=[
        ("onehot", OneHotEncoder(), [0]),
        ("nothing", "passthrough", [1, 2]),
    ]
)

In [37]:
X

array([[2, 2, 5.0],
       [1, 3, 2.0],
       [0, 1, 3.0]], dtype=object)

In [38]:
# Fit the transformer and encode X. X has dtype=object, so the result is cast
# to float to get a plain numeric array: three one-hot color columns followed
# by the untouched size and age columns.
c_transf.fit_transform(X).astype(float)

array([[0., 0., 1., 2., 5.],
       [0., 1., 0., 3., 2.],
       [1., 0., 0., 1., 3.]])

In [39]:
# pandas one-hot encoding: only the string column ("color") is expanded into
# dummy columns; "size" and "age" are passed through.
# dtype=int pins the dummy columns to 0/1 integers. Since pandas 2.0 the
# default dummy dtype is bool, which would display True/False instead.
pd.get_dummies(df[["color", "size", "age"]], dtype=int)

Unnamed: 0,size,age,color_black,color_blue,color_red
0,2,5.0,0,0,1
1,3,2.0,0,1,0
2,1,3.0,1,0,0


In [41]:
# Same encoding, but drop the first dummy level (color_black) so the remaining
# indicator columns are not perfectly collinear.
# dtype=int pins the dummy columns to 0/1 integers. Since pandas 2.0 the
# default dummy dtype is bool, which would display True/False instead.
pd.get_dummies(df[["color", "size", "age"]], drop_first=True, dtype=int)

Unnamed: 0,size,age,color_blue,color_red
0,2,5.0,0,1
1,3,2.0,1,0
2,1,3.0,0,0


In [44]:
# scikit-learn version of dropping the first dummy column: one-hot encode
# column 0 with drop="first" and forward columns 1 and 2 unchanged.
color_ohe = OneHotEncoder(categories="auto", drop="first")
c_transf = ColumnTransformer(
    transformers=[
        ("onehot", color_ohe, [0]),
        ("nothing", "passthrough", [1, 2]),
    ]
)
c_transf.fit_transform(X).astype(float)

array([[0., 1., 2., 5.],
       [1., 0., 3., 2.],
       [0., 0., 1., 3.]])

Don't forget to follow us on [YouTube](http://youtube.com/tirendazacademy) | [Medium](http://tirendazacademy.medium.com) | [Twitter](http://twitter.com/tirendazacademy) | [GitHub](http://github.com/tirendazacademy) | [LinkedIn](https://www.linkedin.com/in/tirendaz-academy) | [Kaggle](https://www.kaggle.com/tirendazacademy) 😎