<a href="https://colab.research.google.com/github/satyambhatt5/Deep_Learning-/blob/main/How_to_convert_Categorical_Missing_data_data_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#Use "fit_transform" on training data, but "transform" (only) on testing/new data

import pandas as pd

# Toy dataset with three categorical features.
# "Shape" is unordered, "Class" and "Size" are ordered
X = pd.DataFrame(dict(
    Shape=['square', 'square', 'oval', 'circle'],
    Class=['third', 'first', 'second', 'third'],
    Size=['S', 'S', 'L', 'XL'],
))

X


In [None]:
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
# One-hot encode the unordered "Shape" feature and return a dense array.
# left-to-right column order is alphabetical (circle, oval, square)
# scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed `sparse`,
# so try the new keyword first and fall back to the old one on older installs.
try:
    ohe = OneHotEncoder(sparse_output=False)
except TypeError:  # scikit-learn < 1.2 does not accept `sparse_output`
    ohe = OneHotEncoder(sparse=False)

ohe.fit_transform(X[['Shape']])

In [None]:
# category ordering (within each feature) is defined by you:
# one list per encoded column, in the same order as the columns passed below
oe = OrdinalEncoder(
    categories=[
        ['first', 'second', 'third'],  # order for "Class"
        ['S', 'M', 'L', 'XL'],         # order for "Size"
    ],
)
oe.fit_transform(X[['Class', 'Size']])


In [None]:
#Q: For a one-hot encoded feature, what can you do if new data contains categories that weren't seen during training?

#A: Set handle_unknown='ignore' to encode new categories as all zeros.

#See example 👇

#P.S. If you know all possible categories that might ever appear, you can instead specify the categories manually. handle_unknown='ignore' is useful specifically when you don't know all possible categories.

In [None]:
import pandas as pd
# training data: categories A, B and C
X = pd.DataFrame(dict(col=['A', 'B', 'C', 'B']))
# new data: contains category D, which is absent from the training data
X_new = pd.DataFrame(dict(col=['A', 'C', 'D']))

In [None]:
from sklearn.preprocessing import OneHotEncoder
# Dense output; handle_unknown='ignore' encodes categories unseen during fit as all zeros.
# scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed `sparse`,
# so try the new keyword first and fall back to the old one on older installs.
try:
    ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:  # scikit-learn < 1.2 does not accept `sparse_output`
    ohe = OneHotEncoder(sparse=False, handle_unknown='ignore')

In [None]:
# display the training data
X

In [None]:
# learn the categories from the training data and encode it in one step;
# three columns represent categories A, B, and C
ohe.fit_transform(X[['col']])

In [None]:
# display the new data
X_new

In [None]:
# "transform" only: the encoder is not refit on the new data;
# category D is encoded as all zeros
ohe.transform(X_new[['col']])

In [None]:
#When imputing missing values, you can preserve info about which values were missing and use THAT as a feature!

#Why? Sometimes there's a relationship between "missingness" and the target/label you are trying to predict.

import pandas as pd
import numpy as np

In [None]:
# single numeric feature with one missing value (np.nan)
X = pd.DataFrame(dict(Age=[20, 30, 10, np.nan, 10]))


In [None]:
# display the data before imputation
X


In [None]:
from sklearn.impute import SimpleImputer
# impute the mean ('mean' is the default strategy, spelled out here)
imputer = SimpleImputer(strategy='mean')
imputer.fit_transform(X)

In [None]:
# impute the mean ('mean' is the default strategy, spelled out here)
# and add an indicator matrix (new in 0.21)
imputer = SimpleImputer(strategy='mean', add_indicator=True)
imputer.fit_transform(X)

In [None]:
#make pipeline 


import pandas as pd
import numpy as np
# training data: two numeric features (feat1 has a missing value) plus the label
train = pd.DataFrame(dict(
    feat1=[10, 20, np.nan, 2],
    feat2=[25., 20, 5, 3],
    label=['A', 'A', 'B', 'B'],
))
# new/test data: the same two features (feat2 has a missing value), no label
test = pd.DataFrame(dict(
    feat1=[30., 5, 15],
    feat2=[12, 10, np.nan],
))
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
# building blocks: mean imputation (the default strategy, spelled out) and a classifier
imputer = SimpleImputer(strategy='mean')
clf = LogisticRegression()

# 2-step pipeline: impute missing values, then pass the results to the classifier
pipe = make_pipeline(imputer, clf)

# display the training data
train

In [None]:
features = ['feat1', 'feat2']
X = train[features]
y = train['label']
X_new = test[features]

# fit: the pipeline applies the imputer to X before fitting the classifier
# predict: the pipeline applies the imputer to X_new before making predictions
# note: pipeline uses imputation values learned during the "fit" step
pipe.fit(X, y).predict(X_new)

In [None]:
import pandas as pd
# Load the first 6 rows of the CSV behind the short link.
# Fetched over HTTPS rather than plain HTTP so the download cannot be altered in transit.
df = pd.read_csv('https://bit.ly/kaggletrain', nrows=6)
# keep only the columns used in the imputation examples that follow
cols = ['SibSp', 'Fare', 'Age']
X = df[cols]
X

In [None]:
# new in 0.21, and still "experimental" so it must be enabled explicitly
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer


In [None]:
# create the (experimental) iterative imputer with its default settings,
# then fit it on X and return the imputed data
impute_it = IterativeImputer()
impute_it.fit_transform(X)

In [None]:
# new in 0.22
from sklearn.impute import KNNImputer
# KNN-based imputer configured with n_neighbors=2;
# fit it on X and return the imputed data
impute_knn = KNNImputer(n_neighbors=2)
impute_knn.fit_transform(X)

In [None]:
# Pipeline requires naming of steps, 
#make_pipeline does not.

import pandas as pd
# Load the first 6 rows of the CSV behind the short link.
# Fetched over HTTPS rather than plain HTTP so the download cannot be altered in transit.
df = pd.read_csv('https://bit.ly/kaggletrain', nrows=6)
# columns fed to the column transformer built in the following cells
cols = ['Embarked', 'Sex', 'Age', 'Fare']
X = df[cols]
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
# unfitted building blocks, combined into pipelines in the cells below
ohe = OneHotEncoder()                 # applied to 'Embarked' and 'Sex'
imp = SimpleImputer(strategy='mean')  # applied to 'Age'; 'mean' is the default, spelled out
clf = LogisticRegression()            # final step of the pipeline


In [None]:
#make_pipeline 
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
# apply ohe to 'Embarked' and 'Sex', apply imp to 'Age', and pass every
# remaining column through unchanged (remainder='passthrough')
ct = make_column_transformer(
    (ohe, ['Embarked', 'Sex']),
    (imp, ['Age']),
    remainder='passthrough')
# make_pipeline does not require naming the steps
pipe = make_pipeline(ct, clf)
# display the pipeline
pipe


In [None]:
# Pipeline (steps are named explicitly)

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
# each transformer is given as an explicit (name, transformer, columns) tuple;
# columns not listed are passed through unchanged (remainder='passthrough')
ct = ColumnTransformer(
    [('encoder', ohe, ['Embarked', 'Sex']),
     ('imputer', imp, ['Age'])],
    remainder='passthrough')
# Pipeline takes explicit (name, estimator) steps
pipe = Pipeline([('preprocessor', ct), ('classifier', clf)])
# display the pipeline
pipe

Important link: [scikit-learn tips (nbviewer)](https://nbviewer.jupyter.org/github/justmarkham/scikit-learn-tips)