# ColumnTransformer

Applies transformers to columns of an array or pandas DataFrame. This estimator allows different columns or column subsets of the input to be transformed separately and the features generated by each transformer will be concatenated to form a single feature space.

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the Kaggle input directory.
for file_path in (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
):
    print(file_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the students-performance CSV into a DataFrame and display it.
csv_path = '/kaggle/input/students-performance-in-exams/StudentsPerformance.csv'
df = pd.read_csv(csv_path)
df

# Counting null values per column

In [None]:
# Number of missing values in each column (isna is the same method as isnull).
df.isna().sum()

# Separating the features (x) and the target variable (y)

In [None]:
# Split the frame: `y` is the label column, `x` is every other column.
y = df['test preparation course']
x = df.drop('test preparation course', axis=1)
# Show the features first, then the target, one after the other.
print(x, y, sep='\n')

In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# ColumnTransformer

In [None]:
# One-hot encode the column at position 0 and pass every other column through unchanged.
columnTransformer = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [0])],
    remainder='passthrough',
)
columnTransformer

In [None]:
# Fit the transformer on the features, apply it, and store the result as a
# NumPy array of strings.
# The builtin `str` is used as the dtype: the `np.str` alias was deprecated in
# NumPy 1.20 and removed in NumPy 1.24, where it raises AttributeError.
dataset = np.array(columnTransformer.fit_transform(x, y), dtype=str)
dataset

# Pros

* The ColumnTransformer helps perform different transformations on different columns of the data, within a Pipeline that is safe from data leakage and that can be parametrized.
* The make_column_transformer function is available to more easily create a ColumnTransformer object. Specifically, the names will be given automatically. 

# Cons

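* The drawbacks below apply to preprocessing the data separately, *outside* a Pipeline/ColumnTransformer, rather than to ColumnTransformer itself: incorporating statistics from test data into the preprocessors makes cross-validation scores unreliable (known as data leakage), for example in the case of scalers or imputing missing values.
* You may want to include the parameters of the preprocessors in a parameter search, which is only possible when the preprocessors are part of the estimator being searched (e.g. inside a Pipeline).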

# Reference

https://scikit-learn.org/stable/modules/generated/sklearn.compose.ColumnTransformer.html
https://towardsdatascience.com/using-columntransformer-to-combine-data-processing-steps-af383f7d5260