In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, MinMaxScaler, FunctionTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier

In [20]:
# Load the raw dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('data/Fake.csv')

In [21]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(1000, 9)

In [25]:
# Keep only the four feature columns and the label used for modelling,
# then preview the first six rows.
model_columns = ['sex', 'degree', 'salary', 'age', 'target']
df = df[model_columns]
df.head(6)

Unnamed: 0,sex,degree,salary,age,target
0,Male,MSc,10000.0,20.0,1
1,Female,PhD,2000.0,24.0,0
2,Male,,,34.0,1
3,Female,PhD,30000.0,,1
4,Male,,25000.0,41.0,0
5,Female,PhD,10000.0,20.0,1


In [26]:
# Number of missing values in each column.
df.isnull().sum()

sex        15
degree     18
salary    200
age       200
target      0
dtype: int64

# Train test Split

In [27]:
# Separate the label from the features, then hold out 20% of the rows for
# evaluation; the fixed seed keeps the split identical across runs.
y = df['target']
X = df.drop(columns='target')
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Pipeline

In [76]:
# Numeric features: replace missing values with the column mean, then rescale
# with MinMaxScaler.
numeric_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaling', MinMaxScaler()),
    ]
)

# Nominal feature: fill gaps with the most frequent category, then one-hot
# encode, dropping the first category's indicator column.
categorical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', OneHotEncoder(drop='first')),
    ]
)

# Ordered feature: fill gaps with the most frequent category, then encode
# using the explicit category order ('MSc' before 'PhD').
ordinal_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', OrdinalEncoder(categories=[['MSc', 'PhD']])),
    ]
)

# The order of the entries below determines the column order of the
# transformed output: here the encoded 'sex' column comes first, then
# 'degree', then 'salary' and 'age'. Listing 'num' first would instead put
# 'salary' and 'age' in the first two columns, so keep this order in sync with
# any column names assigned to the transformed data later on.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, ['sex']),
        ('ord', ordinal_transformer, ['degree']),
        ('num', numeric_transformer, ['salary', 'age']),
    ],
    remainder='passthrough',
)

In [77]:
# Chain preprocessing and the classifier so both are fitted together and the
# same transformations are applied at predict time. The forest is seeded (like
# the train/test split) so repeated runs produce the same model and metrics.
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', RandomForestClassifier(random_state=42)),
    ]
)

In [78]:
# Fit the preprocessing steps and the classifier on the training split only.
pipeline.fit(X_train, y_train)

In [79]:
# Predict labels for the held-out split with the fitted pipeline.
y_pred = pipeline.predict(X_test)

# Performance measure

In [80]:
from sklearn.metrics import classification_report

# Per-class precision, recall and F1 on the held-out split.
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.51      0.62      0.56       101
           1       0.51      0.39      0.44        99

    accuracy                           0.51       200
   macro avg       0.51      0.51      0.50       200
weighted avg       0.51      0.51      0.50       200



# See the output of each step of the pipeline

# Compare the df and transformed df

In [96]:
# Inspect what the classifier actually sees: push the training features through
# the preprocessor that `pipeline.fit` already fitted. Calling `transform`
# (rather than `fit_transform`) avoids re-fitting the object that is shared
# with `pipeline`.
fitted_preprocessor = pipeline.named_steps['preprocessor']
X_train_transformed = fitted_preprocessor.transform(X_train)

# One output column per input feature, in the order the transformers are listed
# in the ColumnTransformer ('sex' yields a single column because the one-hot
# encoder drops the first category).
column_names = ['sex', 'degree', 'salary', 'age']
assert X_train_transformed.shape[1] == len(column_names), (
    "preprocessor output width does not match the hand-written column names"
)

# Reuse X_train's index so each transformed row lines up with its source row
# in the side-by-side printout below.
transformed_df = pd.DataFrame(
    X_train_transformed, columns=column_names, index=X_train.index
)
print(transformed_df.head(5))
print("===================================")
print(X_train.head(5))

   sex  degree    salary       age
0  0.0     1.0  0.821429  1.000000
1  0.0     1.0  0.285714  0.000000
2  0.0     1.0  0.285714  0.000000
3  0.0     1.0  0.522803  0.666667
4  1.0     0.0  0.000000  0.190476
        sex degree   salary   age
29   Female    PhD  25000.0  41.0
535  Female    PhD  10000.0  20.0
695  Female    PhD  10000.0  20.0
557  Female    PhD      NaN  34.0
836    Male    MSc   2000.0  24.0


## Transformed sex and degree columns

In [86]:
# Value counts of the encoded 'sex' column.
transformed_df['sex'].value_counts()

1.0    411
0.0    389
Name: sex, dtype: int64

In [87]:
# Value counts of the encoded 'degree' column.
transformed_df['degree'].value_counts()

0.0    415
1.0    385
Name: degree, dtype: int64

## Original sex and degree column

In [88]:
# Value counts of the raw 'sex' column; value_counts leaves out missing values by default.
X_train['sex'].value_counts()

Male      398
Female    389
Name: sex, dtype: int64

In [89]:
# Value counts of the raw 'degree' column; value_counts leaves out missing values by default.
X_train['degree'].value_counts()

MSc    398
PhD    385
Name: degree, dtype: int64

## Salary column of original and transformed data

In [84]:
# Summary statistics of 'salary' after imputation and min-max scaling.
transformed_df['salary'].describe()

count    800.000000
mean       0.522803
std        0.360687
min        0.000000
25%        0.285714
50%        0.522803
75%        0.821429
max        1.000000
Name: salary, dtype: float64

In [90]:
# Summary statistics of the raw 'salary' column; describe() excludes missing values.
X_train['salary'].describe()

count      639.000000
mean     16638.497653
std      11301.891328
min       2000.000000
25%       2000.000000
50%      10000.000000
75%      30000.000000
max      30000.000000
Name: salary, dtype: float64