In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, MinMaxScaler, FunctionTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier

In [2]:
df = pd.read_csv('data/Fake.csv')
df.head()

Unnamed: 0,c1,c2,c3,c4,sex,degree,salary,age,target
0,-1.607711,-0.029398,1.569953,-0.527984,Male,MSc,10000.0,20.0,1
1,0.290131,0.317681,-0.996519,,Female,PhD,2000.0,24.0,0
2,-1.225096,0.895911,-0.900324,0.719861,Male,,,34.0,1
3,2.046861,-1.680595,1.923718,-1.427244,Female,PhD,30000.0,,1
4,,-0.289261,-1.049151,0.206271,Male,,25000.0,41.0,0


In [3]:
df.shape

(1000, 9)

In [4]:
df.columns

Index(['c1', 'c2', 'c3', 'c4', 'sex', 'degree', 'salary', 'age', 'target'], dtype='object')

In [5]:
df.isnull().sum()

c1         10
c2         11
c3         10
c4          9
sex        15
degree     18
salary    200
age       200
target      0
dtype: int64

# Train test Split

In [6]:
X = df.drop('target', axis=1)
y = df['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Pipeline

In [27]:
numeric_transformer = Pipeline(steps=[    
    ('imputer', SimpleImputer(strategy='mean')),    
    ('scaler', MinMaxScaler())])

categorical_transformer = Pipeline(steps=[    
    ('imputer', SimpleImputer(strategy='most_frequent')),    
    ('encoder', OneHotEncoder(drop='first'))])

ordinal_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OrdinalEncoder(categories=[['MSc', 'PhD']]))
])


preprocessor = ColumnTransformer(transformers=[    
    ('num', numeric_transformer, ['c1', 'c2', 'c3', 'c4', 'salary', 'age']),
    ('cat', categorical_transformer, ['sex']),
    ('ord', ordinal_transformer, ['degree'])
], remainder='passthrough')

In [28]:
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', RandomForestClassifier())])

In [29]:
pipeline.fit(X_train, y_train)

In [30]:
y_pred = pipeline.predict(X_test)

# Performance measure

In [31]:
from sklearn.metrics import classification_report
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.89      0.95      0.92       101
           1       0.95      0.88      0.91        99

    accuracy                           0.92       200
   macro avg       0.92      0.91      0.91       200
weighted avg       0.92      0.92      0.91       200



# See the output of each step of the pipeline

In [33]:
# X_train_transformed = pipeline.named_steps['preprocessor'].transform(X_train)
# transformed_df = pd.DataFrame(X_train_transformed, columns = df.columns)
# transformed_df.head(2)

# Compare the df and transformed df

In [34]:
X_train['sex'].value_counts()

Male      398
Female    389
Name: sex, dtype: int64

In [35]:
transformed_df['sex'].value_counts()

0.285714    164
1.000000    164
0.522803    161
0.000000    161
0.821429    150
Name: sex, dtype: int64

In [36]:
empty_index = X_train[X_train['degree'].isnull()].index
X_train.loc[empty_index]['degree'].head(5)

165    NaN
2      NaN
7      NaN
173    NaN
6      NaN
Name: degree, dtype: object

In [37]:
transformed_df.loc[empty_index]['degree'].head(5)

165    0.45283
2      0.00000
7      0.45283
173    0.45283
6      0.00000
Name: degree, dtype: float64

In [38]:
X_train.shape

(800, 8)

In [39]:
transformed_df.shape

(800, 9)

In [40]:
pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OrdinalEncoder(categories=[['MSc', 'PhD']]))
])

In [42]:
# df['degree'] = pipeline.fit_transform(df[['degree']])
# df['degree'] = df['degree'].astype(np.int64)
# df['degree'].value_counts()

In [52]:
column_names = ['c1', 'c2', 'c3', 'c4', 'salary', 'age', 'sex', 'degree']
X_train_transformed = preprocessor.fit_transform(X_test)
transformed_df = pd.DataFrame(X_train_transformed, columns = column_names)
transformed_df.head(2)

Unnamed: 0,c1,c2,c3,c4,salary,age,sex,degree
0,0.113016,0.653877,0.5796,0.520242,0.0,0.190476,0.0,1.0
1,0.39877,0.484845,0.623803,0.408856,0.542591,0.666667,0.0,1.0


In [53]:
transformed_df['degree'].nunique()

2