In [36]:
# Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from sklearn.preprocessing import MinMaxScaler, StandardScaler,OrdinalEncoder,LabelEncoder,OneHotEncoder, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin 

# Reusable, not-yet-fitted preprocessing objects.
scaler_minmax = MinMaxScaler()        # min-max feature scaler
scaler_standered = StandardScaler()   # standardising feature scaler
# Ordinal encoder: categories that were not seen during fit are encoded as -1.
oe = OrdinalEncoder(
    unknown_value=-1,
    handle_unknown='use_encoded_value',
)
le = LabelEncoder()                   # label encoder
# One-hot encoder that returns a dense array and ignores categories that were
# not seen during fit instead of raising.
ohe = OneHotEncoder(
    handle_unknown='ignore',
    sparse_output=False,
)
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from helper_function import extract_year_from_date_record_column

# Create synthetic data.
# Every numeric column holds nine plausible readings followed by four
# deliberately extreme values, so the effect of the IQR capping is easy to see.
# The dates span 2011-2014; 2014 lies outside the years YearExtractor keeps by
# default.
data = {
    'gps_height': [1, 2, 3, 4, 5, 6, 7, 8, 10] + [-1000, 1200, 1500, 1800],
    'longitude': [30, 30.2, 29.8, 30.1, 30.5, 30.3, 29.9, 30.4, 30.2] + [-200, 300, 400, -180],
    'latitude': [-6, -5.5, -6.2, -6.1, -5.9, -6, -6.3, -5.8, -6] + [-100, 150, 200, -120],
    'date_recorded': [
        '2011-01-01', '2012-06-15', '2013-03-20', '2011-09-09',
        '2012-02-02', '2013-12-31', '2014-05-10', '2012-08-08',
        '2011-11-11', '2014-07-07', '2011-04-04', '2013-10-10', '2014-12-25',
    ],
}
df = pd.DataFrame(data)
class YearExtractor(BaseEstimator, TransformerMixin):
    """Replace a single date column by its calendar year.

    The first column of the input is parsed with ``pd.to_datetime`` and
    reduced to its year. Every year that is not listed in ``valid_years`` is
    replaced by ``default_year``.

    Parameters
    ----------
    valid_years : sequence of int, default=(2011, 2012, 2013)
        Years that are kept as they are.
    default_year : int, default=2011
        Value substituted for every year that is not in ``valid_years``.

    Attributes
    ----------
    feature_name_ : label
        Name of the column seen during ``fit``; also used as the name of the
        output column.
    """

    def __init__(self, valid_years=(2011, 2012, 2013), default_year=2011):
        # Stored untouched so that get_params()/clone() can rebuild the object.
        self.valid_years = valid_years
        self.default_year = default_year

    def fit(self, X, y=None):
        """Record the name of the date column.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame whose first column holds the dates.
        y : ignored

        Returns
        -------
        self
        """
        # Take the label from the data instead of assuming 'date_recorded', so
        # the transformer also works on a differently named column.
        self.feature_name_ = X.columns[0]
        return self

    def transform(self, X):
        """Return a one-column DataFrame with the (possibly replaced) years.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame whose first column holds the dates.

        Returns
        -------
        pandas.DataFrame
            One column named ``feature_name_``, with the same index as ``X``.
        """
        years = pd.to_datetime(X.iloc[:, 0]).dt.year
        is_valid = years.isin(list(self.valid_years))
        years = years.where(is_valid, self.default_year)
        # to_frame() relabels the Series. pd.DataFrame(series, columns=[name])
        # would instead select by label and discard the values whenever the
        # incoming column is not already called `name`.
        return years.to_frame(name=self.feature_name_)

    def get_feature_names_out(self, input_features=None):
        """Return the output column name as a one-element list."""
        return [self.feature_name_]


    
    
    
class IQRCapper(BaseEstimator, TransformerMixin):
    """Clip one numeric column to bounds derived from its interquartile range.

    Only the first column of the input is used, both in ``fit`` and in
    ``transform``.

    Parameters
    ----------
    multiplier : float, default=1.5
        How many interquartile ranges below the first quartile / above the
        third quartile the lower / upper bound is placed.

    Attributes
    ----------
    feature_name : label
        Name of the column seen during ``fit``; used as the output column name.
    q1_, q3_ : float
        25th and 75th percentile of the fitted column.
    iqr_ : float
        ``q3_ - q1_``.
    lower_bound_, upper_bound_ : float
        ``q1_ - multiplier * iqr_`` and ``q3_ + multiplier * iqr_``.
    """

    def __init__(self, multiplier=1.5):
        self.multiplier = multiplier

    def fit(self, X, y=None):
        """Learn the clipping bounds from the first column of ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame whose first column holds the values to cap.
        y : ignored

        Returns
        -------
        self
        """
        self.feature_name = X.columns[0]
        values = X.iloc[:, 0]
        self.q1_ = values.quantile(0.25)
        self.q3_ = values.quantile(0.75)
        self.iqr_ = self.q3_ - self.q1_
        margin = self.multiplier * self.iqr_
        self.lower_bound_ = self.q1_ - margin
        self.upper_bound_ = self.q3_ + margin
        return self

    def transform(self, X):
        """Clip the first column of ``X`` to the bounds learned in ``fit``.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame whose first column holds the values to cap.

        Returns
        -------
        pandas.DataFrame
            One column named ``feature_name``, with the same index as ``X``.
        """
        capped = X.iloc[:, 0].clip(self.lower_bound_, self.upper_bound_)
        # to_frame() relabels the Series. pd.DataFrame(series, columns=[name])
        # would instead select by label and discard the values if the column
        # passed to transform is named differently from the one seen in fit.
        return capped.to_frame(name=self.feature_name)

    def get_feature_names_out(self, input_features=None):
        """Return the output column name as a one-element list."""
        return [self.feature_name]

# Date branch: reduce the date column to a year, then one-hot encode that year.
# drop='first' leaves out the indicator column of the first category.
date_recorded_transformer_pipeline = Pipeline(
    steps=[
        ('year_extractor', YearExtractor()),
        (
            'onehot',
            OneHotEncoder(
                drop='first',
                handle_unknown='ignore',
                sparse_output=False,
            ),
        ),
    ]
)

# Numeric branch: cap outliers with the IQR rule.
# NOTE: a ('scaler', MinMaxScaler()) step is currently disabled, so despite the
# variable name this pipeline performs no min-max scaling.
oulier_minmax_pipeline = Pipeline([('iqr_cap', IQRCapper())])

# ColumnTransformer and full pipeline setup.
# The same capping pipeline object is listed once per numeric column;
# ColumnTransformer fits a separate clone for every entry, so each column gets
# its own bounds. Columns that are not listed are passed through unchanged, and
# output names are not prefixed with the entry name.
preprocessor = ColumnTransformer(
    transformers=[
        ('date', date_recorded_transformer_pipeline, ['date_recorded']),
        *(
            (entry_name, oulier_minmax_pipeline, [column_name])
            for entry_name, column_name in (
                ('outlier_minmax1', 'gps_height'),
                ('outlier_minmax2', 'longitude'),
                ('outlier_minmax3', 'latitude'),
            )
        ),
    ],
    remainder='passthrough',
    verbose_feature_names_out=False,
)

# Single-step pipeline wrapping the column transformer.
preprocess_pipeline = Pipeline(steps=[('preprocessing', preprocessor)])

# Fit the preprocessing on the synthetic frame and apply it to the same frame.
# fit() returns the pipeline itself, so the two calls can be chained.
transformed_values = preprocess_pipeline.fit(df).transform(df)

# Column names produced by the fitted column transformer
columns = preprocess_pipeline.named_steps['preprocessing'].get_feature_names_out()

# Rebuild a labelled DataFrame; from here on `df` is the transformed data
df = pd.DataFrame(data=transformed_values, columns=columns)
df

Unnamed: 0,date_recorded_2012,date_recorded_2013,gps_height,longitude,latitude
0,0.0,0.0,1.0,30.0,-6.0
1,1.0,0.0,2.0,30.2,-5.5
2,0.0,1.0,3.0,29.8,-6.2
3,0.0,0.0,4.0,30.1,-6.1
4,1.0,0.0,5.0,30.5,-5.9
5,0.0,1.0,6.0,30.3,-6.0
6,0.0,0.0,7.0,29.9,-6.3
7,1.0,0.0,8.0,30.4,-5.8
8,0.0,0.0,10.0,30.2,-6.0
9,0.0,0.0,-7.5,29.15,-6.8


In [37]:
# Show the column labels of `df` as left by the previous cell (the transformed frame)
df.columns

Index(['date_recorded_2012', 'date_recorded_2013', 'gps_height', 'longitude',
       'latitude'],
      dtype='object')