# Feature_Pipeline

In [8]:
import pandas as pd
import logging

from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler
from feature_engine.datetime import DatetimeFeatures
from feature_engine.selection import SmartCorrelatedSelection, RecursiveFeatureElimination
from feature_engine.timeseries.forecasting import LagFeatures, WindowFeatures, ExpandingWindowFeatures

### Logging Configuration
# Root logging is configured at DEBUG level with a timestamped record format.
# The clock is 24-hour (%H): a 12-hour %I without an AM/PM marker (%p) would
# render 03:00 and 15:00 identically, making the log times ambiguous.
logging.basicConfig(
    level=logging.DEBUG,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    datefmt='%d-%b(%m)-%Y %H:%M:%S',
)
# Module-level logger used by every function below.
logger = logging.getLogger(__name__)

# Loading data
def getData(path: str) -> "pd.DataFrame | None":
    """Load the CSV file at ``path`` into a DataFrame.

    Args:
        path: Location of the CSV file, handed directly to ``pd.read_csv``.

    Returns:
        The loaded DataFrame, or ``None`` when reading fails. The failure is
        logged (with traceback) rather than raised, so callers must be ready
        for ``None``.
    """
    try:
        return pd.read_csv(path)
    except Exception as e:
        # Best-effort load: report the problem and signal it with None.
        logger.exception(f"in getData(): {e}")
        return None

def dataCleaning() -> None:
    """Clean the pickup timestamps of the module-level ``df``.

    Steps, in order:
      1. parse ``tpep_pickup_datetime`` into a new ``timestamp`` column;
      2. drop the original ``tpep_pickup_datetime`` column;
      3. drop rows whose ``timestamp`` duplicates an earlier row;
      4. drop rows whose ``timestamp`` is later than the cutoff.

    The global ``df`` is rebound to an independent copy of the filtered rows,
    so later column assignments do not write into a slice of the old frame.
    Any error is logged, not raised; ``df`` may then be left partly cleaned.
    """
    global df
    try:
        df['timestamp'] = pd.to_datetime(df['tpep_pickup_datetime'])
        df.drop(columns=['tpep_pickup_datetime'], inplace=True)
        df.drop_duplicates(subset=['timestamp'], inplace=True)
        # NOTE(review): this cutoff also removes every row on 2022-12-31
        # after midnight - confirm that is intended and not an off-by-one-day.
        cutoff = pd.Timestamp('2022-12-31 00:00:00')
        # "not greater than" (rather than "<=") keeps NaT rows, as before.
        df = df.loc[~(df['timestamp'] > cutoff)].copy()
    except Exception as e:
        logger.error(f'in dataCleaning(): {e}')
        
# Feature engineering starts here
def add_temporal_features() -> None:
    """Add date/time features derived from ``timestamp`` to the global ``df``.

    A ``DatetimeFeatures`` transformer is fitted on the ``timestamp`` column
    alone, and every column it returns is copied onto ``df`` by position.
    Errors are logged, not raised.
    """
    global df
    try:
        calendar_parts = [
            "month", "quarter", "semester", "year", "week",
            "day_of_week", "day_of_month", "day_of_year", "weekend",
        ]
        boundary_parts = [
            "month_start", "month_end", "quarter_start", "quarter_end",
            "year_start", "year_end", "leap_year", "days_in_month",
        ]
        clock_parts = ["hour", "minute", "second"]
        extractor = DatetimeFeatures(
            features_to_extract=calendar_parts + boundary_parts + clock_parts
        )
        extracted = extractor.fit_transform(df[['timestamp']])
        # Copy raw values so the assignment is positional, not index-aligned.
        for name, series in extracted.items():
            df.loc[:, name] = series.values
    except Exception as e:
        logger.error(f'in add_temporal_features(): {e}')

def add_lag_features() -> None:
    """Add lagged copies of the demand columns to the global ``df``.

    ``LagFeatures`` is configured with row lags of 1, 2, 4, 8, 16 and 24 and
    applied to ``timestamp``, ``passenger_demand`` and ``taxi_demand``. Only
    the columns after the first three of the transformer output are copied
    back, by position. Errors are logged, not raised.
    """
    global df
    try:
        lagger = LagFeatures(
            variables=None,
            periods=[1, 2, 4, 8, 16, 24],
            freq=None,
            sort_index=True,
            missing_values='raise',
            drop_original=False,
        )
        lagger.fit(df[['timestamp', 'passenger_demand', 'taxi_demand']])
        lagged = lagger.transform(df[['timestamp', 'passenger_demand', 'taxi_demand']])
        # Skip the three input columns, which precede the generated ones.
        for name in lagged.columns[3:]:
            df[name] = lagged[name].values
    except Exception as e:
        logger.error(f'in The add_lag_features(): {e}')
        
def add_window_features() -> None:
    """Add rolling-window statistics of the demand columns to the global ``df``.

    ``WindowFeatures`` is configured with ``window=7``, ``min_periods=1``,
    the functions mean/std/median and ``periods=1``. It is applied to
    ``timestamp``, ``passenger_demand`` and ``taxi_demand``; only the columns
    after the first three of its output are copied back, by position.
    Errors are logged, not raised.
    """
    global df
    try:
        window = WindowFeatures(
            variables=None, window=7, min_periods=1,
            functions=['mean', 'std', 'median'], periods=1, freq=None, sort_index=True,
            missing_values='raise', drop_original=False
        )
        source = df[['timestamp', 'passenger_demand', 'taxi_demand']]
        # fit_transform already fits; a separate fit() beforehand only
        # repeated the same work on the same data.
        features = window.fit_transform(source)
        # Skip the three input columns, which precede the generated ones.
        for col in list(features.columns)[3:]:
            df[col] = features[col].values
    except Exception as e:
        logger.error(f'in add_window_features(): {e}')
        
def add_exp_window_features() -> None:
    """Add expanding-window standard deviations to the global ``df``.

    ``ExpandingWindowFeatures`` is configured with ``functions='std'`` and
    ``periods=1``. It is applied to ``timestamp``, ``passenger_demand`` and
    ``taxi_demand``; only the columns after the first three of its output
    are copied back, by position. Errors are logged, not raised.
    """
    global df
    try:
        expwindow = ExpandingWindowFeatures(
            variables=None, min_periods=None, functions='std',
            periods=1, freq=None, sort_index=True,
            missing_values='raise', drop_original=False
        )
        source = df[['timestamp', 'passenger_demand', 'taxi_demand']]
        # fit_transform already fits; a separate fit() beforehand only
        # repeated the same work on the same data.
        features = expwindow.fit_transform(source)
        # Skip the three input columns, which precede the generated ones.
        for col in list(features.columns)[3:]:
            df[col] = features[col].values
    except Exception as e:
        logger.error(f'in add_exp_window_features(): {e}')
        
# Feature selection starts here
def select_best_features() -> None:
    """Reduce the global ``df`` to the selected features plus ``taxi_demand``.

    Candidate features are all columns except ``timestamp``,
    ``passenger_demand`` and ``taxi_demand``. Two selectors run on the same
    candidates and the union of what they keep is retained:

      * ``SmartCorrelatedSelection`` (pearson, threshold 0.5, by variance);
      * ``RecursiveFeatureElimination`` with a depth-3 decision tree scored
        by r2 against ``taxi_demand``.

    The retained columns keep the order they had in ``df``, so the output
    layout is reproducible between runs. Errors are logged, not raised.
    """
    global df
    try:
        X = df.drop(columns=['timestamp', 'passenger_demand', 'taxi_demand'])
        y = df['taxi_demand']
        scs = SmartCorrelatedSelection(
            variables=None, method='pearson', threshold=0.5,
            missing_values='ignore', selection_method='variance',
            confirm_variables=False
        )
        selected = set(scs.fit_transform(X).columns)
        rfe = RecursiveFeatureElimination(
            DecisionTreeRegressor(max_depth=3), scoring='r2', cv=3, threshold=0.01,
            variables=None, confirm_variables=False
        )
        selected.update(rfe.fit_transform(X, y).columns)
        # Iterating a set of strings has no stable order across interpreter
        # runs, so rebuild the list in the frame's own column order.
        ordered = [col for col in X.columns if col in selected]
        # Copy so that adding the target below does not write into a slice.
        df = df[ordered].copy()
        df['taxi_demand'] = y
    except Exception as e:
        logger.error(f'in select_best_features(): {e}')

# Data Scaling here
def normalizeScaling() -> None:
    """Standardise every feature column of the global ``df``.

    All columns except ``taxi_demand`` are passed through ``StandardScaler``
    and replaced by the scaled values. Columns are addressed by name, so the
    result does not depend on where ``taxi_demand`` sits in the frame, and
    whole columns are replaced rather than written in place, so integer
    columns can receive float values without a dtype conflict.
    Errors (including a missing ``taxi_demand`` column) are logged, not raised.
    """
    global df
    try:
        # Raises KeyError when the target column is absent.
        features = df.drop(columns=['taxi_demand'])
        scaler = StandardScaler()
        df[features.columns] = scaler.fit_transform(features)
    except Exception as e:
        logger.error(f"in normalizeScaling(): {e}")
        
# Dimensionality reduction starts here
def reduceDimensionality(n_components: int = 19) -> None:
    """Replace the feature columns of the global ``df`` with PCA components.

    All columns except ``taxi_demand`` are projected with ``PCA``; the global
    ``df`` is rebound to a new frame with columns ``PC1`` .. ``PCk`` followed
    by ``taxi_demand``.

    Args:
        n_components: Requested number of components (default 19). It is
            capped at ``min(n_samples, n_features)``, the most PCA can
            return; a warning is logged when the cap applies.

    Errors are logged, not raised.
    """
    global df
    try:
        features = df.drop(columns=['taxi_demand'])
        target = df['taxi_demand']
        k = min(n_components, *features.shape)
        if k < n_components:
            logger.warning(
                f"in reduceDimensionality(): requested {n_components} components, "
                f"using {k} (data shape {features.shape})"
            )
        pca = PCA(n_components=k)
        features_reduced = pca.fit_transform(features)
        df = pd.DataFrame(features_reduced, columns=[f'PC{i}' for i in range(1, k + 1)])
        # The new frame has a fresh 0..n-1 index while ``target`` still
        # carries the old row labels. Assigning the Series would align on
        # labels and misplace or blank the target, so assign by position.
        df['taxi_demand'] = target.to_numpy()
    except Exception as e:
        logger.error(f"in reduceDimensionality(): {e}")
        
# Run all feature-pipeline steps in order
def preprocessFeatures() -> None:
    """Run feature engineering, selection, scaling and PCA on the global ``df``.

    Order: temporal, lag, window and expanding-window features; drop rows
    with missing values; feature selection; scaling; dimensionality
    reduction. Each step logs its own errors instead of raising, so a failed
    step does not stop the ones after it.

    A ``None`` frame is rejected before any step runs, and an empty frame
    after ``dropna`` stops the remaining steps; both cases are logged here
    rather than raised.
    """
    global df
    try:
        # Checked first: calling df.dropna on None would fail before the
        # emptiness check below could report anything useful.
        if df is None:
            raise ValueError("DataFrame is None; nothing to preprocess.")
        add_temporal_features()
        add_lag_features()
        add_window_features()
        add_exp_window_features()
        df.dropna(axis=0, inplace=True)
        if df.empty:
            raise ValueError("DataFrame is empty after dropping missing values.")
        ### call other steps
        select_best_features()
        normalizeScaling()
        reduceDimensionality()
    except Exception as e:
        logger.error(f'in preprocessFeatures(): {e}')


if __name__ == '__main__':
    # Load the raw data (None when the file cannot be read).
    df = getData(r'../data/2022.csv')
    # Clean timestamps in place on the global frame.
    dataCleaning()
    # Feature engineering, selection, scaling and PCA. Every step logs its
    # errors instead of raising, so ``df`` may be only partly processed here.
    preprocessFeatures()
    if df is not None:
        df.dropna(axis=0, inplace=True)
    # Do not write an empty file and report success.
    if df is not None and not df.empty:
        # NOTE(review): machine-specific absolute path. The training cell
        # reads '../data/featurePipelineFinalData.parquet' - confirm both
        # point at the same file.
        output_file_path = r"C:/Users/SRA/Desktop/backup/C/MLgrit/time_series_project/uber-taxi-demand/data/featurePipelineFinalData.parquet"
        df.to_parquet(output_file_path, index=False)
        print("data has been saved successfully!")
    else:
        print("No valid processed data to save.")

# Training_Pipeline

In [7]:
import pandas as pd
import mlflow.sklearn
import mlflow
import warnings
import xgboost as xgb
import logging

from scipy.stats import randint
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.metrics import mean_absolute_percentage_error, r2_score

# NOTE(review): this silences every warning, including deprecation notices
# from the libraries used below - consider narrowing the filter.
warnings.filterwarnings('ignore')

# Root logging is configured at DEBUG level with a timestamped record format.
# The clock is 24-hour (%H): a 12-hour %I without an AM/PM marker (%p) would
# render 03:00 and 15:00 identically, making the log times ambiguous.
logging.basicConfig(
    level=logging.DEBUG,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    datefmt='%d-%b(%m)-%Y %H:%M:%S',
)
# Module-level logger used by every function below.
logger = logging.getLogger(__name__)

#Loading data
def getData(path: str) -> "pd.DataFrame | None":
    """Load the parquet file at ``path`` into a DataFrame.

    Args:
        path: Location of the parquet file, handed directly to
            ``pd.read_parquet``.

    Returns:
        The loaded DataFrame, or ``None`` when reading fails. The failure is
        logged (with traceback) rather than raised, so callers must be ready
        for ``None``.
    """
    try:
        return pd.read_parquet(path)
    except Exception as e:
        # Best-effort load: report the problem and signal it with None.
        logger.exception(f"in getData(): {e}")
        return None

def splitting(test_size: float = 0.2, random_state: int = 42, shuffle: bool = True) -> tuple:
    """Split the global ``df`` into train and test features and target.

    Args:
        test_size: Fraction of rows placed in the test set (default 0.2).
        random_state: Seed for the shuffle (default 42); unused when
            ``shuffle`` is False.
        shuffle: Whether rows are shuffled before splitting (default True,
            the previous behaviour).
            NOTE(review): the rows presumably come from a time-ordered
            series; if so, ``shuffle=False`` keeps the test set strictly
            after the training set - confirm which is wanted.

    Returns:
        ``(X_train, X_test, y_train, y_test)``.

    Raises:
        Exception: any failure is logged and re-raised, because callers
            unpack the result and cannot use a ``None`` return.
    """
    global df
    try:
        X = df.drop(columns=["taxi_demand"])
        y = df["taxi_demand"]
        parts = train_test_split(
            X, y,
            test_size=test_size,
            random_state=random_state if shuffle else None,
            shuffle=shuffle,
        )
        return tuple(parts)
    except Exception as e:
        logger.error(f'in splitting(): {e}')
        raise

def myModelxgb(X_train, X_test, y_train, y_test) -> "xgb.XGBRegressor | None":
    """Tune an XGBoost regressor and record the run in MLflow.

    Inside one MLflow run of the "TimeSeries" experiment, a 20-iteration
    ``RandomizedSearchCV`` is fitted on the training data. The best estimator
    is evaluated on the test data, and its parameters, metrics (RMSE, MAE,
    MAPE, R2) and the model itself are logged to MLflow.

    Args:
        X_train: Training features.
        X_test: Test features.
        y_train: Training target.
        y_test: Test target.

    Returns:
        The best estimator found, or ``None`` when any step fails (the
        failure is logged with traceback, not raised).
    """
    try:
        mlflow.set_experiment("TimeSeries")
        with mlflow.start_run():
            x_model = xgb.XGBRegressor()
            param_dist = {
                'max_depth': randint(1, 16),
                'n_estimators': randint(100, 600),
                'min_child_weight': randint(1, 16),
                'gamma': [0, 0.1, 0.2],
                'colsample_bytree': [0.7, 0.8, 0.9],
                # NOTE(review): nthread looks like a thread-count setting
                # rather than a model hyperparameter - confirm it belongs
                # in the search space.
                'nthread': randint(1, 16),
            }
            # Run a randomized search over the distributions above.
            n_iter_search = 20
            random_search = RandomizedSearchCV(x_model, param_distributions=param_dist,
                                               n_iter=n_iter_search, random_state=42)
            random_search.fit(X_train, y_train)
            # Predict on the test set with the best estimator of the search.
            best_estimator = random_search.best_estimator_
            y_pred = best_estimator.predict(X_test)

            # Log the winning parameters.
            mlflow.log_params(random_search.best_params_)

            # RMSE is taken as the square root of the MSE instead of passing
            # ``squared=False``, which newer scikit-learn no longer accepts.
            rmse = mean_squared_error(y_test, y_pred) ** 0.5
            mape = mean_absolute_percentage_error(y_test, y_pred)
            mae = mean_absolute_error(y_test, y_pred)
            r2 = r2_score(y_test, y_pred)

            # Log metrics.
            # NOTE(review): the key "MAPE0" looks like a typo for "MAPE";
            # left unchanged in case existing runs are compared by this key.
            mlflow.log_metrics({
                "RMSE": rmse,
                "MAE": mae,
                "MAPE0": mape,
                "R2_SCORE": r2
            })

            # Save the best model obtained after hyperparameter tuning.
            mlflow.sklearn.log_model(best_estimator, 'XGBoost_best_model')

            return best_estimator
    except Exception as e:
        logger.exception(f"in myModelxgb(): {e}")
        return None

if __name__ == '__main__':
    # Load the feature table; getData logs and yields None when reading fails.
    df = getData(r'../data/featurePipelineFinalData.parquet')
    # splitting() reads the module-level ``df`` assigned above.
    X_train, X_test, y_train, y_test = splitting()
    # Tune, evaluate and log the model; the tuned estimator is returned.
    best_model = myModelxgb(X_train, X_test, y_train, y_test)
    # Use 'best_model' for further predictions or inference.

In [17]:
# Load the feature-2023 parquet file and preview its first rows.
# NOTE(review): in the preview output below, taxi_demand is empty for every
# displayed row - check how this file was produced (reduceDimensionality
# assigns the target Series onto a frame with a fresh index).
df = pd.read_parquet(r'../data/feature-2023.parquet')
df.head()

Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,PC11,PC12,PC13,PC14,PC15,PC16,PC17,PC18,PC19,taxi_demand
0,23.718273,14.930842,25.355853,8.588989,6.786783,-10.252492,-6.724879,-3.978018,-4.377113,0.737261,-15.359071,-0.060347,3.371094e-18,-1.369342e-18,-1.577304e-18,4.7663130000000004e-18,2.482778e-18,2.035194e-18,2.0221349999999998e-19,
1,3.807196,-0.70294,-2.078457,-0.502498,-0.952596,-1.075451,-0.169224,0.961579,-0.357883,-1.981192,-0.292281,2.143428,-3.267452e-16,1.856613e-16,2.516799e-16,-6.4788e-16,-1.491158e-16,-3.659323e-16,-1.940145e-16,
2,12.390297,4.29484,4.633114,-0.524657,-2.017603,4.93614,5.132366,-6.536679,-7.906023,-2.232191,8.96539,0.441244,3.222898e-17,-1.141686e-16,4.548973e-16,-6.967821e-16,2.830278e-16,-4.624935e-16,-1.881244e-16,
3,7.319921,1.029894,-0.167156,-0.922772,-1.066377,1.126567,0.672173,4.652307,5.595849,1.065017,-0.067521,1.039021,-5.239083e-16,4.357634e-16,-3.399934e-16,9.088027000000001e-17,-4.793232e-16,2.390657e-17,-1.896852e-16,
4,18.320177,9.328241,14.014048,3.330964,1.054496,-1.995706,0.691989,-1.674017,-2.040926,-1.95249,22.957588,-0.450683,-1.1057850000000002e-17,5.503029000000001e-17,-2.298146e-16,3.555405e-16,-1.377897e-16,2.342995e-16,9.436554000000001e-17,
