## Best Practices for Data Preprocessing

## Always Explore & Visualize Data First

In [2]:
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "def validate_dataframe(df):\n",
    "    \"\"\"\n",
    "    Validates the dataframe by checking for the necessary columns and correct data types.\n",
    "\n",
    "    Args:\n",
    "        df (pd.DataFrame): The dataframe to validate.\n",
    "    \n",
    "    Returns:\n",
    "        bool: True if the dataframe is valid, raises ValueError otherwise.\n",
    "    \"\"\"\n",
    "    required_columns = ['Age', 'Gender', 'Income']\n",
    "    if not all(col in df.columns for col in required_columns):\n",
    "        raise ValueError(f\"Missing required columns: {', '.join(required_columns)}\")\n",
    "    \n",
    "    if not np.issubdtype(df['Age'].dtype, np.number) or not np.issubdtype(df['Income'].dtype, np.number):\n",
    "        raise ValueError(\"Columns 'Age' and 'Income' should be numeric.\")\n",
    "    \n",
    "    if not np.issubdtype(df['Gender'].dtype, object):  # Change np.object to object\n",
    "        raise ValueError(\"Column 'Gender' should be categorical.\")\n",
    "\n",
    "    return True\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/tmp/ipykernel_13424/3320633278.py:44: FutureWarning: In the future `np.object` will be defined as the corresponding NumPy scalar.\n",
      "  if not np.issubdtype(df['Gender'].dtype, np.object):\n"
     ]
    },
    {
     "ename": "AttributeError",
     "evalue": "module 'numpy' has no attribute 'object'.\n`np.object` was a deprecated alias for the builtin `object`. To avoid this error in existing code, use `object` by itself. Doing this will not modify any behavior and is safe. \nThe aliases was originally deprecated in NumPy 1.20; for more details and guidance see the original release note at:\n    https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mAttributeError\u001b[0m                            Traceback (most recent call last)",
      "Cell \u001b[0;32mIn[3], line 160\u001b[0m\n\u001b[1;32m    157\u001b[0m df \u001b[38;5;241m=\u001b[39m load_data()\n\u001b[1;32m    159\u001b[0m \u001b[38;5;66;03m# Apply preprocessing pipeline\u001b[39;00m\n\u001b[0;32m--> 160\u001b[0m preprocessed_data \u001b[38;5;241m=\u001b[39m \u001b[43mapply_pipeline\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdf\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    161\u001b[0m \u001b[38;5;28mprint\u001b[39m(preprocessed_data)\n",
      "Cell \u001b[0;32mIn[3], line 139\u001b[0m, in \u001b[0;36mapply_pipeline\u001b[0;34m(df)\u001b[0m\n\u001b[1;32m    129\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m    130\u001b[0m \u001b[38;5;124;03mApplies the preprocessing pipeline to the given dataframe.\u001b[39;00m\n\u001b[1;32m    131\u001b[0m \n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m    136\u001b[0m \u001b[38;5;124;03m    pd.DataFrame: The preprocessed dataframe.\u001b[39;00m\n\u001b[1;32m    137\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m    138\u001b[0m \u001b[38;5;66;03m# Validate the dataframe\u001b[39;00m\n\u001b[0;32m--> 139\u001b[0m \u001b[43mvalidate_dataframe\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdf\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    141\u001b[0m \u001b[38;5;66;03m# Split the data into features and target (if applicable)\u001b[39;00m\n\u001b[1;32m    142\u001b[0m X \u001b[38;5;241m=\u001b[39m df\u001b[38;5;241m.\u001b[39mdrop(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mIncome\u001b[39m\u001b[38;5;124m'\u001b[39m, axis\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m1\u001b[39m)  \u001b[38;5;66;03m# Assuming 'Income' is the target, adjust as needed\u001b[39;00m\n",
      "Cell \u001b[0;32mIn[3], line 44\u001b[0m, in \u001b[0;36mvalidate_dataframe\u001b[0;34m(df)\u001b[0m\n\u001b[1;32m     41\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m np\u001b[38;5;241m.\u001b[39missubdtype(df[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mAge\u001b[39m\u001b[38;5;124m'\u001b[39m]\u001b[38;5;241m.\u001b[39mdtype, np\u001b[38;5;241m.\u001b[39mnumber) \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m np\u001b[38;5;241m.\u001b[39missubdtype(df[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mIncome\u001b[39m\u001b[38;5;124m'\u001b[39m]\u001b[38;5;241m.\u001b[39mdtype, np\u001b[38;5;241m.\u001b[39mnumber):\n\u001b[1;32m     42\u001b[0m     \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mColumns \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mAge\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m and \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mIncome\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m should be numeric.\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m---> 44\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m np\u001b[38;5;241m.\u001b[39missubdtype(df[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mGender\u001b[39m\u001b[38;5;124m'\u001b[39m]\u001b[38;5;241m.\u001b[39mdtype, \u001b[43mnp\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mobject\u001b[49m):\n\u001b[1;32m     45\u001b[0m     \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mColumn \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mGender\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m should be categorical.\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m     47\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;01mTrue\u001b[39;00m\n",
      "File \u001b[0;32m~/.local/lib/python3.10/site-packages/numpy/__init__.py:324\u001b[0m, in \u001b[0;36m__getattr__\u001b[0;34m(attr)\u001b[0m\n\u001b[1;32m    319\u001b[0m     warnings\u001b[38;5;241m.\u001b[39mwarn(\n\u001b[1;32m    320\u001b[0m         \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mIn the future `np.\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mattr\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m` will be defined as the \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m    321\u001b[0m         \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcorresponding NumPy scalar.\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;167;01mFutureWarning\u001b[39;00m, stacklevel\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m2\u001b[39m)\n\u001b[1;32m    323\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m attr \u001b[38;5;129;01min\u001b[39;00m __former_attrs__:\n\u001b[0;32m--> 324\u001b[0m     \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mAttributeError\u001b[39;00m(__former_attrs__[attr])\n\u001b[1;32m    326\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m attr \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mtesting\u001b[39m\u001b[38;5;124m'\u001b[39m:\n\u001b[1;32m    327\u001b[0m     \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mnumpy\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mtesting\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mas\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mtesting\u001b[39;00m\n",
      "\u001b[0;31mAttributeError\u001b[0m: module 'numpy' has no attribute 'object'.\n`np.object` was a deprecated alias for the builtin `object`. To avoid this error in existing code, use `object` by itself. Doing this will not modify any behavior and is safe. \nThe aliases was originally deprecated in NumPy 1.20; for more details and guidance see the original release note at:\n    https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from sklearn.preprocessing import StandardScaler, OneHotEncoder\n",
    "from sklearn.impute import SimpleImputer\n",
    "from sklearn.pipeline import Pipeline\n",
    "from sklearn.compose import ColumnTransformer\n",
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "# Step 1: Load a sample dataset\n",
    "def load_data():\n",
    "    \"\"\"\n",
    "    Load a sample dataset for preprocessing. The dataset contains both numerical and categorical columns.\n",
    "\n",
    "    Returns:\n",
    "        pd.DataFrame: The loaded dataset.\n",
    "    \"\"\"\n",
    "    # Example dataset: Replace this with your actual data loading step\n",
    "    data = {\n",
    "        'Age': [25, np.nan, 30, 35, np.nan],\n",
    "        'Gender': ['M', 'F', 'M', 'F', 'M'],\n",
    "        'Income': [50000, 60000, 55000, np.nan, 70000]\n",
    "    }\n",
    "    df = pd.DataFrame(data)\n",
    "    return df\n",
    "\n",
    "# Step 2: Check input dataframe\n",
    "def validate_dataframe(df):\n",
    "    \"\"\"\n",
    "    Validates the dataframe by checking for the necessary columns and correct data types.\n",
    "\n",
    "    Args:\n",
    "        df (pd.DataFrame): The dataframe to validate.\n",
    "    \n",
    "    Returns:\n",
    "        bool: True if the dataframe is valid, raises ValueError otherwise.\n",
    "    \"\"\"\n",
    "    required_columns = ['Age', 'Gender', 'Income']\n",
    "    if not all(col in df.columns for col in required_columns):\n",
    "        raise ValueError(f\"Missing required columns: {', '.join(required_columns)}\")\n",
    "    \n",
    "    if not np.issubdtype(df['Age'].dtype, np.number) or not np.issubdtype(df['Income'].dtype, np.number):\n",
    "        raise ValueError(\"Columns 'Age' and 'Income' should be numeric.\")\n",
    "    \n",
    "    if not np.issubdtype(df['Gender'].dtype, np.object):\n",
    "        raise ValueError(\"Column 'Gender' should be categorical.\")\n",
    "\n",
    "    return True\n",
    "\n",
    "# Step 3: Imputation Function\n",
    "def impute_data(df):\n",
    "    \"\"\"\n",
    "    Fills missing values in the dataframe using mean imputation for numerical columns.\n",
    "\n",
    "    Args:\n",
    "        df (pd.DataFrame): The dataframe with missing values.\n",
    "\n",
    "    Returns:\n",
    "        pd.DataFrame: The dataframe with missing values imputed.\n",
    "    \"\"\"\n",
    "    numerical_cols = df.select_dtypes(include=['float64', 'int64']).columns\n",
    "    imputer = SimpleImputer(strategy='mean')\n",
    "    df[numerical_cols] = imputer.fit_transform(df[numerical_cols])\n",
    "    return df\n",
    "\n",
    "# Step 4: Encoding Categorical Data\n",
    "def encode_categorical(df):\n",
    "    \"\"\"\n",
    "    Encodes categorical columns using OneHotEncoder.\n",
    "\n",
    "    Args:\n",
    "        df (pd.DataFrame): The dataframe with categorical columns.\n",
    "\n",
    "    Returns:\n",
    "        pd.DataFrame: The dataframe with encoded categorical columns.\n",
    "    \"\"\"\n",
    "    # Check if 'Gender' column exists\n",
    "    if 'Gender' not in df.columns:\n",
    "        raise ValueError(\"Column 'Gender' is missing in the dataframe.\")\n",
    "    \n",
    "    one_hot_encoder = OneHotEncoder(sparse=False, drop='first')\n",
    "    gender_encoded = one_hot_encoder.fit_transform(df[['Gender']])\n",
    "    gender_encoded_df = pd.DataFrame(gender_encoded, columns=one_hot_encoder.get_feature_names_out(['Gender']))\n",
    "    df = df.join(gender_encoded_df).drop('Gender', axis=1)\n",
    "    return df\n",
    "\n",
    "# Step 5: Feature Scaling (Standardization)\n",
    "def scale_features(df):\n",
    "    \"\"\"\n",
    "    Scales the numerical features of the dataframe using StandardScaler.\n",
    "\n",
    "    Args:\n",
    "        df (pd.DataFrame): The dataframe with numerical columns to scale.\n",
    "\n",
    "    Returns:\n",
    "        pd.DataFrame: The dataframe with scaled features.\n",
    "    \"\"\"\n",
    "    numerical_cols = df.select_dtypes(include=['float64', 'int64']).columns\n",
    "    scaler = StandardScaler()\n",
    "    df[numerical_cols] = scaler.fit_transform(df[numerical_cols])\n",
    "    return df\n",
    "\n",
    "# Step 6: Build a Preprocessing Pipeline\n",
    "def build_pipeline():\n",
    "    \"\"\"\n",
    "    Builds a preprocessing pipeline to impute missing values, encode categorical data, \n",
    "    and scale numerical features.\n",
    "\n",
    "    Returns:\n",
    "        sklearn.pipeline.Pipeline: The complete preprocessing pipeline.\n",
    "    \"\"\"\n",
    "    # Column transformer to apply different transformations on different columns\n",
    "    preprocessor = ColumnTransformer(\n",
    "        transformers=[\n",
    "            ('num', SimpleImputer(strategy='mean'), ['Age', 'Income']),\n",
    "            ('cat', OneHotEncoder(sparse=False, drop='first'), ['Gender'])\n",
    "        ]\n",
    "    )\n",
    "    \n",
    "    # Create a pipeline with preprocessing steps\n",
    "    pipeline = Pipeline(steps=[\n",
    "        ('preprocessor', preprocessor),\n",
    "        ('scaler', StandardScaler())\n",
    "    ])\n",
    "    \n",
    "    return pipeline\n",
    "\n",
    "# Step 7: Apply the Preprocessing Pipeline\n",
    "def apply_pipeline(df):\n",
    "    \"\"\"\n",
    "    Applies the preprocessing pipeline to the given dataframe.\n",
    "\n",
    "    Args:\n",
    "        df (pd.DataFrame): The dataframe to preprocess.\n",
    "\n",
    "    Returns:\n",
    "        pd.DataFrame: The preprocessed dataframe.\n",
    "    \"\"\"\n",
    "    # Validate the dataframe\n",
    "    validate_dataframe(df)\n",
    "    \n",
    "    # Split the data into features and target (if applicable)\n",
    "    X = df.drop('Income', axis=1)  # Assuming 'Income' is the target, adjust as needed\n",
    "    y = df['Income']\n",
    "    \n",
    "    # Build and apply the pipeline\n",
    "    pipeline = build_pipeline()\n",
    "    X_transformed = pipeline.fit_transform(X)\n",
    "    \n",
    "    # Return the transformed data\n",
    "    transformed_df = pd.DataFrame(X_transformed, columns=['Age', 'Gender_M'])\n",
    "    transformed_df['Income'] = y\n",
    "    return transformed_df\n",
    "\n",
    "# Example usage\n",
    "if __name__ == \"__main__\":\n",
    "    # Load the data\n",
    "    df = load_data()\n",
    "    \n",
    "    # Apply preprocessing pipeline\n",
    "    preprocessed_data = apply_pipeline(df)\n",
    "    print(preprocessed_data)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}

{'cells': [{'cell_type': 'code',
   'execution_count': 2,
   'metadata': {},
   'outputs': [],
   'source': ['def validate_dataframe(df):\n',
    '    """\n',
    '    Validates the dataframe by checking for the necessary columns and correct data types.\n',
    '\n',
    '    Args:\n',
    '        df (pd.DataFrame): The dataframe to validate.\n',
    '    \n',
    '    Returns:\n',
    '        bool: True if the dataframe is valid, raises ValueError otherwise.\n',
    '    """\n',
    "    required_columns = ['Age', 'Gender', 'Income']\n",
    '    if not all(col in df.columns for col in required_columns):\n',
    '        raise ValueError(f"Missing required columns: {\', \'.join(required_columns)}")\n',
    '    \n',
    "    if not np.issubdtype(df['Age'].dtype, np.number) or not np.issubdtype(df['Income'].dtype, np.number):\n",
    '        raise ValueError("Columns \'Age\' and \'Income\' should be numeric.")\n',
    '    \n',
    "    if not np.issubdtype(df['Gender'].dtype, objec

## Handle Missing & Inconsistent Data Before Applying ML Models

In [None]:
# Task 4: Drop Missing Values





# Task 5: Fill Missing Values




# Task 6: Handling Outliers with Capping





## Choose the Right Scaling Method

In [None]:
# Task 7: Min-Max Scaling







# Task 8: Robust Scaling






# Task 9: MaxAbs Scaling






## Keep Track of Data Transformations for Reproducibility

In [None]:
# Task 10: Log Data Preprocessing Steps






# Task 11: Store Transformation Parameters




