In [1]:
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Data Drift Detection Results:\n",
      "Age: No Drift\n",
      "Salary: No Drift\n",
      "\n",
      "Data Quality Checks Results:\n",
      "Missing Values: Series([], dtype: int64)\n",
      "Duplicates: 0\n",
      "Outliers: 0\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "from scipy.stats import ks_2samp\n",
    "\n",
    "# Sample data for training and current dataset\n",
    "train_data = {\n",
    "    'Age': [25, 30, 35, 40, 45, 50, 55],\n",
    "    'Salary': [50000, 60000, 70000, 80000, 90000, 100000, 110000]\n",
    "}\n",
    "\n",
    "current_data = {\n",
    "    'Age': [26, 31, 36, 41, 46, 51, 60],\n",
    "    'Salary': [51000, 61000, 71000, 81000, 91000, 101000, 105000]\n",
    "}\n",
    "\n",
    "train_df = pd.DataFrame(train_data)\n",
    "current_df = pd.DataFrame(current_data)\n",
    "\n",
    "# Step 1: Detect Data Drift using KS-test (Kolmogorov-Smirnov test)\n",
    "def detect_data_drift(train_df, current_df, alpha=0.05):\n",
    "    \"\"\"Compare each training column against the current data with a two-sample KS-test.\n",
    "\n",
    "    Args:\n",
    "        train_df: Reference DataFrame; its columns drive the comparison.\n",
    "        current_df: DataFrame holding the newer observations.\n",
    "        alpha: Significance level; a p-value below it is reported as drift.\n",
    "\n",
    "    Returns:\n",
    "        dict mapping each training column to \"Drift Detected\", \"No Drift\",\n",
    "        \"Column missing in current data\" or \"Insufficient data\".\n",
    "    \"\"\"\n",
    "    drift_results = {}\n",
    "\n",
    "    for column in train_df.columns:\n",
    "        # A column dropped upstream is reported instead of raising KeyError\n",
    "        if column not in current_df.columns:\n",
    "            drift_results[column] = \"Column missing in current data\"\n",
    "            continue\n",
    "\n",
    "        # Drop NaNs first: they would either skew the statistic or yield a NaN\n",
    "        # p-value, and NaN < alpha is False, which would read as \"No Drift\"\n",
    "        train_values = train_df[column].dropna()\n",
    "        current_values = current_df[column].dropna()\n",
    "        if train_values.empty or current_values.empty:\n",
    "            drift_results[column] = \"Insufficient data\"\n",
    "            continue\n",
    "\n",
    "        # Only the p-value is needed; the KS statistic itself is discarded\n",
    "        _, p_value = ks_2samp(train_values, current_values)\n",
    "        drift_results[column] = \"Drift Detected\" if p_value < alpha else \"No Drift\"\n",
    "\n",
    "    return drift_results\n",
    "\n",
    "# Step 2: Data Quality Checks\n",
    "def data_quality_checks(df, z_threshold=3):\n",
    "    \"\"\"Summarise missing values, duplicate rows and z-score outliers in ``df``.\n",
    "\n",
    "    Args:\n",
    "        df: DataFrame to inspect.\n",
    "        z_threshold: Absolute z-score above which a value counts as an outlier.\n",
    "            With the sample standard deviation |z| can never exceed\n",
    "            (n - 1) / sqrt(n), so the default of 3 cannot fire on fewer than 11 rows.\n",
    "\n",
    "    Returns:\n",
    "        dict with 'Missing Values' (Series of null counts, restricted to columns\n",
    "        that have nulls), 'Duplicates' (number of duplicated rows) and 'Outliers'\n",
    "        (number of numeric cells whose |z| exceeds the threshold).\n",
    "    \"\"\"\n",
    "    quality_issues = {}\n",
    "\n",
    "    # Check for missing values, keeping only the columns that actually have some\n",
    "    missing_values = df.isnull().sum()\n",
    "    quality_issues['Missing Values'] = missing_values[missing_values > 0]\n",
    "\n",
    "    # Check for duplicate rows\n",
    "    quality_issues['Duplicates'] = df.duplicated().sum()\n",
    "\n",
    "    # Z-scores are only defined for numeric columns; running the arithmetic on\n",
    "    # text or datetime columns would raise instead of reporting anything\n",
    "    numeric_df = df.select_dtypes(include=[np.number])\n",
    "    z_scores = ((numeric_df - numeric_df.mean()) / numeric_df.std()).abs()\n",
    "    # A constant column yields NaN z-scores; NaN > threshold is False, so it is never flagged\n",
    "    quality_issues['Outliers'] = int((z_scores > z_threshold).sum().sum())\n",
    "\n",
    "    return quality_issues\n",
    "\n",
    "# Step 3: Visualize the Drift (Optional)\n",
    "def visualize_drift(train_df, current_df):\n",
    "    \"\"\"Overlay density histograms of training and current data, one figure per column.\n",
    "\n",
    "    Args:\n",
    "        train_df: Reference DataFrame; its columns decide which figures are drawn.\n",
    "        current_df: DataFrame with the newer observations.\n",
    "\n",
    "    Returns:\n",
    "        None. Figures are shown as a side effect.\n",
    "    \"\"\"\n",
    "    for column in train_df.columns:\n",
    "        # Nothing to overlay when the column is absent from the current data\n",
    "        if column not in current_df.columns:\n",
    "            continue\n",
    "\n",
    "        # Explicit axes keep both histograms on the same plot regardless of pyplot state\n",
    "        fig, ax = plt.subplots(figsize=(10, 6))\n",
    "        sns.histplot(train_df[column], color='blue', label='Training Data', kde=True, stat='density', linewidth=0, ax=ax)\n",
    "        sns.histplot(current_df[column], color='red', label='Current Data', kde=True, stat='density', linewidth=0, ax=ax)\n",
    "        ax.set_title(f'Distribution Comparison for {column}')\n",
    "        ax.legend()\n",
    "        plt.show()\n",
    "        # Release the figure so wide frames do not accumulate open figures\n",
    "        plt.close(fig)\n",
    "\n",
    "# Step 4: Execute Drift Detection and Data Quality Checks\n",
    "\n",
    "# Drift verdict for every feature of the training data\n",
    "drift_results = detect_data_drift(train_df, current_df)\n",
    "print(\"Data Drift Detection Results:\")\n",
    "for feature, drift in drift_results.items():\n",
    "    print(f\"{feature}: {drift}\")\n",
    "\n",
    "# Quality metrics are computed on the current data only\n",
    "quality_issues = data_quality_checks(current_df)\n",
    "print(\"\\nData Quality Checks Results:\")\n",
    "for issue, count in quality_issues.items():\n",
    "    print(f\"{issue}: {count}\")\n",
    "\n",
    "# Visualize Drift: visualize_drift(train_df, current_df) is defined above but not called here\n",
    "\n",
    "\n",
    "# Optimized version of data quality checks with better error handling and vectorization\n",
    "def data_quality_checks_optimized(df, z_threshold=3):\n",
    "    \"\"\"Report only the data quality problems that are actually present in ``df``.\n",
    "\n",
    "    Args:\n",
    "        df: DataFrame to inspect.\n",
    "        z_threshold: Absolute z-score above which a numeric value counts as an outlier.\n",
    "\n",
    "    Returns:\n",
    "        dict that may contain 'Missing Values' (Series of null counts for the\n",
    "        affected columns), 'Duplicates' (count of duplicated rows) and 'Outliers'\n",
    "        (total count of outlying numeric cells). A key is absent when its check\n",
    "        found nothing, so an empty dict means no issue was detected.\n",
    "    \"\"\"\n",
    "    quality_issues = {}\n",
    "\n",
    "    # Check for missing values - vectorized operation\n",
    "    missing_values = df.isnull().sum()\n",
    "    if missing_values.any():\n",
    "        quality_issues['Missing Values'] = missing_values[missing_values > 0]\n",
    "\n",
    "    # Check for duplicate rows - efficient without looping\n",
    "    duplicates = df.duplicated().sum()\n",
    "    if duplicates > 0:\n",
    "        quality_issues['Duplicates'] = duplicates\n",
    "\n",
    "    # Mean and std must come from the same numeric subset as the values:\n",
    "    # taking them from the full frame fails on text columns in recent pandas\n",
    "    numeric_df = df.select_dtypes(include=[np.number])\n",
    "    z_scores = ((numeric_df - numeric_df.mean()) / numeric_df.std()).abs()\n",
    "    outliers = (z_scores > z_threshold).sum()  # per-column outlier counts\n",
    "    if outliers.any():\n",
    "        quality_issues['Outliers'] = outliers.sum()\n",
    "\n",
    "    # Return collected quality issues\n",
    "    return quality_issues\n",
    "\n",
    "# Error handling when performing KS-test for drift detection\n",
    "def safe_ks_test(train_series, current_series):\n",
    "    \"\"\"Run a two-sample KS-test without letting a failure abort the caller.\n",
    "\n",
    "    Args:\n",
    "        train_series: Reference observations.\n",
    "        current_series: Newer observations to compare against the reference.\n",
    "\n",
    "    Returns:\n",
    "        (statistic, p_value) on success, or (None, None) when the test could\n",
    "        not be run; the reason is printed in that case.\n",
    "    \"\"\"\n",
    "    try:\n",
    "        # Missing values are dropped so they cannot skew the statistic or\n",
    "        # produce a NaN p-value that callers would mistake for a real result\n",
    "        train_values = pd.Series(train_series).dropna()\n",
    "        current_values = pd.Series(current_series).dropna()\n",
    "        if train_values.empty or current_values.empty:\n",
    "            raise ValueError(\"no non-missing observations to compare\")\n",
    "        statistic, p_value = ks_2samp(train_values, current_values)\n",
    "        return statistic, p_value\n",
    "    except Exception as e:\n",
    "        # Deliberate best-effort: report the problem and let the caller decide what to do\n",
    "        print(f\"Error in KS-test for columns: {e}\")\n",
    "        return None, None\n",
    "\n",
    "# Example of applying the error-safe KS-test for drift detection\n",
    "def detect_data_drift_with_error_handling(train_df, current_df, alpha=0.05):\n",
    "    \"\"\"Detect drift per training column, reporting problems instead of raising.\n",
    "\n",
    "    Args:\n",
    "        train_df: Reference DataFrame; its columns drive the comparison.\n",
    "        current_df: DataFrame holding the newer observations.\n",
    "        alpha: Significance level; a p-value below it is reported as drift.\n",
    "\n",
    "    Returns:\n",
    "        dict mapping every training column to \"Drift Detected\", \"No Drift\",\n",
    "        \"Column missing in current data\" or \"Test failed\".\n",
    "    \"\"\"\n",
    "    drift_results = {}\n",
    "    for column in train_df.columns:\n",
    "        if column not in current_df.columns:\n",
    "            drift_results[column] = \"Column missing in current data\"\n",
    "            continue\n",
    "\n",
    "        _, p_value = safe_ks_test(train_df[column], current_df[column])\n",
    "        # None means the test raised and NaN means it produced no usable p-value;\n",
    "        # record that explicitly rather than dropping the column or calling it \"No Drift\"\n",
    "        if p_value is None or np.isnan(p_value):\n",
    "            drift_results[column] = \"Test failed\"\n",
    "        else:\n",
    "            drift_results[column] = \"Drift Detected\" if p_value < alpha else \"No Drift\"\n",
    "\n",
    "    return drift_results"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}

{'cells': [{'cell_type': 'code',
   'execution_count': 6,
   'metadata': {},
   'outputs': [{'name': 'stdout',
     'output_type': 'stream',
     'text': ['Data Drift Detection Results:\n',
      'Age: No Drift\n',
      'Salary: No Drift\n',
      '\n',
      'Data Quality Checks Results:\n',
      'Missing Values: Series([], dtype: int64)\n',
      'Duplicates: 0\n',
      'Outliers: 0\n']}],
   'source': ['import pandas as pd\n',
    'import numpy as np\n',
    'import matplotlib.pyplot as plt\n',
    'import seaborn as sns\n',
    'from scipy.stats import ks_2samp\n',
    '\n',
    '# Sample data for training and current dataset\n',
    'train_data = {\n',
    "    'Age': [25, 30, 35, 40, 45, 50, 55],\n",
    "    'Salary': [50000, 60000, 70000, 80000, 90000, 100000, 110000]\n",
    '}\n',
    '\n',
    'current_data = {\n',
    "    'Age': [26, 31, 36, 41, 46, 51, 60],\n",
    "    'Salary': [51000, 61000, 71000, 81000, 91000, 101000, 105000]\n",
    '}\n',
    '\n',
    'train_df

In [None]:
# Part 2: Automating Data Quality Checks
# Objective: Use Python and data quality frameworks to automate validation.

# Task 1: Setting Up Automated Validation with Python

# Task 2: Introduction to Great Expectations: Install the great_expectations package and set up a basic project.

# Task 3: Creating Expectations with Great Expectations: Use Great Expectations to define data validation expectations for a dataset.


