# Ticket Analytics Milestones Notebook

This notebook orchestrates Milestone 1 cleaning and feature engineering.


In [None]:
import os
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Locate the project root: the first of the working directory or its parent
# that contains a `src/` folder. When one is found it becomes BASE_DIR and is
# appended to sys.path (if absent) so `src.*` imports resolve; otherwise
# BASE_DIR falls back to the working directory and sys.path is left alone.
CWD = os.path.abspath(os.getcwd())
_project_root = next(
    (
        folder
        for folder in (CWD, os.path.dirname(CWD))
        if os.path.isdir(os.path.join(folder, "src"))
    ),
    None,
)
if _project_root is None:
    BASE_DIR = CWD
else:
    BASE_DIR = _project_root
    if BASE_DIR not in sys.path:
        sys.path.append(BASE_DIR)

from src.schema import detect_schema

# Apply the ggplot style to all figures produced below
plt.style.use('ggplot')

# Input and output locations, all anchored at the detected project root
RAW_PATH = os.path.join(BASE_DIR, 'data', 'raw', 'customer_support_tickets.csv')
PROCESSED_DIR = os.path.join(BASE_DIR, 'data', 'processed')
FIG_DIR = os.path.join(BASE_DIR, 'outputs', 'figures')
REPORT_DIR = os.path.join(BASE_DIR, 'reports')

# Create the output folders up front; existing folders are left untouched
for _out_dir in (PROCESSED_DIR, FIG_DIR, REPORT_DIR):
    os.makedirs(_out_dir, exist_ok=True)


## Milestone 1 / Module 1: Initialization & Dataset Setup

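Load the raw CSV, inspect its schema, dtypes, and missing-value counts, and detect canonical column names using `src.schema.detect_schema`. Bar charts of the initial ticket distributions (type, priority, category) are saved to `outputs/figures/` for each column the schema detects.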


In [None]:
# Read the raw ticket export and print a structural summary
df_raw = pd.read_csv(RAW_PATH)
print('Raw shape:', df_raw.shape)
print('Columns:', df_raw.columns.tolist())
print('\nDtypes:\n', df_raw.dtypes)

# Per-column missing-value counts, largest first; only the top 20 are shown
_missing_by_column = df_raw.isna().sum().sort_values(ascending=False)
print('\nMissing (top 20):\n', _missing_by_column.head(20))

# Map logical field names onto the columns present in this file
schema = detect_schema(df_raw)
print('\nDetected schema mapping:')
for field, column in schema.as_dict().items():
    print(f'- {field}: {column}')

# Initial ticket distributions (type, priority, category)
def save_bar(series, title, filename, top_n=15):
    """Save a bar chart of the most frequent values in ``series``.

    Missing values are dropped and the remaining values are converted to
    whitespace-stripped strings before counting. The ``top_n`` most frequent
    values are drawn in ascending order of count and the figure is written
    to ``FIG_DIR/filename`` at 200 dpi.

    Args:
        series: pandas Series holding the values to count.
        title: Title placed on the chart.
        filename: Output file name, joined onto ``FIG_DIR``.
        top_n: Maximum number of distinct values to plot.
    """
    cleaned = series.dropna().astype(str).str.strip()
    counts = cleaned.value_counts().head(top_n).sort_values()

    # Work on an explicit figure/axes pair instead of pyplot's implicit
    # "current figure", and close it in `finally` so a failure while
    # plotting or saving does not leave an open figure behind.
    fig, ax = plt.subplots()
    try:
        counts.plot(kind='bar', ax=ax)
        ax.set_title(title)
        ax.set_xlabel('Value')
        ax.set_ylabel('Count')
        fig.tight_layout()
        fig.savefig(os.path.join(FIG_DIR, filename), dpi=200)
    finally:
        plt.close(fig)

# One chart per detected logical column; fields the schema did not detect
# (falsy attribute) are skipped.
for _field, _label, _png in (
    ('type', 'Ticket Distribution by Type', 'm1_type_distribution.png'),
    ('priority', 'Ticket Distribution by Priority', 'm1_priority_distribution.png'),
    ('category', 'Ticket Distribution by Category', 'm1_category_distribution.png'),
):
    if getattr(schema, _field):
        save_bar(df_raw[getattr(schema, _field)], _label, _png)


## Milestone 1 / Module 2: Cleaning & Feature Engineering

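Run the robust cleaning and feature-engineering pipeline in `src.milestone1_pipeline` (the module executes when imported) to produce processed datasets and documentation, then load `data/processed/tickets_features.csv` for the downstream modules.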


In [None]:
# Importing the module is what runs the Milestone 1 pipeline.
# NOTE(review): Python caches imported modules, so re-running this cell in
# the same kernel will not execute the pipeline again — restart the kernel
# (or reload the module) if a fresh run is needed.
import src.milestone1_pipeline  # noqa: F401

# Read the engineered features back in for the downstream modules
features_path = os.path.join(PROCESSED_DIR, 'tickets_features.csv')
df_feat = pd.read_csv(features_path)
schema_feat = detect_schema(df_feat)

print('Features shape:', df_feat.shape)
print('Schema on features:')
for field, column in schema_feat.as_dict().items():
    print(f'- {field}: {column}')
