In [11]:
import os
import glob
import json
import math
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path


# Notebook-wide display settings: show up to 120 DataFrame columns and
# render matplotlib figures at 120 dpi.
pd.set_option("display.max_columns", 120)
plt.rcParams.update({"figure.dpi": 120})

# Directory that receives the figures saved by the plotting cell. It is
# created up front (parents included) and left untouched if already present.
OUT_DIR = Path("notebooks/outputs")
OUT_DIR.mkdir(exist_ok=True, parents=True)

In [17]:
def read_jsonl_records(paths):
    """Parse every non-blank line of each JSONL file in ``paths``.

    Args:
        paths: iterable of file paths; each file is read as UTF-8, one JSON
            document per line.

    Returns:
        list of the decoded JSON values, in file order then line order.
        Lines that fail to decode are reported on stdout and skipped; lines
        containing only whitespace are skipped silently.
    """
    parsed = []
    for path in paths:
        with open(path, 'r', encoding='utf-8') as fh:
            for line in fh:
                # An empty line is not a record; without this guard
                # json.loads would raise and we would emit a bogus warning.
                if not line.strip():
                    continue
                try:
                    parsed.append(json.loads(line))
                except json.JSONDecodeError:
                    print(f'Warning: skipping malformed line in {path}')
    return parsed


def sized_len(value):
    """Return ``len(value)`` for sized values, 0 for anything else.

    None, NaN (what pandas inserts for a key missing from a record) and
    plain numbers have no length and count as 0 instead of raising.
    """
    return len(value) if hasattr(value, '__len__') else 0


def add_length_features(frame):
    """Return a copy of ``frame`` with ``description_len`` and ``num_comments``.

    ``description_len`` is the length of the ``description`` value (0 when it
    is missing, None, NaN or empty). ``num_comments`` is the length of the
    ``comments`` value when it is a list, else 0. A column that is absent from
    ``frame`` altogether is treated as all-missing rather than raising KeyError.
    """
    out = frame.copy()
    blank = pd.Series([None] * len(out), index=out.index, dtype=object)
    description = out['description'] if 'description' in out.columns else blank
    comments = out['comments'] if 'comments' in out.columns else blank
    out['description_len'] = description.apply(sized_len).astype('int64')
    out['num_comments'] = comments.apply(
        lambda cs: len(cs) if isinstance(cs, list) else 0
    ).astype('int64')
    return out


processed_files = sorted(glob.glob('data/processed/*.jsonl'))
if not processed_files:
    print('No processed JSONL files found in data/processed/.')

records = read_jsonl_records(processed_files)

if records:
    df = add_length_features(pd.DataFrame(records))
else:
    # Keep `df` defined so the `df.empty` guards in later cells still work.
    df = pd.DataFrame()
    print('No records loaded.')

# Jupyter only renders the last top-level expression of a cell, so the
# preview has to live here rather than inside the `if` body above.
df.head()

No processed JSONL files found in data/processed/.
No records loaded.


In [18]:
# The schema module is optional: any failure while importing it (missing
# package, missing dependency, error raised at import time) just disables
# the validation step below instead of stopping the notebook.
try:
    from validators.schema import IssueRecord
    PYDANTIC_AVAILABLE = True
except Exception:
    PYDANTIC_AVAILABLE = False


def validate_sample(frame, model, sample_size=100):
    """Validate the first ``sample_size`` rows of ``frame`` against ``model``.

    Args:
        frame: DataFrame whose rows are converted to dicts and validated.
        model: class exposing ``model_validate`` (Pydantic v2) or
            ``parse_obj`` (Pydantic v1) — presumably a pydantic model;
            confirm against validators.schema.
        sample_size: maximum number of leading rows to check.

    Returns:
        list of ``(index_label, error_message)`` tuples, one per row whose
        validation raised.
    """
    # Pydantic v2 deprecates parse_obj in favour of model_validate; use the
    # new entry point when it exists and fall back for v1 models.
    validate = getattr(model, 'model_validate', None) or model.parse_obj
    failures = []
    # Iterate positionally so repeated index labels still yield single rows.
    for idx, row in frame.head(sample_size).iterrows():
        try:
            validate(row.to_dict())
        except Exception as exc:
            failures.append((idx, str(exc)))
    return failures


if PYDANTIC_AVAILABLE and not df.empty:
    sample_size = min(100, len(df))
    errors = validate_sample(df, IssueRecord, sample_size)
    print(f'Validation sample size: {sample_size}, errors found: {len(errors)}')
else:
    print('Pydantic not available or no data loaded; skipping schema validation.')

Pydantic not available or no data loaded; skipping schema validation.


In [14]:
if not df.empty:
    # One entry per figure: source column, figure size, title, output file
    # name (relative to OUT_DIR) and the routine that draws the column onto
    # the current figure.
    chart_specs = (
        ('issuetype', (8, 3), 'Issue Types', 'issue_types.png',
         lambda col: col.value_counts().plot(kind='bar')),
        ('status', (8, 3), 'Status Distribution', 'status_distribution.png',
         lambda col: col.value_counts().plot(kind='bar')),
        ('description_len', (8, 4), 'Description Length', 'description_length.png',
         lambda col: col.hist(bins=40)),
    )

    for chart_column, chart_size, chart_title, chart_file, draw_chart in chart_specs:
        plt.figure(figsize=chart_size)
        draw_chart(df[chart_column])
        plt.title(chart_title)
        plt.tight_layout()
        # Save before show so the file is written from the finished figure.
        plt.savefig(OUT_DIR / chart_file)
        plt.show()

In [15]:
if not df.empty:
    print('Total records:', len(df))
    empty_description_count = (df['description_len'] == 0).sum()
    print('Records with empty description:', empty_description_count)

    # Information score = 2 * log1p(description length) + log1p(comment count),
    # computed column by column and stored on `df` as `info_score`.
    description_term = 2 * df['description_len'].map(math.log1p)
    comment_term = df['num_comments'].map(math.log1p)
    df['info_score'] = description_term + comment_term

    # Rows at or below the 15th-percentile score are flagged as low-information.
    threshold = df['info_score'].quantile(0.15)
    low_info = df.loc[df['info_score'] <= threshold]
    print('Low-information records (bottom 15%):', len(low_info))

In [16]:
# Write the whole DataFrame out as a single HTML table, or say so when
# there is nothing to write.
if df.empty:
    print('No data to export.')
else:
    html_path = Path('notebooks/report.html')
    df.to_html(html_path)
    print('HTML report saved to', html_path)

No data to export.
