In [10]:
#1
# import and paths
import os
import pandas as pd
from src.data_utils import clean_data, quick_health_report

# Input and output locations, relative to the notebook's working directory.
RAW_PATH = "../data/raw/prs.csv"
CLEAN_DIR = "../data/clean"

# Files written into CLEAN_DIR by the later cells.
CSV_PATH = os.path.join(CLEAN_DIR, "prs_clean.csv")
PARQ_PATH = os.path.join(CLEAN_DIR, "prs_clean.parquet")

# Create the output directory once; exist_ok=True keeps this cell safe to re-run.
os.makedirs(CLEAN_DIR, exist_ok=True)

print("Paths set")



Paths set


In [11]:
# 2
# Read the raw PR export into `df`; stop with an explicit error when the file is absent.
if os.path.exists(RAW_PATH):
    df = pd.read_csv(RAW_PATH)
else:
    raise FileNotFoundError(f"Raw file missing at {RAW_PATH}")

# Row count plus a three-row preview of what was loaded.
print(f"Loaded raw rows: {df.shape[0]}")
display(df.head(n=3))

Loaded raw rows: 100


Unnamed: 0,id,number,title,author,body,created_at,state,labels,reviewers
0,2801771679,55290,Fixed rows_processor in SQLInsertRowsOperator,dabla,While testing the new providers in Airflow 3.0...,2025-09-05T07:43:33Z,closed,area:providers;provider:common-sql,eladkal
1,2801526484,55288,Remove python_callable as string from mapped o...,jedcunningham,Remove string serialization of pythoncallable ...,2025-09-05T05:29:19Z,closed,area:serialization,jedcunningham
2,2801402406,55286,add stable note to BatchExecutor,eladkal,looks like I lost the stable note in changelog...,2025-09-05T03:57:57Z,closed,provider:amazon;area:providers;kind:documentation,eladkal


In [12]:
# 3
# Inspect the raw frame: schema, missing values, duplicate keys and date range.
print("Basic info:")
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in display() would render a stray "None" after the summary.
df.info()

print("\nMissing values per column:")
display(df.isna().sum())

# Each check is guarded by a column-presence test so the cell still runs
# when one of these columns is absent from the raw file.
if "id" in df.columns:
    print("Duplicate id count:", df["id"].duplicated().sum())
if "number" in df.columns:
    print("Duplicate number count:", df["number"].duplicated().sum())

if "created_at" in df.columns:
    # Raw values as loaded (not yet parsed to datetimes by this notebook).
    print("created_at range (raw):", df["created_at"].min(), "→", df["created_at"].max())


Basic info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   id          100 non-null    int64 
 1   number      100 non-null    int64 
 2   title       100 non-null    object
 3   author      100 non-null    object
 4   body        99 non-null     object
 5   created_at  100 non-null    object
 6   state       100 non-null    object
 7   labels      100 non-null    object
 8   reviewers   100 non-null    object
dtypes: int64(2), object(7)
memory usage: 7.2+ KB


None


Missing values per column:


id            0
number        0
title         0
author        0
body          1
created_at    0
state         0
labels        0
reviewers     0
dtype: int64

Duplicate id count: 1
Duplicate number count: 1
created_at range (raw): 2025-08-28T18:21:39Z → 2025-09-05T07:43:33Z


In [17]:
# 4
# Run the project's cleaning helper on the raw frame, then summarise and preview the result.
# NOTE(review): the effect of each flag is defined in src.data_utils; the comments
# below only restate what the flag names suggest — confirm against that module.
clean_options = dict(
    remove_self_reviews=True,  # presumably drops PRs whose reviewer is the author
    split_lists=True,          # presumably parses semicolon-separated fields into lists
    lowercase_tokens=True,     # presumably lower-cases the parsed tokens
    trim_tokens=True,          # presumably strips surrounding whitespace from tokens
)
df_clean = clean_data(df, **clean_options)

# Summary produced by the project's health-report helper for the cleaned frame.
report = quick_health_report(df_clean)
print("Health report after cleaning:")
display(report)

# Row count and a short preview of the columns used downstream.
preview_cols = ["author", "body_clean", "labels_list", "keyphrases", "reviewers_list"]
print("\nCleaned rows:", len(df_clean))
display(df_clean[preview_cols].head())


Health report after cleaning:


{'rows': 53,
 'cols': 13,
 'duplicate_ids': 0,
 'duplicate_numbers': 0,
 'total_nulls': 0,
 'unparseable_dates': 0,
 'date_min': '2025-08-29 01:17:04+00:00',
 'date_max': '2025-09-05 07:43:33+00:00',
 'avg_labels_per_pr': 1.8113207547169812,
 'avg_reviewers_per_pr': 1.0}


Cleaned rows: 53


Unnamed: 0,author,body_clean,labels_list,keyphrases,reviewers_list
0,dabla,While testing the new providers in Airflow 3.0...,"[area:providers, provider:common-sql]",the new providers | Airflow 3.0.6 | I | an iss...,[eladkal]
7,tirkarthi,closes #55251,[area:ui],,[pierrejeambrun]
8,amoghrajesh,That isn't right ^ --- ^ Add meaningful descri...,"[area:dev-tools, backport-to-v3-0-test]",That | meaningful description | more informati...,[potiuk]
10,sjyangkevin,Close: #55054 Reproduce the error The task fai...,"[area:providers, area:triggerer, provider:stan...",the error | The task | the conflict error | Th...,[lee-w]
14,ferruzzi,Simple banner on the docs page. Screenshot of ...,[kind:documentation],Simple banner | the docs page | the rendered r...,[potiuk]


In [19]:
# 5
# Sanity-check the cleaned frame before anything is written to disk.

# Columns the rest of the notebook relies on. The missing ones are sorted so the
# error message is identical from run to run (set iteration order is not stable).
expected_cols = {"author", "labels_list", "reviewers_list", "created_at", "body_clean"}
missing = sorted(expected_cols - set(df_clean.columns))
if missing:
    raise AssertionError(f"Missing expected columns after cleaning: {missing}")

# The list columns must hold real Python lists in every row, not only the first one.
# Checking the whole column also avoids an IndexError when the cleaned frame is empty
# (an empty column passes trivially).
assert df_clean["labels_list"].map(lambda v: isinstance(v, list)).all(), "labels_list should be a list"
assert df_clean["reviewers_list"].map(lambda v: isinstance(v, list)).all(), "reviewers_list should be a list"

# created_at must be a datetime column rather than raw strings.
# NOTE(review): this check accepts any datetime64 dtype, tz-aware or naive; it does
# not itself enforce the UTC timezone named in the message.
assert pd.api.types.is_datetime64_any_dtype(df_clean["created_at"]), "created_at must be datetime64[ns, UTC]"
print("Sanity checks passed ✅")

Sanity checks passed ✅


In [20]:
# 6
# Earliest and latest `created_at` values, shown as a (min, max) tuple.
# NOTE(review): this reads the raw frame `df`, not `df_clean` — confirm that the
# raw range is the one intended here.
raw_created = df["created_at"]
raw_created.min(), raw_created.max()

('2025-08-28T18:21:39Z', '2025-09-05T07:43:33Z')

In [21]:
# 7
# Persist the cleaned frame in two formats; the index is omitted from both files.

# CSV copy: plain text, easy to open and read.
df_clean.to_csv(CSV_PATH, index=False)

# Parquet copy: typed binary format, kept so dtypes and list columns survive a reload.
df_clean.to_parquet(PARQ_PATH, index=False)

# Report both destinations together with the number of rows written.
n_saved = len(df_clean)
print(f"Saved CSV  → {CSV_PATH}  (rows: {n_saved})")
print(f"Saved Parquet → {PARQ_PATH}  (rows: {n_saved})")

Saved CSV  → ../data/clean/prs_clean.csv  (rows: 53)
Saved Parquet → ../data/clean/prs_clean.parquet  (rows: 53)


In [22]:
# 8
# Build one row per (PR, reviewer) pair and save it as the CSV used by the EDA notebook.
# The path is built with os.path.join, matching how the other output paths in this
# notebook are constructed, instead of hard-coding a "/" separator.
PAIRS_PATH = os.path.join(CLEAN_DIR, "prs_pairs.csv")
pairs = (
    df_clean[["id", "number", "author", "reviewers_list", "created_at"]]
    .explode("reviewers_list")                       # one output row per list element
    .rename(columns={"reviewers_list": "reviewer"})
    .dropna(subset=["reviewer"])                     # an empty list explodes to NaN; drop those rows
    .reset_index(drop=True)                          # explode repeats index labels; renumber
)
pairs.to_csv(PAIRS_PATH, index=False)
print(f"Saved pairs → {PAIRS_PATH} (rows: {len(pairs)}, unique reviewers: {pairs['reviewer'].nunique()})")
display(pairs.head(5))

Saved pairs → ../data/clean/prs_pairs.csv (rows: 53, unique reviewers: 14)


Unnamed: 0,id,number,author,reviewer,created_at
0,2801771679,55290,dabla,eladkal,2025-09-05 07:43:33+00:00
1,2799915276,55269,tirkarthi,pierrejeambrun,2025-09-04 15:35:43+00:00
2,2799094678,55261,amoghrajesh,potiuk,2025-09-04 11:43:18+00:00
3,2797986433,55243,sjyangkevin,lee-w,2025-09-04 04:52:39+00:00
4,2797544900,55235,ferruzzi,potiuk,2025-09-03 23:41:52+00:00


In [23]:
# Closing summary: confirm completion, then report the cleaned row count and column names.
print("Data load & clean complete.")
print(f"Cleaned PRs: {df_clean.shape[0]}")
print("Preview columns:", [*df_clean.columns])


Data load & clean complete.
Cleaned PRs: 53
Preview columns: ['id', 'number', 'title', 'author', 'body', 'created_at', 'state', 'labels', 'reviewers', 'labels_list', 'reviewers_list', 'body_clean', 'keyphrases']
