In [25]:
# Install Snorkel if not already installed
import warnings
warnings.filterwarnings('ignore', category=UserWarning)

# Import Snorkel modules for labeling and denoising
from snorkel.labeling import labeling_function, PandasLFApplier, LabelingFunction
from snorkel.labeling.model import LabelModel
from snorkel.labeling.model.baselines import MajorityLabelVoter

# Jira Project Risk Labeling

This notebook uses Snorkel to label projects based on schedule and quality risk indicators. The notebook includes options to customize the labeling process.

In [26]:
# Configuration Settings

# Whether to use the label model for denoising (True) or just use majority voting (False)
USE_LABEL_MODEL = False  # Set to False to use only majority voting
# Note: Label model only works with 3+ labeling functions

# Define constants for Snorkel labels
ABSTAIN = -1
LOW_RISK = 0
HIGH_RISK = 1

# Threshold settings
QUANTILE_THRESHOLD = 0.7  # 70th percentile for feature thresholds
PROBABILITY_THRESHOLD = 0.5  # Threshold for converting probabilities to binary labels

print(f"Configuration: Use Label Model = {USE_LABEL_MODEL}")
print(f"Quantile Threshold = {QUANTILE_THRESHOLD}, Probability Threshold = {PROBABILITY_THRESHOLD}")

Configuration: Use Label Model = False
Quantile Threshold = 0.7, Probability Threshold = 0.5


In [27]:
import pandas as pd

# Load training data
df_train = pd.read_csv('./apache/train.csv')
print(f"Training data shape: {df_train.shape}")

# Load test data
df_test = pd.read_csv('./apache/test.csv')
print(f"Test data shape: {df_test.shape}")

df_train.head()

Training data shape: (492, 45)
Test data shape: (124, 45)


Unnamed: 0,project_id,project_key,project_name,project_category,total_reopened,average_lifespan,average_workspan,average_change_density,average_reassignment_rate,average_description_edit_rate,...,num_priority_high,num_priority_low,num_priority_medium,num_priority_other,num_issue_type_other,num_issue_type_feature,num_issue_type_improvement,num_issue_type_bug,num_issue_type_task,num_high_priority_bugs
0,12316921,CONTINUUM,Continuum,"Build, Release, & Project Management",135,183.388674,718.095796,344.106902,0.062784,0.004675,...,193,516,1995,0,1,297,639,1533,234,161
1,12310732,JSPWIKI,JSPWiki,Web Frameworks & UI,95,322.902468,1607.497204,72.102385,0.053574,0.024022,...,48,567,457,0,0,74,285,593,120,43
2,12310493,TRANSACTION,Commons Transaction,Libraries & Utilities (Commons & Core),2,242.3761,502.277625,7.01497,0.062955,0.010582,...,4,10,28,0,0,6,10,25,1,4
3,12323721,MWRAPPER,Maven Wrapper,"Build, Release, & Project Management",15,155.086469,248.868168,80.239918,0.049586,0.037538,...,6,28,122,1,0,12,39,80,26,5
4,12310740,HTTPDRAFT,Labs WebArch draft-fielding-http (Retired),Incubator & Retired/Dormant Projects,0,13.550386,38.347902,8.975719,0.112834,0.0,...,0,2,6,174,0,0,70,112,0,0


In [28]:
from prep import preprocess_data

# Preprocess both training and test data
df_train = preprocess_data(df_train)
df_test = preprocess_data(df_test)

In [29]:
# from matplotlib import pyplot as plt
# import seaborn as sns
# from upsetplot import plot as upset_plot
# from upsetplot import from_indicators
import numpy as np

# Define schedule risk indicators (features)
schedule_risk_indicators = [
    # 'average_lifespan',
    # 'average_reassignment_rate', 
    'incomplete_ratio',
]

# Calculate z-scores for each indicator to standardize them - ON TRAIN DATA ONLY
for col in schedule_risk_indicators:
    # Calculate mean and std from training data only
    train_mean = df_train[col].mean()
    train_std = df_train[col].std()
    
    # Apply standardization to both train and test using training statistics
    df_train[f'z_{col}'] = (df_train[col] - train_mean) / train_std
    df_test[f'z_{col}'] = (df_test[col] - train_mean) / train_std

# Use the quantile threshold from configuration
quantile_threshold = QUANTILE_THRESHOLD

from snorkel.labeling import labeling_function

# Dynamically create labeling functions for each schedule risk indicator
def make_high_indicator_lf(indicator, threshold_value):
    @labeling_function(name=f"lf_high_{indicator}")
    def lf(x):
        return HIGH_RISK if x[indicator] > threshold_value else LOW_RISK
    return lf

# Create labeling functions using thresholds from training data only
schedule_lfs = []
for ind in schedule_risk_indicators:
    threshold = df_train[ind].quantile(quantile_threshold)
    schedule_lfs.append(make_high_indicator_lf(ind, threshold))

# Create combined indicator LF using thresholds from training data
z_cols = [f'z_{col}' for col in schedule_risk_indicators]
combined_threshold = df_train[z_cols].mean(axis=1).quantile(quantile_threshold)

@labeling_function(name="lf_combined_indicators")
def lf_combined_indicators(x):
    z_score_mean = np.mean([x[f'z_{col}'] for col in schedule_risk_indicators])
    return HIGH_RISK if z_score_mean > combined_threshold else LOW_RISK

schedule_lfs.append(lf_combined_indicators)

# Apply labeling functions to the training dataframe
lf_applier = PandasLFApplier(schedule_lfs)
L_schedule_train = lf_applier.apply(df_train)

# Print the coverage for each LF on training data
print("Labeling Functions Coverage (Training Data):")
for i, lf in enumerate(schedule_lfs):
    cov = (L_schedule_train[:, i] != ABSTAIN).mean()
    print(f"{lf.name}: {cov:.3f}")

# Initialize majority voter for comparison
mv = MajorityLabelVoter()
df_train['schedule_risk_majority_vote'] = mv.predict(L_schedule_train)

# Conditional label model training based on configuration
if USE_LABEL_MODEL:
    # Train the label model to denoise and combine the labeling functions
    print("\nTraining label model for schedule risk...")
    label_model = LabelModel(cardinality=2, verbose=True)
    label_model.fit(L_schedule_train, n_epochs=500, log_freq=100, seed=42)
    
    # Apply the trained model to get the denoised labels for training data
    schedule_risk_proba_train = label_model.predict_proba(L_schedule_train)
    df_train['schedule_risk_score'] = schedule_risk_proba_train[:, 1]  # Probability of HIGH_RISK
    
    # Use the probability threshold from configuration
    df_train['schedule_risk'] = (df_train['schedule_risk_score'] >= PROBABILITY_THRESHOLD).astype(int)
else:
    # Use majority voting for labels if not using label model
    print("\nUsing majority voting for schedule risk...")
    df_train['schedule_risk'] = df_train['schedule_risk_majority_vote']
    
    # For consistency, create a risk score column (binary 0 or 1)
    df_train['schedule_risk_score'] = df_train['schedule_risk'].astype(float)

# Now apply the same labeling functions to the test data
L_schedule_test = lf_applier.apply(df_test)

# For test data, apply the same approach as training based on configuration
df_test['schedule_risk_majority_vote'] = mv.predict(L_schedule_test)

if USE_LABEL_MODEL:
    # Apply the trained model to test data
    schedule_risk_proba_test = label_model.predict_proba(L_schedule_test)
    df_test['schedule_risk_score'] = schedule_risk_proba_test[:, 1]  # Probability of HIGH_RISK
    df_test['schedule_risk'] = (df_test['schedule_risk_score'] >= PROBABILITY_THRESHOLD).astype(int)
else:
    # Use majority voting for test data too if not using label model
    df_test['schedule_risk'] = df_test['schedule_risk_majority_vote']
    df_test['schedule_risk_score'] = df_test['schedule_risk'].astype(float)

# Display the risk distribution for training data
labeling_method = "Denoised Labels" if USE_LABEL_MODEL else "Majority Vote"
print(f"\nSchedule Risk Distribution ({labeling_method}) - Training Data:")
print(df_train['schedule_risk'].value_counts(normalize=True))

print("\nSchedule Risk Distribution (Majority Vote) - Training Data:")
print(df_train['schedule_risk_majority_vote'].value_counts(normalize=True))

# Display the risk distribution for test data
print(f"\nSchedule Risk Distribution ({labeling_method}) - Test Data:")
print(df_test['schedule_risk'].value_counts(normalize=True))

print("\nSchedule Risk Distribution (Majority Vote) - Test Data:")
print(df_test['schedule_risk_majority_vote'].value_counts(normalize=True))

# Analyze agreement between labeling functions on training data
def lf_summary(L, lfs):
    """Returns a summary of labeling function statistics"""
    d = {
        "j": list(range(len(lfs))),
        "Labeling Function": [lf.name for lf in lfs],
        "Polarity": ["+" if lf.name.startswith('lf_high') else "-" for lf in lfs],
        "Coverage": [(L[:, i] != ABSTAIN).mean() for i in range(L.shape[1])],
        "Overlaps": [
            sum((L[:, i] != ABSTAIN) & (L[:, j] != ABSTAIN)) / len(L) 
            for i, j in zip(range(L.shape[1]), range(-1, L.shape[1]-1))
        ],
        "Conflicts": [
            sum((L[:, i] != ABSTAIN) & (L[:, j] != ABSTAIN) & (L[:, i] != L[:, j])) / len(L) 
            for i, j in zip(range(L.shape[1]), range(-1, L.shape[1]-1))
        ],
    }
    return pd.DataFrame(d)

# Print summary statistics for labeling functions on training data
print("\nLabeling Function Summary (Training Data):")
print(lf_summary(L_schedule_train, schedule_lfs))

# Transform Snorkel matrix to binary indicators for training data
df_lf_flags = pd.DataFrame()
lf_names = [lf.name for lf in schedule_lfs]
for i, name in enumerate(lf_names):
    df_lf_flags[name] = (L_schedule_train[:, i] == HIGH_RISK)

# dump df_fl_flags to a CSV file for further analysis
df_lf_flags.to_csv("./apache/schedule_risk_lf_flags.csv", index=False)

# upset_data = from_indicators(df_lf_flags)

# plt.figure(figsize=(12, 8))
# upset_plot(
#     upset_data,
#     sort_by='cardinality',
#     show_counts=True,
#     subset_size='count'
# )
# plt.suptitle("Overlap of Schedule Risk Labeling Functions", fontsize=14)
# plt.tight_layout()
# plt.show()

100%|██████████| 492/492 [00:00<00:00, 8827.88it/s]


Labeling Functions Coverage (Training Data):
lf_high_incomplete_ratio: 1.000
lf_combined_indicators: 1.000

Using majority voting for schedule risk...


100%|██████████| 124/124 [00:00<00:00, 10439.04it/s]


Schedule Risk Distribution (Majority Vote) - Training Data:
0    0.699187
1    0.300813
Name: schedule_risk, dtype: float64

Schedule Risk Distribution (Majority Vote) - Training Data:
0    0.699187
1    0.300813
Name: schedule_risk_majority_vote, dtype: float64

Schedule Risk Distribution (Majority Vote) - Test Data:
0    0.75
1    0.25
Name: schedule_risk, dtype: float64

Schedule Risk Distribution (Majority Vote) - Test Data:
0    0.75
1    0.25
Name: schedule_risk_majority_vote, dtype: float64

Labeling Function Summary (Training Data):
   j         Labeling Function Polarity  Coverage  Overlaps  Conflicts
0  0  lf_high_incomplete_ratio        +       1.0       1.0        0.0
1  1    lf_combined_indicators        -       1.0       1.0        0.0





In [30]:
# Define quality risk indicators
quality_risk_indicators = [
    # 'num_issue_type_bug_ratio',
    'high_priority_bug_ratio',
    # 'reopen_ratio',
    # 'bug_to_development_ratio',
    # 'inverse_fix_rate',
]

# Check if we have sufficient indicators
if len(quality_risk_indicators) == 0:
    print("No quality risk indicators available in the dataset")
else:
    # Calculate z-scores for each indicator based on training data only
    for col in quality_risk_indicators:
        # Calculate mean and std from training data
        train_mean = df_train[col].mean()
        train_std = df_train[col].std()
        
        # Apply to both train and test using training statistics
        df_train[f'z_{col}'] = (df_train[col] - train_mean) / train_std
        df_test[f'z_{col}'] = (df_test[col] - train_mean) / train_std

    # Use the quantile threshold from configuration
    quantile_threshold = QUANTILE_THRESHOLD

    # Define labeling functions for quality risk using thresholds from training data
    def make_high_indicator_lf(indicator, threshold_value):
        @labeling_function(name=f"lf_high_{indicator}")
        def lf(x):
            return HIGH_RISK if x[indicator] > threshold_value else LOW_RISK
        return lf

    # Create LFs for each indicator using thresholds from training data
    quality_lfs = []
    for col in quality_risk_indicators:
        threshold = df_train[col].quantile(quantile_threshold)
        quality_lfs.append(make_high_indicator_lf(col, threshold))

    # Add a combined z-score LF if there are at least 2 indicators
    if len(quality_risk_indicators) > 1:
        # Calculate threshold from training data
        z_cols = [f'z_{col}' for col in quality_risk_indicators]
        combined_threshold = df_train[z_cols].mean(axis=1).quantile(quantile_threshold)
        
        @labeling_function()
        def lf_combined_quality(x):
            z_score_mean = np.mean([x[f'z_{col}'] for col in quality_risk_indicators])
            return HIGH_RISK if z_score_mean > combined_threshold else LOW_RISK
        
        quality_lfs.append(lf_combined_quality)

    # Apply labeling functions to the training dataframe
    lf_applier = PandasLFApplier(quality_lfs)
    L_quality_train = lf_applier.apply(df_train)

    # Print the coverage for each LF on training data
    print("Labeling Functions Coverage (Training Data):")
    for i, lf in enumerate(quality_lfs):
        cov = (L_quality_train[:, i] != ABSTAIN).mean()
        print(f"{lf.name}: {cov:.3f}")

    # Initialize majority voter for comparison
    mv = MajorityLabelVoter()
    df_train['quality_risk_majority_vote'] = mv.predict(L_quality_train)

    # Conditional label model training based on configuration
    if USE_LABEL_MODEL:
        # Train the label model to denoise and combine the labeling functions
        print("\nTraining label model for quality risk...")
        label_model = LabelModel(cardinality=2, verbose=True)
        label_model.fit(L_quality_train, n_epochs=500, log_freq=100, seed=42)
        
        # Apply the trained model to get the denoised labels for training data
        quality_risk_proba_train = label_model.predict_proba(L_quality_train)
        df_train['quality_risk_score'] = quality_risk_proba_train[:, 1]  # Probability of HIGH_RISK
        
        # Use the probability threshold from configuration
        df_train['quality_risk'] = (df_train['quality_risk_score'] >= PROBABILITY_THRESHOLD).astype(int)
    else:
        # Use majority voting for labels if not using label model
        print("\nUsing majority voting for quality risk...")
        df_train['quality_risk'] = df_train['quality_risk_majority_vote']
        
        # For consistency, create a risk score column (binary 0 or 1)
        df_train['quality_risk_score'] = df_train['quality_risk'].astype(float)

    # Now apply the same labeling functions to the test data
    L_quality_test = lf_applier.apply(df_test)

    # For test data, apply the same approach as training based on configuration
    df_test['quality_risk_majority_vote'] = mv.predict(L_quality_test)

    if USE_LABEL_MODEL:
        # Apply the trained model to test data
        quality_risk_proba_test = label_model.predict_proba(L_quality_test)
        df_test['quality_risk_score'] = quality_risk_proba_test[:, 1]  # Probability of HIGH_RISK
        df_test['quality_risk'] = (df_test['quality_risk_score'] >= PROBABILITY_THRESHOLD).astype(int)
    else:
        # Use majority voting for test data too if not using label model
        df_test['quality_risk'] = df_test['quality_risk_majority_vote']
        df_test['quality_risk_score'] = df_test['quality_risk'].astype(float)

    # Display the risk distribution for training data
    labeling_method = "Denoised Labels" if USE_LABEL_MODEL else "Majority Vote"
    print(f"\nQuality Risk Distribution ({labeling_method}) - Training Data:")
    print(df_train['quality_risk'].value_counts(normalize=True))

    print("\nQuality Risk Distribution (Majority Vote) - Training Data:")
    print(df_train['quality_risk_majority_vote'].value_counts(normalize=True))

    # Display the risk distribution for test data
    print(f"\nQuality Risk Distribution ({labeling_method}) - Test Data:")
    print(df_test['quality_risk'].value_counts(normalize=True))

    print("\nQuality Risk Distribution (Majority Vote) - Test Data:")
    print(df_test['quality_risk_majority_vote'].value_counts(normalize=True))

    # Analyze agreement between labeling functions on training data
    print("\nLabeling Function Summary (Training Data):")
    print(lf_summary(L_quality_train, quality_lfs))

    # Transform Snorkel matrix to binary indicators for training data
    df_lf_flags = pd.DataFrame()
    lf_names = [lf.name for lf in quality_lfs]
    for i, name in enumerate(lf_names):
        df_lf_flags[name] = (L_quality_train[:, i] == HIGH_RISK)

    # dump df_fl_flags to a CSV file for further analysis
    df_lf_flags.to_csv("./apache/quality_risk_lf_flags.csv", index=False)

  0%|          | 0/492 [00:00<?, ?it/s]

100%|██████████| 492/492 [00:00<00:00, 27292.65it/s]


Labeling Functions Coverage (Training Data):
lf_high_high_priority_bug_ratio: 1.000

Using majority voting for quality risk...


100%|██████████| 124/124 [00:00<00:00, 13070.96it/s]



Quality Risk Distribution (Majority Vote) - Training Data:
0    0.699187
1    0.300813
Name: quality_risk, dtype: float64

Quality Risk Distribution (Majority Vote) - Training Data:
0    0.699187
1    0.300813
Name: quality_risk_majority_vote, dtype: float64

Quality Risk Distribution (Majority Vote) - Test Data:
0    0.725806
1    0.274194
Name: quality_risk, dtype: float64

Quality Risk Distribution (Majority Vote) - Test Data:
0    0.725806
1    0.274194
Name: quality_risk_majority_vote, dtype: float64

Labeling Function Summary (Training Data):
   j                Labeling Function Polarity  Coverage  Overlaps  Conflicts
0  0  lf_high_high_priority_bug_ratio        +       1.0       1.0        0.0


In [31]:
# Calculate combined risk for both training and test sets
df_train['combined_risk'] = df_train.apply(lambda x: 
                                f"S{x['schedule_risk']}Q{x['quality_risk']}", 
                                axis=1)

df_test['combined_risk'] = df_test.apply(lambda x: 
                                f"S{x['schedule_risk']}Q{x['quality_risk']}", 
                                axis=1)

risk_mapping = {
    'S0Q0': 'Low Risk', 
    'S0Q1': 'Quality Risk Only', 
    'S1Q0': 'Schedule Risk Only',
    'S1Q1': 'High Risk'
}

df_train['risk_category'] = df_train['combined_risk'].map(risk_mapping)
df_test['risk_category'] = df_test['combined_risk'].map(risk_mapping)

# Display risk distribution for training data
print("Risk Category Distribution (Training Data):")
print(df_train['risk_category'].value_counts())

# Display risk distribution for test data
print("\nRisk Category Distribution (Test Data):")
print(df_test['risk_category'].value_counts())

Risk Category Distribution (Training Data):
Low Risk              229
Schedule Risk Only    115
Quality Risk Only     115
High Risk              33
Name: risk_category, dtype: int64

Risk Category Distribution (Test Data):
Low Risk              65
Quality Risk Only     28
Schedule Risk Only    25
High Risk              6
Name: risk_category, dtype: int64


In [32]:
# Extract risk labels for both training and test datasets
train_risk_labels_df = df_train[[
    'project_id', 
    'schedule_risk', 'schedule_risk_score', 'schedule_risk_majority_vote',
    'quality_risk', 'quality_risk_score', 'quality_risk_majority_vote',
    'combined_risk', 'risk_category',
]]

test_risk_labels_df = df_test[[
    'project_id', 
    'schedule_risk', 'schedule_risk_score', 'schedule_risk_majority_vote',
    'quality_risk', 'quality_risk_score', 'quality_risk_majority_vote',
    'combined_risk', 'risk_category',
]]

# Save to CSV
train_risk_labels_df.to_csv("./apache/train_project_risk_labels.csv", index=False)
test_risk_labels_df.to_csv("./apache/test_project_risk_labels.csv", index=False)
print("Risk labels saved to './apache/train_project_risk_labels.csv' and './apache/test_project_risk_labels.csv'")

# You can also save the combined data if needed
all_risk_labels_df = pd.concat([
    train_risk_labels_df.assign(dataset='train'), 
    test_risk_labels_df.assign(dataset='test')
])
all_risk_labels_df.to_csv("./apache/all_project_risk_labels.csv", index=False)
print("Combined risk labels saved to './apache/all_project_risk_labels.csv'")

Risk labels saved to './apache/train_project_risk_labels.csv' and './apache/test_project_risk_labels.csv'
Combined risk labels saved to './apache/all_project_risk_labels.csv'


# How to Use Different Labeling Methods

To change the labeling method:

1. Go to the "Configuration Settings" cell near the top of this notebook
2. Set `USE_LABEL_MODEL = True` to use the Snorkel Label Model for denoising
3. Set `USE_LABEL_MODEL = False` to use only majority voting
4. Re-run all cells to apply the change

The label model helps to denoise the weak supervision signals when labeling functions may conflict, while majority voting is a simpler approach that works well when labeling functions are reliable.