# Importing Libraries


In [None]:
import pandas as pd
from numpy import mean
from sklearn import set_config
from google.colab import drive
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.base import TransformerMixin, BaseEstimator
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import OneHotEncoder, KBinsDiscretizer, Binarizer
from sklearn.model_selection import cross_validate, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import precision_score, recall_score, f1_score, matthews_corrcoef, r2_score
set_config(display="diagram")

# Read and Prepare Data

In [None]:
# Mount Google Drive inside the Colab runtime; the CSV loaded below is read from under this mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the full test life-time log from the mounted Drive folder.
life_time_df = pd.read_csv('/content/drive/MyDrive/Thesis/Life_time_full_log.csv')

# Replace "Name <email>" values with just the name.
# str.extract yields NaN whenever the pattern does not match, so fall back to the
# original value instead of silently erasing authors that carry no "<email>" part.
author_raw = life_time_df['Author']
author_name = author_raw.str.extract(r'(.+?)\s+<.+?>', expand=False)
life_time_df['Author'] = author_name.fillna(author_raw)

# Quick 15: Tests Fixed within 15 Days

In [None]:
# Quick 15 labelling: label = 1 when the test was fixed within 15 days, else 0.
# A Life_Time of -1 (or NaN) marks tests that never got fixed; both fail the
# comparisons below and therefore stay in the negative class.
# The mask is evaluated column-wise and assigned in a single step, which avoids the
# chained assignment (`df['label'].iloc[i] = 1`) that raises SettingWithCopyWarning
# and is not guaranteed to write back into the DataFrame.
# NOTE: the "Quick 10" cell further down assigns the same 'label' column again.
life_time = life_time_df['Life_Time']
life_time_df['label'] = ((life_time > -1) & (life_time <= 15)).astype(int)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  life_time_df['label'].iloc[i] = 1


# Quick 10: Tests Fixed within 10 Days

In [None]:
# Quick 10 labelling: label = 1 when -1 < Life_Time <= 10, else 0 (NaN compares
# False and therefore also ends up as 0).
# The mask is evaluated column-wise and assigned in a single step, which avoids the
# chained assignment (`df['label'].iloc[i] = 1`) that raises SettingWithCopyWarning
# and is not guaranteed to write back into the DataFrame.
life_time = life_time_df['Life_Time']
life_time_df['label'] = ((life_time > -1) & (life_time <= 10)).astype(int)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  life_time_df['label'].iloc[i] = 1


In [None]:
# Positional 80/20 hold-out split: the first 80% of the rows train, the rest test.
n_rows = len(life_time_df)
print(0.8 * n_rows)

test_start_index = round(0.8 * n_rows)
dataTrain, dataTest = (
    life_time_df.iloc[:test_start_index],
    life_time_df.iloc[test_start_index:],
)

dataTrain.head(3)

35397.6


Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Commit_hash,Author,Date,Description,Bug_Id,Change_Id,Reviewed-on,Test,Expectation,Build_type,Action,Life_Time,label
0,82961,82961,77578ccb4082ae20a9326d9e673225f1189ebb63\n,Kent Tamura,2018-11-25,\n The Great Blink mv for LayoutTests\n ...,843412\n,Ibb588b93d1579bcd1cb68df0a50efd8653f8724f\n,https://chromium-review.googlesource.com/c/132...,virtual/site-isolated-code-cache/http/tests/d...,failure pass,Mac10.13,add,82,0
1,82960,82960,77578ccb4082ae20a9326d9e673225f1189ebb63\n,Kent Tamura,2018-11-25,\n The Great Blink mv for LayoutTests\n ...,843412\n,Ibb588b93d1579bcd1cb68df0a50efd8653f8724f\n,https://chromium-review.googlesource.com/c/132...,virtual/not-site-per-process/http/tests/devto...,failure pass,Mac10.13,add,66,0
2,82959,82959,77578ccb4082ae20a9326d9e673225f1189ebb63\n,Kent Tamura,2018-11-25,\n The Great Blink mv for LayoutTests\n ...,843412\n,Ibb588b93d1579bcd1cb68df0a50efd8653f8724f\n,https://chromium-review.googlesource.com/c/132...,virtual/mouseevent_fractional/fast/events/midd...,skip,,add,432,0


In [None]:
import warnings

# Filter out the specific UserWarning
warnings.filterwarnings("ignore", category=UserWarning, module="sklearn.utils.parallel")
warnings.filterwarnings("ignore", category=FutureWarning, module="sklearn.utils.deprecation")

In [None]:
# Class balance: number of negatives, number of positives, and the positive share.
label_counts = {cls: int((life_time_df['label'] == cls).sum()) for cls in (0, 1)}
print(label_counts[0])
print(label_counts[1])
print(label_counts[1] / len(life_time_df))

26130
18117
0.40945148823649063


In [None]:
# Columns definition
description_col = "Description"
expectation_col = "Expectation"
other_cols = "Expectation"
cat_cols = ["Author"] #["Author", "Build_type"]

# Only the bag-of-words over the description is active; the one-hot and
# expectation transformers are kept as disabled alternatives.
description_vectorizer = CountVectorizer(max_features=400)
cols_trans = ColumnTransformer(
    transformers=[
        #('categories', OneHotEncoder(handle_unknown = "ignore"), cat_cols),
        ('Description', description_vectorizer, description_col),
        #('Expectation', CountVectorizer(max_features=10), expectation_col),
    ],
    remainder='passthrough',
)

# Features are the single description column; the target is the binary label.
X_train, X_test = dataTrain[[description_col]], dataTest[[description_col]]
y_train, y_test = dataTrain["label"], dataTest["label"]

In [None]:
# Pipeline
# Fixed seed: SMOTE's synthetic samples and the forest's bootstraps are random, so
# without it every run of the notebook reports different scores.
RANDOM_STATE = 42

# Oversample the minority class up to a 0.8 minority/majority ratio.
smote = SMOTE(sampling_strategy=0.8, random_state=RANDOM_STATE)
# Optional chi2 feature selection; only used when the 'fs' step below is enabled.
featureSelection = SelectKBest(chi2, k=60)
rfc = BalancedRandomForestClassifier(
    n_estimators=400,
    n_jobs=14,
    verbose=1,
    random_state=RANDOM_STATE,
)

steps = [
    ('trans', cols_trans),
     #('fs', featureSelection),
     ('s', smote),
    ('m', rfc)
]
pipe = Pipeline(steps=steps)

In [None]:
# Train on the hold-out training split and predict the test split
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)

# Evaluate every metric on the same (y_test, y_pred) pair
precision, recall, mcc, f1, r2 = (
    metric(y_test, y_pred)
    for metric in (precision_score, recall_score, matthews_corrcoef, f1_score, r2_score)
)

[Parallel(n_jobs=14)]: Using backend ThreadingBackend with 14 concurrent workers.
[Parallel(n_jobs=14)]: Done  22 tasks      | elapsed:    4.0s
[Parallel(n_jobs=14)]: Done 172 tasks      | elapsed:   25.0s
[Parallel(n_jobs=14)]: Done 400 out of 400 | elapsed:   55.8s finished
[Parallel(n_jobs=14)]: Using backend ThreadingBackend with 14 concurrent workers.
[Parallel(n_jobs=14)]: Done  22 tasks      | elapsed:    0.0s
[Parallel(n_jobs=14)]: Done 172 tasks      | elapsed:    0.2s
[Parallel(n_jobs=14)]: Done 400 out of 400 | elapsed:    0.4s finished


In [None]:
# Report the hold-out scores, one metric per line
for metric_name, metric_value in (
    ("\nPrecision", precision),
    ("Recall", recall),
    ("MCC", mcc),
    ("F1", f1),
):
    print(metric_name, metric_value)


Precision 0.6197632058287796
Recall 0.40493900624814044
MCC 0.28403660312718826
F1 0.4898326435126867


Changing oversampling rate to 0.7 and 0.9 decreased the performance.

Unless stated otherwise, n_estimators = 200 and max_features = 200.

# Result for 2022 Data  with -1 values max feature = 400 and n_estimators = 400 with ture seld oversampling = 0.8

Precision 73.6%

Recall 57.4%

MCC 0.397

F1 64.5 %


# Result for 2022 Data  with -1 values max feature = 200
Precision 0.687

Recall 0.58

MCC 0.346

F1 0.63

# Result for 2022 Data  with -1 values max feature = 400

Precision 68.8 %

Recall 59.3 %

MCC 0.35

F1 63.7 %


# Result for 2022 Data  with -1 values

Precision 0.67

Recall 0.62

MCC 0.34

F1 0.64

# Result for 2022 Data  with -1 values + feature selection + oversampling = 0.8

Precision 0.67

Recall 0.60

MCC 0.34

F1 0.64

# Result for 2022 Data without -1 values

Precision 0.5833333333333334

Recall 0.5052567731500203

MCC -0.12908473200139

F1 0.5414951245937162

R2 -1.3703152240330136

# Result for 2022 Data without -1 values with oversampling = 0.8

Precision 0.6260652670962378

Recall 0.608976951071573

MCC -0.03456392484930855

F1 0.6174028902326535

R2 -1.0908286227115407

# Result for 2022 Data with -1 values with oversampling = 0.8

Precision 0.6419058893163658

Recall 0.5115566037735849

MCC 0.2560616622141068

F1 0.5693660585378658

R2 -0.4856888389001017

# Result for 2022 Data with -1 values with Expectation and Author features

Precision 0.6342333163525217

Recall 0.5872641509433962

MCC 0.2772251964195138

F1 0.6098457016899339

R2 -0.44267133213524046

# Result for 2022 Data with -1 values with Expectation and Author features + feature selection


Precision 0.6644245142002989

Recall 0.6290094339622642

MCC 0.33791977218438674

F1 0.6462321298764235

R2 -0.32222231319362904

# Result for 2022 Data with -1 values with Expectation and Author features + feature selection + oversampling

Precision 0.6743831086237598

Recall 0.6252358490566038

MCC 0.34939624005401393

F1 0.6488801860237424

R2 -0.2991287042988089

In [None]:
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt

# Confusion matrix of the current predictions against the true labels
cm = confusion_matrix(y_test, y_pred)
class_names = ["Non-Flaky", "Flaky"]
tick_marks = [0, 1]

# Draw the matrix as a heat map on an explicit Axes object
fig, ax = plt.subplots()
heatmap = ax.imshow(cm, interpolation='nearest', cmap='Blues')
fig.colorbar(heatmap, ax=ax)
ax.set_xticks(tick_marks)
ax.set_xticklabels(class_names, rotation=45)
ax.set_yticks(tick_marks)
ax.set_yticklabels(class_names)
ax.set_xlabel('Predicted label')
ax.set_ylabel('True label')
plt.show()

# 5-fold Cross Validation

In [None]:
from sklearn.model_selection import KFold
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, matthews_corrcoef

# Number of folds; this is what the splitter below actually uses.
k_folds = 5

# Per-fold metric values, keyed by metric name
results = {'accuracy': [], 'mcc': [], 'precision': [], 'recall': []}

# StratifiedKFold yields (train indices, held-out indices) pairs that keep the
# class proportions of y_train in every fold.
strtfdKFold = StratifiedKFold(n_splits=k_folds)
kfold = strtfdKFold.split(X_train, y_train)

for k, (train, test) in enumerate(kfold):
  # Split the training data into this fold's fitting part and held-out part
  X_fold_train, y_fold_train = X_train.iloc[train, :], y_train.iloc[train]
  X_fold_val, y_fold_val = X_train.iloc[test, :], y_train.iloc[test]

  # Fit on the fitting part and score on the held-out part of the SAME fold.
  # Scoring every fold on the external X_test/y_test would not be cross validation:
  # the held-out indices would never be used.
  pipe.fit(X_fold_train, y_fold_train)
  y_fold_pred = pipe.predict(X_fold_val)

  # Append this fold's metrics to the results dictionary
  results['accuracy'].append(accuracy_score(y_fold_val, y_fold_pred))
  results['mcc'].append(matthews_corrcoef(y_fold_val, y_fold_pred))
  results['precision'].append(precision_score(y_fold_val, y_fold_pred))
  results['recall'].append(recall_score(y_fold_val, y_fold_pred))

In [None]:
# Collapse the per-fold metric lists into their mean and standard deviation
average_results = {name: np.mean(values) for name, values in results.items()}
std_dev_results = {name: np.std(values) for name, values in results.items()}

# One "mean +/- std" line per metric
print('\n\nCross-Validation Results:')
for metric in average_results:
  print(f"{metric.capitalize()}: {average_results[metric]:.3f} +/- {std_dev_results[metric]:.3f}")

# Window Training: train on 6 months, test on next month


In [None]:
# Sliding-window evaluation: train on 6 months, test on the following month,
# then slide the whole window forward by one month.

# Work on a date-typed, chronologically sorted copy so that life_time_df itself
# is not modified by this cell.
df = life_time_df.assign(Date=pd.to_datetime(life_time_df['Date'])).sort_values(by='Date')

# Size of the training window (6 months), the testing window (1 month) and the slide step
train_window_size = pd.DateOffset(months=6)
test_window_size = pd.DateOffset(months=1)
step_size = pd.DateOffset(months=1)

# First window starts at the earliest record; max() ignores missing dates
train_start_date = df['Date'].iloc[0]
last_date = df['Date'].max()

precision_results = []
recall_results = []
mcc_results = []
f1_results = []

# Keep sliding while the test window still starts on or before the last record.
while train_start_date + train_window_size <= last_date:
    # Boundaries of the current training and testing windows
    train_end_date = train_start_date + train_window_size
    test_start_date = train_end_date
    test_end_date = test_start_date + test_window_size

    # Filter the data for the training and testing window (end dates are exclusive)
    train_data = df[(df['Date'] >= train_start_date) & (df['Date'] < test_start_date)]
    test_data = df[(df['Date'] >= test_start_date) & (df['Date'] < test_end_date)]

    # Advance now so that every `continue` below still moves on to the next window
    train_start_date = train_start_date + step_size

    # NOTE: these names rebind the X/y variables of the earlier hold-out split
    X_train = train_data[[description_col]]
    X_test = test_data[[description_col]]

    y_train = train_data["label"]
    y_test = test_data["label"]

    # A gap in the data only skips this window; it no longer ends the whole evaluation
    if len(train_data) == 0 or len(test_data) == 0:
        print("Training or testing data is empty. Skipping this iteration.")
        continue

    # The oversampler and the classifier need both classes in the training window
    if y_train.nunique() < 2:
        print("Training data contains a single class. Skipping this iteration.")
        continue

    # Fit on the training window and predict the testing window
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)

    # Scores for this window
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    mcc = matthews_corrcoef(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    precision_results.append(precision)
    recall_results.append(recall)
    mcc_results.append(mcc)
    f1_results.append(f1)


In [None]:
# Mean of each metric over all evaluated windows
for label, values in (
    ("Average Precision: ", precision_results),
    ("Average Recall: ", recall_results),
    ("Average MCC: ", mcc_results),
    ("Average F1: ", f1_results),
):
    print(label, sum(values) / len(values))

In [None]:
# Worst and best precision over the evaluated windows
for extreme in (min, max):
    print(extreme(precision_results))

In [None]:
# Worst and best MCC over the evaluated windows
for extreme in (min, max):
    print(extreme(mcc_results))

# window training: Train on 3 months, test on 1 month

In [None]:
# Sliding-window evaluation: train on 3 months, test on the following month,
# then slide the whole window forward by one month.

# Work on a date-typed, chronologically sorted copy so that life_time_df itself
# is not modified by this cell.
df = life_time_df.assign(Date=pd.to_datetime(life_time_df['Date'])).sort_values(by='Date')

# Size of the training window (3 months), the testing window (1 month) and the slide step.
# The training window is 3 months to match this section's title (it was set to 2).
train_window_size = pd.DateOffset(months=3)
test_window_size = pd.DateOffset(months=1)
step_size = pd.DateOffset(months=1)

# First window starts at the earliest record; max() ignores missing dates
train_start_date = df['Date'].iloc[0]
last_date = df['Date'].max()

precision_results = []
recall_results = []
mcc_results = []
f1_results = []

# Keep sliding while the test window still starts on or before the last record.
while train_start_date + train_window_size <= last_date:
    # Boundaries of the current training and testing windows
    train_end_date = train_start_date + train_window_size
    test_start_date = train_end_date
    test_end_date = test_start_date + test_window_size

    # Filter the data for the training and testing window (end dates are exclusive)
    train_data = df[(df['Date'] >= train_start_date) & (df['Date'] < test_start_date)]
    test_data = df[(df['Date'] >= test_start_date) & (df['Date'] < test_end_date)]

    # Advance now so that every `continue` below still moves on to the next window
    train_start_date = train_start_date + step_size

    # NOTE: these names rebind the X/y variables of the earlier hold-out split
    X_train = train_data[[description_col]]
    X_test = test_data[[description_col]]

    y_train = train_data["label"]
    y_test = test_data["label"]

    # A gap in the data only skips this window; it no longer ends the whole evaluation
    if len(train_data) == 0 or len(test_data) == 0:
        print("Training or testing data is empty. Skipping this iteration.")
        continue

    # The oversampler and the classifier need both classes in the training window
    if y_train.nunique() < 2:
        print("Training data contains a single class. Skipping this iteration.")
        continue

    # Fit on the training window and predict the testing window
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)

    # Scores for this window
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    mcc = matthews_corrcoef(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    precision_results.append(precision)
    recall_results.append(recall)
    mcc_results.append(mcc)
    f1_results.append(f1)


In [None]:
# Mean of each metric over all evaluated windows
for label, values in (
    ("Average Precision: ", precision_results),
    ("Average Recall: ", recall_results),
    ("Average MCC: ", mcc_results),
    ("Average F1: ", f1_results),
):
    print(label, sum(values) / len(values))

In [None]:
# Worst and best precision over the evaluated windows
for extreme in (min, max):
    print(extreme(precision_results))