Import thư viện và xử lý dữ liệu

In [21]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Load the test-case dataset from the Excel workbook.
df = pd.read_excel("../data/dataset_testcases.xlsx")

# Text columns that are merged into a single document per row.
text_columns = ['Website', 'Page', 'Component', 'Action', 'InputData', 'Expectation', 'ActualResult']

# Fill missing cells with '' first (so they do not become the string 'nan'),
# then cast every cell to str: ' '.join raises TypeError on non-string values,
# e.g. numbers that pandas parses from Excel cells.
df['text'] = df[text_columns].fillna('').astype(str).agg(' '.join, axis=1)

# Features are the merged text; the target is the IsBug column.
X = df['text']
y = df['IsBug']

# Hold out 20% of the rows for testing, stratified on the label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

Tạo pipeline

In [22]:
# Step 1: turn each text document into token-count features.
vectorizer = CountVectorizer()

# Step 2: random-forest classifier with a fixed seed for repeatable results.
forest = RandomForestClassifier(random_state=42)

# Chain both steps; the step names are the prefixes used when tuning parameters.
pipeline = Pipeline(steps=[
    ('vectorize', vectorizer),
    ('clf', forest),
])

Thiết lập GridSearchCV

In [23]:
# Candidate hyper-parameters for the 'clf' pipeline step (the double underscore
# routes each setting to that step): 2 * 3 * 4 * 2 = 48 combinations.
param_grid = dict(
    clf__n_estimators=[100, 200],
    clf__max_depth=[None, 20, 50],
    clf__class_weight=[None, 'balanced', {0: 1, 1: 2}, {0: 1, 1: 3}],
    clf__min_samples_split=[2, 5],
)

# Exhaustive search with 5-fold cross-validation, ranked by F1 score;
# n_jobs=-1 runs the fits in parallel.
grid = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='f1',
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid.fit(X_train, y_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


Dự đoán và đánh giá trên tập test

In [24]:
# Predict labels for the held-out test set with the fitted grid search.
y_pred = grid.predict(X_test)

# Compute the evaluation figures up front so the printing below stays simple.
best_params = grid.best_params_
test_accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, digits=4)

print('Best parameters found:')
print(best_params)

print('\nAccuracy on test set:', test_accuracy)
print('\nClassification report:')
print(report)

Best parameters found:
{'clf__class_weight': {0: 1, 1: 3}, 'clf__max_depth': None, 'clf__min_samples_split': 2, 'clf__n_estimators': 100}

Accuracy on test set: 0.94875

Classification report:
              precision    recall  f1-score   support

           0     0.9911    0.9231    0.9559       481
           1     0.8949    0.9875    0.9389       319

    accuracy                         0.9487       800
   macro avg     0.9430    0.9553    0.9474       800
weighted avg     0.9527    0.9487    0.9491       800



Lưu mô hình

In [25]:
import os

import joblib

# Destination of the serialized pipeline (vectorizer + classifier).
MODEL_PATH = '../models/bug_prediction_model.pkl'

# Create the target directory if needed: joblib.dump does not create missing
# parent directories and would fail with FileNotFoundError on a fresh checkout.
os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)

# Persist only the best pipeline found by the grid search, not the whole search object.
joblib.dump(grid.best_estimator_, MODEL_PATH)
print("Model đã được lưu vào 'models/bug_prediction_model.pkl'")

Model đã được lưu vào 'models/bug_prediction_model.pkl'
