In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_score, classification_report

# Read the dataset
df = pd.read_csv("dataset.csv")

# Drop rows with missing values
df.dropna(subset=['review__text', 'preliminary_decision'], inplace=True)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df['review__text'], df['preliminary_decision'], test_size=0.2, random_state=42)

# Define the pipeline
nb_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),  # Use TF-IDF for feature extraction
    ('nb', MultinomialNB())  # Naive Bayes classifier
])

# Define the parameter grid
param_grid = {
    'tfidf__ngram_range': [(1, 1), (1, 2)],  # Try different n-gram ranges
    'nb__alpha': [0.1, 1, 10]  # Smoothing parameter for Naive Bayes
}

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=nb_pipeline, param_grid=param_grid, cv=5, n_jobs=-1)

# Fit GridSearchCV to the training data
grid_search.fit(X_train, y_train)

# Get the best parameters and best accuracy
best_params = grid_search.best_params_
best_accuracy = grid_search.best_score_

print("Best Accuracy:", best_accuracy)
print("Best Parameters:", best_params)

# Evaluate the model on the test set
y_pred = grid_search.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print("Test Accuracy:", test_accuracy)

# Calculate precision
precision = precision_score(y_test, y_pred, average='weighted')
print("Precision:", precision)

# Print classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Print some predictions
print("Some predictions:")
for i in range(5):
    print("Review:", X_test.iloc[i])
    print("True Label:", y_test.iloc[i])
    print("Predicted Label:", y_pred[i])
    print()




Best Accuracy: 0.6794871794871795
Best Parameters: {'nb__alpha': 0.1, 'tfidf__ngram_range': (1, 2)}
Test Accuracy: 0.6666666666666666
Precision: 0.4444444444444444
Classification Report:
                 precision    recall  f1-score   support

         accept       0.67      1.00      0.80        22
probably reject       0.00      0.00      0.00         1
         reject       0.00      0.00      0.00        10

       accuracy                           0.67        33
      macro avg       0.22      0.33      0.27        33
   weighted avg       0.44      0.67      0.53        33

Some predictions:
Review: The article presents the development of a mobile application, it does not include any element of research associated with said development. It simply focuses on the application of the RUP process for development. The texts or articles used for definitions or assertions made, for example, about RUP are not referenced "....it is focused on "diagrams of use cases, and risk management a

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
