In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import GaussianNB
from imblearn.over_sampling import RandomOverSampler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re

In [2]:
# Fetch the NLTK resources this notebook relies on; the call reports
# "already up-to-date" when the package is present locally.
# 'stopwords' backs the stopwords.words(...) call in the preprocessing cell.
# NOTE(review): 'punkt' is not referenced by any code visible in this notebook
# (tokenization uses str.split) - confirm whether it is still needed.
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\yopip\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\yopip\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# Load the labelled training data from CSV. Later cells read the 'narasi'
# column (the text to classify) and the 'label' column (the target).
# The path is relative to the notebook's working directory.
data = pd.read_csv('Data_Training.csv')

In [4]:
# Text pre-processing resources (used by preprocess_text for lowercasing,
# special-character removal, stop-word filtering and stemming).
# NOTE(review): both the stop-word list and the Porter stemmer are
# English-specific - confirm that the 'narasi' column holds English text.

stop_words = set(stopwords.words('english'))  # set for O(1) membership tests
ps = PorterStemmer()

def preprocess_text(text):
    """Normalize one document for vectorization.

    Steps: lowercase, replace every non-alphanumeric ASCII character with a
    space, split on whitespace, drop tokens found in ``stop_words`` and stem
    the remaining tokens with ``ps``.

    Args:
        text: The raw document. Missing values (``None``, ``NaN``, ``pd.NA``)
            are treated as an empty document; any other non-string scalar is
            converted with ``str()``.

    Returns:
        str: The stemmed tokens joined by single spaces ('' if nothing is left).
    """
    if not isinstance(text, str):
        # Series.apply hands missing cells over as float NaN, which has no
        # .lower(); map them to an empty document instead of crashing.
        text = '' if pd.isna(text) else str(text)

    text = text.lower()
    # After lowercasing only a-z and 0-9 survive; everything else becomes a
    # token separator.
    text = re.sub('[^a-zA-Z0-9]', ' ', text)
    return ' '.join(ps.stem(word) for word in text.split() if word not in stop_words)

# Clean every document element-wise. The column is overwritten in place, so
# re-running this cell pre-processes already processed text again.
data['narasi'] = data['narasi'].map(preprocess_text)

In [5]:
# Separate the data into the feature (X: pre-processed text) and the label (y).
X, y = data['narasi'], data['label']

In [6]:
# Oversampling menggunakan RandomOverSampler karena adanya ketidakseimbangan kelas
oversampler = RandomOverSampler(random_state=42)
X_resampled, y_resampled = oversampler.fit_resample(X.values.reshape(-1, 1), y)

In [7]:
# Bagi data menjadi data latih dan data uji
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

In [8]:
# Convert the text into TF-IDF features over unigrams and bigrams, capped at
# 5000 terms. The vocabulary is learned from the training documents only; the
# test documents are merely transformed with it.
vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=5000)
X_train_vectorized = vectorizer.fit_transform(X_train.ravel())
X_test_vectorized = vectorizer.transform(X_test.ravel())

In [9]:
# Naive Bayes model, trained on the TF-IDF training features.
# .toarray() converts the vectorizer output to a dense array before fitting.
# NOTE(review): presumably needed because GaussianNB does not accept sparse
# input - confirm; the dense copy can be large for big corpora.
naive_bayes_model = GaussianNB()
naive_bayes_model.fit(X_train_vectorized.toarray(), y_train)

In [10]:
# Random Forest hyperparameter search space, consumed by GridSearchCV below.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

In [11]:
# Exhaustively evaluate every combination in param_grid with 3-fold
# cross-validation, using all available cores (n_jobs=-1).
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
)
# fit() returns the fitted search object; keep the best model it found.
random_forest_model = grid_search.fit(X_train_vectorized, y_train).best_estimator_

In [12]:
# Naive Bayes: predict on the held-out set (dense input, as during fitting)
# and score the predictions.
y_pred_naive_bayes = naive_bayes_model.predict(X_test_vectorized.toarray())
accuracy_naive_bayes = accuracy_score(y_test, y_pred_naive_bayes)

# Random Forest: predict on the held-out set and score the predictions.
y_pred_random_forest = random_forest_model.predict(X_test_vectorized)
accuracy_random_forest = accuracy_score(y_test, y_pred_random_forest)

# Report per-class metrics and overall accuracy, Naive Bayes first.
print('\n', classification_report(y_test, y_pred_naive_bayes))
print(f'Accuracy Naive Bayes: {accuracy_naive_bayes * 100:.2f}%\n')

print('\n', classification_report(y_test, y_pred_random_forest))
print(f'Accuracy Random Forest: {accuracy_random_forest * 100:.2f}%')


               precision    recall  f1-score   support

           0       0.96      0.97      0.97       553
           1       0.97      0.96      0.97       538

    accuracy                           0.97      1091
   macro avg       0.97      0.97      0.97      1091
weighted avg       0.97      0.97      0.97      1091

Accuracy Naive Bayes: 96.88%


               precision    recall  f1-score   support

           0       1.00      0.98      0.99       553
           1       0.98      1.00      0.99       538

    accuracy                           0.99      1091
   macro avg       0.99      0.99      0.99      1091
weighted avg       0.99      0.99      0.99      1091

Accuracy Random Forest: 98.99%


### Model Testing and Manual Testing

In [13]:
from sklearn import metrics

def output_lable(n):
    """Translate a numeric class label into its display name.

    Args:
        n: Predicted class label.

    Returns:
        "HOAX" when ``n == 0``, "REAL" when ``n == 1``, otherwise ``None``.
        NOTE(review): assumes the dataset encodes hoaxes as 0 - confirm
        against the 'label' column.
    """
    for code, display_name in ((0, "HOAX"), (1, "REAL")):
        if n == code:
            return display_name
    return None

def data_test(news):
    """Classify a single news text with both trained models and print the result.

    The text goes through the same ``preprocess_text`` cleaning and the same
    fitted ``vectorizer`` as the training data, then is scored by
    ``naive_bayes_model`` and ``random_forest_model``. The previously computed
    test-set accuracies are printed for context.

    Args:
        news (str): Raw news text to classify.

    Returns:
        None. All results are printed.
    """
    # Preprocess the input text exactly like the training corpus.
    preprocessed_news = preprocess_text(news)

    # The vectorizer expects an iterable of documents, hence the one-element
    # list; the features are made dense for the models.
    new_xv_test = vectorizer.transform([preprocessed_news]).toarray()

    # Predictions for THIS input from each model.
    pred_naive_bayes = naive_bayes_model.predict(new_xv_test)
    pred_random_forest = random_forest_model.predict(new_xv_test)

    print(' ')

    print(f'Accuracy Random Forest: {accuracy_random_forest * 100:.2f}%')
    print(f'Accuracy Naive Bayes: {accuracy_naive_bayes * 100:.2f}%\n')

    # Report the predictions made for the given text (not the first row of the
    # held-out test set), each under the model that produced it.
    print("\n Random Forest Classifier Prediction: {} \n Naive Bayes Classifier Prediction: {}".format(
        output_lable(pred_random_forest[0]),
        output_lable(pred_naive_bayes[0]),
    ))


In [14]:
# Prompt for a news text (input() already returns a str) and classify it.
news_input = input('Enter a news text for testing: ')
data_test(news_input)

 
Accuracy Random Forest: 98.99%
Accuracy Naive Bayes: 96.88%


 Random Forest Classifier Prediction: REAL 
 Naive Bayes Classifier Prediction: REAL
