# **Data Preparation**

## Library Import

In [1]:
import numpy as np
import pandas as pd
import os
import pickle
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import accuracy_score, classification_report
from sklearn.naive_bayes import GaussianNB

## Dataset

In [2]:
# A notebook kernel does not define __file__, so the *quoted* "__file__" below is
# treated as an ordinary relative file name: abspath() resolves it against the
# current working directory and dirname() strips it off again, leaving that
# directory as the anchor for the data paths.
current_dir = os.path.dirname(os.path.abspath("__file__"))

# Both CSV files live in the sibling "data" directory of the anchor directory.
train_path, test_path = (
    os.path.join(current_dir, relative_csv)
    for relative_csv in ("../data/train.csv", "../data/test.csv")
)

# Load the labelled training frame first, then the unlabelled test frame.
train_df, test_df = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))

## Data Preprocessing

In [3]:
def preprocess_data(train_df, test_df=None, test_size=0.3, random_state=42):
    """Turn the raw training (and optional test) frame into model-ready features.

    Steps applied to ``train_df``:

    1. drop ``label`` and the identifier/text columns ``id``, ``FILENAME``,
       ``URL`` and ``Domain``;
    2. apply ``log1p`` to every numeric column;
    3. impute missing values (median for numeric columns, most frequent value
       for non-numeric columns);
    4. standardize the numeric columns and label-encode the non-numeric ones;
    5. make a stratified train / hold-out split;
    6. oversample the training part with SMOTE.

    When ``test_df`` is given it goes through steps 1-4 using the medians,
    modes, scaler and encoders learned from ``train_df``, so both frames are
    transformed with exactly the same parameters. Categories that never
    occurred in ``train_df`` are encoded as ``-1``.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Labelled data; must contain ``label``, ``id``, ``FILENAME``, ``URL``
        and ``Domain``.
    test_df : pandas.DataFrame, optional
        Unlabelled data; must contain ``id``, ``FILENAME``, ``URL``, ``Domain``
        and every feature column of ``train_df``.
    test_size : float, default 0.3
        Fraction of ``train_df`` held out for evaluation.
    random_state : int, default 42
        Seed used for both the split and SMOTE.

    Returns
    -------
    tuple
        ``(X_train_resampled, X_test, y_train_resampled, y_test, X_test_final)``
        where ``X_test_final`` is ``None`` when ``test_df`` is ``None``.

    Raises
    ------
    KeyError
        If a required column is missing from either frame.
    """
    id_columns = ['id', 'FILENAME', 'URL', 'Domain']

    X = train_df.drop(['label'] + id_columns, axis=1)
    y = train_df['label']

    numeric_columns = X.select_dtypes(include=['number']).columns
    categorical_columns = X.select_dtypes(exclude=['number']).columns

    # NOTE(review): the imputation values, scaler and encoders below are fitted
    # on the whole of train_df *before* the hold-out split, so the hold-out rows
    # influence them. Kept as-is so existing hold-out results stay comparable.

    # Remember the fill values so the test frame can reuse them later.
    train_medians = {}
    for col in numeric_columns:
        X[col] = np.log1p(X[col])
        train_medians[col] = X[col].median()
        X[col] = X[col].fillna(train_medians[col])

    train_modes = {}
    for col in categorical_columns:
        train_modes[col] = X[col].mode()[0]
        X[col] = X[col].fillna(train_modes[col])

    scaler = StandardScaler()
    X[numeric_columns] = scaler.fit_transform(X[numeric_columns])

    label_encoders = {}
    for col in categorical_columns:
        le = LabelEncoder()
        X[col] = le.fit_transform(X[col])
        label_encoders[col] = le

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, stratify=y, random_state=random_state
    )

    # Only the training part is oversampled; the hold-out split keeps its
    # original class balance.
    smote = SMOTE(random_state=random_state, k_neighbors=1)
    X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

    if test_df is not None:
        X_test_final = test_df.drop(id_columns, axis=1)
        # Keep exactly the training features, in training order: downstream
        # code feeds ``.values`` to the models, which is purely positional.
        X_test_final = X_test_final[X.columns].copy()

        for col in numeric_columns:
            # Fill with the training median, not a statistic of the test frame.
            X_test_final[col] = np.log1p(X_test_final[col]).fillna(train_medians[col])

        for col in categorical_columns:
            # LabelEncoder codes are the positions inside ``classes_``; a dict
            # lookup yields the same codes without one transform() call per row.
            code_of = {
                category: code
                for code, category in enumerate(label_encoders[col].classes_)
            }
            filled = X_test_final[col].fillna(train_modes[col])
            # Unknown categories map to NaN first, then to the sentinel -1.
            X_test_final[col] = filled.map(code_of).fillna(-1).astype(int)

        X_test_final[numeric_columns] = scaler.transform(X_test_final[numeric_columns])
    else:
        X_test_final = None

    return X_train_resampled, X_test, y_train_resampled, y_test, X_test_final

## Naive Bayes Implementation from Scratch

In [4]:
class NaiveBayes:
    """Naive Bayes classifier based on per-class value-frequency tables.

    Every feature is treated as discrete: for each class and feature the model
    stores the smoothed relative frequency of each distinct value seen during
    ``fit``. Values not seen for a class fall back to ``smoothing``.

    ``predict`` assumes a binary problem whose classes sort as (negative,
    positive) and returns 0/1 integers; ``predict_proba`` works for any number
    of classes.
    """

    def __init__(self, smoothing=1e-3, prior_adjustment=None):
        """Create an unfitted model.

        Parameters
        ----------
        smoothing : float, default 1e-3
            Additive smoothing used in the frequency tables; it is also added
            to every probability before its logarithm is taken and serves as
            the likelihood of a value that was never seen for a class.
        prior_adjustment : dict, optional
            Maps a class label to a factor its prior is multiplied by. The
            adjusted priors are not renormalized.
        """
        self.smoothing = smoothing
        self.classes_ = None
        self.class_probabilities = {}
        self.feature_probabilities = {}
        self.class_counts = {}
        self.prior_adjustment = prior_adjustment

    def fit(self, X, y):
        """Estimate class priors and per-feature value likelihoods.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        y : array-like of shape (n_samples,)

        Returns
        -------
        NaiveBayes
            The fitted model itself.
        """
        X = np.array(X)
        y = np.array(y)

        self.classes_ = np.unique(y)
        n_samples, n_features = X.shape

        # Start from empty tables so a refit never keeps entries that belong
        # to classes of an earlier training set.
        self.class_probabilities = {}
        self.class_counts = {}
        self.feature_probabilities = {cls: [] for cls in self.classes_}

        for cls in self.classes_:
            in_class = (y == cls)
            n_cls = np.sum(in_class)
            self.class_counts[cls] = n_cls

            # Prior: relative class frequency, optionally re-weighted.
            prior = n_cls / n_samples
            if self.prior_adjustment and cls in self.prior_adjustment:
                prior *= self.prior_adjustment[cls]
            self.class_probabilities[cls] = prior

            # Likelihoods: one {value: smoothed frequency} table per feature.
            X_cls = X[in_class]
            for feature_idx in range(n_features):
                unique_vals, counts = np.unique(X_cls[:, feature_idx], return_counts=True)
                denominator = n_cls + self.smoothing * len(unique_vals)
                self.feature_probabilities[cls].append({
                    val: (count + self.smoothing) / denominator
                    for val, count in zip(unique_vals, counts)
                })

        return self

    def predict_proba(self, X):
        """Return the normalized posterior of every class for every sample.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)

        Returns
        -------
        numpy.ndarray of shape (n_samples, n_classes)
            Columns follow the order of ``self.classes_``; each row sums to 1.
        """
        X = np.array(X)
        n_classes = len(self.classes_)
        probabilities = []
        for sample in X:
            log_scores = np.empty(n_classes)
            for class_pos, cls in enumerate(self.classes_):
                tables = self.feature_probabilities[cls]
                score = np.log(self.class_probabilities[cls] + self.smoothing)
                for feature_idx, feature_val in enumerate(sample):
                    feature_prob = tables[feature_idx].get(feature_val, self.smoothing)
                    score += np.log(feature_prob + self.smoothing)
                log_scores[class_pos] = score
            # Shift by the largest log score before exponentiating: with many
            # features the raw scores are so negative that exp() underflows to
            # 0 for every class and the normalization becomes 0/0 = NaN.
            shifted = np.exp(log_scores - np.max(log_scores))
            probabilities.append(shifted / np.sum(shifted))
        # The reshape is a no-op for non-empty input and gives an empty input
        # the regular (0, n_classes) shape.
        return np.array(probabilities).reshape(-1, n_classes)

    def predict(self, X, threshold=0.005):
        """Predict 0/1 labels for a binary problem.

        A sample is labelled 1 only when both conditions hold:

        * the probability in column 1 is at least ``threshold``;
        * the probability in column 0 does not exceed 0.8 times the
          probability in column 1.

        Because each row of ``predict_proba`` sums to 1, the second condition
        alone already requires a column-1 probability of at least 1/1.8
        (about 0.556), so ``threshold`` only matters when set above that value.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        threshold : float, default 0.005

        Returns
        -------
        numpy.ndarray of int, shape (n_samples,)
        """
        probabilities = self.predict_proba(X)
        p_negative = probabilities[:, 0]
        p_positive = probabilities[:, 1]
        is_positive = (p_positive >= threshold) & ~(p_negative > p_positive * 0.8)
        return is_positive.astype(int)

    def save_model(self, filename):
        """Pickle this model to ``filename`` and print a confirmation."""
        with open(filename, 'wb') as file:
            pickle.dump(self, file)
        print(f"Model saved in {filename}.")

    @staticmethod
    def load_model(filename):
        """Unpickle and return a model previously written by ``save_model``."""
        # SECURITY: pickle.load can execute arbitrary code stored in the file;
        # only load model files that come from a trusted source.
        with open(filename, 'rb') as file:
            model = pickle.load(file)
        print(f"Model loaded from {filename}.")
        return model

## Evaluate Model (From-Scratch Implementation)

In [5]:
# Build the oversampled training split, the hold-out split and the submission features
X_train, X_test, y_train, y_test, X_test_final = preprocess_data(train_df, test_df)

# Fit the from-scratch classifier on the whole training split and persist it
nb = NaiveBayes(prior_adjustment={0: 50.0, 1: 1.0})
nb.fit(X_train, y_train)
nb.save_model('naive_bayes_model.pkl')

# 5-fold cross-validation on the training split; every fold gets its own model
X_train_np = np.array(X_train)
y_train_np = np.array(y_train)
kf = KFold(n_splits=5, shuffle=True, random_state=42)

cross_val_scores = []
for train_idx, val_idx in kf.split(X_train_np):
    X_cv_train, y_cv_train = X_train_np[train_idx], y_train_np[train_idx]
    X_cv_val, y_cv_val = X_train_np[val_idx], y_train_np[val_idx]

    nb_cv = NaiveBayes(prior_adjustment={0: 50.0, 1: 1.0})
    nb_cv.fit(X_cv_train, y_cv_train)

    y_cv_pred = nb_cv.predict(X_cv_val)
    fold_accuracy = accuracy_score(y_cv_val, y_cv_pred)
    cross_val_scores.append(fold_accuracy)

cv_mean_pct = np.mean(cross_val_scores) * 100
cv_std_pct = np.std(cross_val_scores) * 100

print("IMPLEMENTATION FROM SCRATCH")
print(f"Cross-Validation Accuracy (Mean): {cv_mean_pct:.2f}%")
print(f"Cross-Validation Accuracy (Standard Deviation): {cv_std_pct:.2f}%")

# Score the model trained on the full training split against the hold-out split
y_pred = nb.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f"\nNaive Bayes kustom classification accuracy: {accuracy * 100:.2f}%\n")
print("Detailed Classification Report (Custom Naive Bayes):")
print(classification_report(y_test, y_pred))

# Predict the unlabelled test set and write the submission file
predictions = nb.predict(X_test_final)
submission_file_path = 'submission-nb-scratch.csv'
submission_df = pd.DataFrame({"id": test_df["id"], "label": predictions})
submission_df.to_csv(submission_file_path, index=False)

print(f"Predictions saved to '{submission_file_path}'.")

KeyboardInterrupt: 

## Naive Bayes Implementation with Scikit-Learn

## Evaluate Model (Scikit-Learn Implementation)

In [32]:
X_train, X_test, y_train, y_test, X_test_final = preprocess_data(train_df, test_df)

# Train the final model on the whole training split
model = GaussianNB()
model.fit(X_train.values, y_train.values)

# Perform cross-validation
# NOTE(review): the folds are drawn from the training split returned by
# preprocess_data, i.e. after oversampling, so validation folds can contain
# synthetic rows - treat the CV accuracy as an optimistic estimate.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
cross_val_scores = []

X_train_np = X_train.values
y_train_np = y_train.values

for train_idx, val_idx in kf.split(X_train_np):
    X_cv_train, X_cv_val = X_train_np[train_idx], X_train_np[val_idx]
    y_cv_train, y_cv_val = y_train_np[train_idx], y_train_np[val_idx]

    # Use a fresh estimator per fold. Refitting `model` here would silently
    # replace the final model with one trained on only the last fold's subset,
    # and that model would then produce the hold-out report and the submission.
    model_cv = GaussianNB()
    model_cv.fit(X_cv_train, y_cv_train)
    y_cv_pred = model_cv.predict(X_cv_val)
    cross_val_scores.append(accuracy_score(y_cv_val, y_cv_pred))

print("IMPLEMENTATION WITH SCIKIT-LEARN")
print(f"Cross-Validation Accuracy (Mean): {np.mean(cross_val_scores) * 100:.2f}%")
print(f"Cross-Validation Accuracy (Standard Deviation): {np.std(cross_val_scores) * 100:.2f}%")

# Evaluate the final model (fitted on the full training split) on the hold-out split
y_pred = model.predict(X_test.values)
accuracy = accuracy_score(y_test, y_pred)

print(f"\nNaive Bayes classification accuracy: {accuracy * 100:.2f}%\n")
print("Detailed Classification Report:")
print(classification_report(y_test, y_pred))

# Save predictions to a CSV file
predictions = model.predict(X_test_final.values)
submission_df = pd.DataFrame({
    "id": test_df["id"],
    "label": predictions
})
submission_file_path = 'submission-nb-scikit-learn.csv'
submission_df.to_csv(submission_file_path, index=False)

print(f"Predictions saved to '{submission_file_path}'.")


IMPLEMENTATION WITH SCIKIT-LEARN
Cross-Validation Accuracy (Mean): 92.62%
Cross-Validation Accuracy (Standard Deviation): 0.11%

Naive Bayes classification accuracy: 98.77%

Detailed Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.87      0.91      3166
           1       0.99      1.00      0.99     38956

    accuracy                           0.99     42122
   macro avg       0.97      0.94      0.95     42122
weighted avg       0.99      0.99      0.99     42122

Predictions saved to 'submission-nb-scikit-learn.csv'.
