In [2]:
!pip install scikit-learn==1.2.1



In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score
import joblib

# Configuration: the values a reader is most likely to tune.
DATA_PATH = "malicious_phish.csv"
MODEL_PATH = "nb_model.pkl"
VECTORIZER_PATH = "tfidf_vectorizer.pkl"
TEST_SIZE = 0.2
RANDOM_STATE = 42

# Load the dataset
data = pd.read_csv(DATA_PATH)

# TfidfVectorizer rejects NaN documents, so rows lacking a URL or a label are
# removed up front. The raw frame is kept untouched in `data`.
data_clean = data.dropna(subset=["url", "type"])
n_dropped = len(data) - len(data_clean)
if n_dropped:
    print(f"Dropped {n_dropped} row(s) with a missing url or type")

# Feature extraction: the raw URL string is the only input, `type` is the label.
# astype(str) guards against non-string cells reaching the text vectorizer.
X = data_clean["url"].astype(str)
y = data_clean["type"]

# Splitting the dataset into training and testing sets (seeded for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
)

# Convert URLs into features using TF-IDF.
# The vocabulary is fitted on the training split only; the test split is merely
# transformed, so no information from the held-out URLs leaks into the features.
vectorizer = TfidfVectorizer()
X_train_vect = vectorizer.fit_transform(X_train)
X_test_vect = vectorizer.transform(X_test)

# Train Multinomial Naive Bayes classifier
nb_model = MultinomialNB()
nb_model.fit(X_train_vect, y_train)

# Evaluate Multinomial Naive Bayes model on the held-out split
nb_y_pred = nb_model.predict(X_test_vect)
print("Multinomial Naive Bayes Accuracy:", accuracy_score(y_test, nb_y_pred))
print(classification_report(y_test, nb_y_pred))

# Saving the model. The fitted vectorizer is persisted as well because new URLs
# must be transformed with the same vocabulary before calling predict().
joblib.dump(nb_model, MODEL_PATH)
joblib.dump(vectorizer, VECTORIZER_PATH)

# Predicting new URLs: transform with the already-fitted vectorizer (never re-fit here)
new_urls = ["http://example.com", "http://malicious-site.com"]
new_urls_vect = vectorizer.transform(new_urls)
nb_predictions = nb_model.predict(new_urls_vect)
print("Multinomial Naive Bayes Predictions:", nb_predictions)


Multinomial Naive Bayes Accuracy: 0.8886124739901259
              precision    recall  f1-score   support

      benign       0.86      0.99      0.92     85778
  defacement       0.96      0.95      0.95     19104
     malware       0.98      0.84      0.90      6521
    phishing       0.97      0.36      0.53     18836

    accuracy                           0.89    130239
   macro avg       0.94      0.79      0.83    130239
weighted avg       0.90      0.89      0.87    130239

Multinomial Naive Bayes Predictions: ['phishing' 'benign']
