In [1]:
!pip install scikit-learn==1.2.1

Collecting scikit-learn==1.2.1
  Downloading scikit_learn-1.2.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Downloading scikit_learn-1.2.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (9.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.6/9.6 MB[0m [31m46.3 MB/s[0m eta [36m0:00:00[0m:00:01[0m0:01[0m
[?25hInstalling collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.4.1.post1
    Uninstalling scikit-learn-1.4.1.post1:
      Successfully uninstalled scikit-learn-1.4.1.post1
Successfully installed scikit-learn-1.2.1


In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score
import joblib

# Load the dataset
data = pd.read_csv("malicious_phish.csv")

# Feature extraction
X = data['url']
y = data['type']

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert URLs into features using TF-IDF
vectorizer = TfidfVectorizer()
X_train_vect = vectorizer.fit_transform(X_train)
X_test_vect = vectorizer.transform(X_test)

# Train Decision Tree classifier
dt_model = DecisionTreeClassifier()
dt_model.fit(X_train_vect, y_train)

# Evaluate Decision Tree model
dt_y_pred = dt_model.predict(X_test_vect)
print("Decision Tree Accuracy:", accuracy_score(y_test, dt_y_pred))
print(classification_report(y_test, dt_y_pred))

# Saving the model
joblib.dump(dt_model, "dt_model.pkl")
joblib.dump(vectorizer, "tfidf_vectorizer.pkl")

# Predicting new URLs
new_urls = ["http://example.com", "http://malicious-site.com"]
new_urls_vect = vectorizer.transform(new_urls)
dt_predictions = dt_model.predict(new_urls_vect)
print("Decision Tree Predictions:", dt_predictions)


Decision Tree Accuracy: 0.9238246608158847
              precision    recall  f1-score   support

      benign       0.94      0.95      0.95     85778
  defacement       0.97      0.99      0.98     19104
     malware       0.97      0.92      0.94      6521
    phishing       0.76      0.73      0.75     18836

    accuracy                           0.92    130239
   macro avg       0.91      0.90      0.90    130239
weighted avg       0.92      0.92      0.92    130239

Decision Tree Predictions: ['phishing' 'phishing']
