<a href="https://colab.research.google.com/github/ssk113/AetherWatch/blob/main/DNS_filtering_AI(ensemble_model).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Ensemble Learning**

## **Importing necessary libraries**

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from keras.models import Sequential
from keras.layers import Dense
from sklearn.metrics import accuracy_score


## **Loading dataset**

In [None]:
# Location of the labelled URL dataset (CSV).
# NOTE(review): this path looks like a Colab Google Drive mount point, and no
# mount step is visible in this notebook — confirm how the drive is attached.
DATASET_CSV_PATH = '/content/drive/MyDrive/DNS/new_data_urls.csv'

# Read the raw table; later cells use its 'url' and 'status' columns.
df = pd.read_csv(DATASET_CSV_PATH)

### **Preprocessing**

In [None]:
# Keep only the rows where both the URL text and its label are present, then
# normalise the URL text to lower case. Written as a single chained expression
# that builds a new frame instead of editing the loaded one in place.
df = (
    df
    .dropna(subset=['url', 'status'])
    .assign(url=lambda frame: frame['url'].str.lower())
)


### **Train/Test Split**

In [None]:
# Separate the model input (the URL text) from the target label column.
X, y = df['url'], df['status']

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the shuffle
# (and therefore the split) repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


### **Feature Extraction**

In [None]:
# Turn each URL into a TF-IDF vector. The vocabulary (capped at the 1000 most
# frequent terms, English stop words removed) is learned from the training
# URLs only and then re-used unchanged for the test URLs.
vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# URL length (in characters) as an extra numeric feature, shaped as a single
# column per split. Kept as real variables rather than inside a string literal
# so that later cells can use them and the cell no longer echoes a block of
# disabled code as its output.
X_train_len = X_train.str.len().to_numpy().reshape(-1, 1)
X_test_len = X_test.str.len().to_numpy().reshape(-1, 1)

# NOTE(review): the disabled experiment that used to sit here also tried to
# glue the length column onto the TF-IDF matrix with np.concatenate.
# TfidfVectorizer returns a SciPy sparse matrix, so that would need
# scipy.sparse.hstack instead; it is left out because the base models are
# trained on the TF-IDF features alone.

'\n# Extract additional feature (URL length)\nX_train_len = X_train.str.len().values.reshape(-1, 1)\nX_test_len = X_test.str.len().values.reshape(-1, 1)\n\n# Combine TF-IDF features with additional features\nX_train_combined = np.concatenate((X_train_tfidf, X_train_len), axis=1)\nX_test_combined = np.concatenate((X_test_tfidf, X_test_len), axis=1)\n\navg_url_len_train = X_train.str.len().mean()\navg_url_len_test = X_test.str.len().mean()\n\nX_train_combined = np.concatenate((X_train_tfidf, np.array([avg_url_len_train]).reshape(1, -1)), axis=1)\nX_test_combined = np.concatenate((X_test_tfidf, np.array([avg_url_len_test]).reshape(1, -1)), axis=1)\n\n'

## **Building the Models**

### **Define models**

In [None]:
# Three level-0 learners for the stacking ensemble. Every estimator is given
# the same seed (42) so repeated runs on the same data are reproducible.
random_forest = RandomForestClassifier(n_estimators=100, random_state=42)
gradient_boosting = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.1,
    random_state=42,
)
logistic_regression = LogisticRegression(max_iter=1000, random_state=42)

# Order matters: column i of the meta-model input comes from base_models[i].
base_models = [random_forest, gradient_boosting, logistic_regression]


### **Train base models**

In [None]:
# Fit every base model on the TF-IDF training matrix and collect, per model,
# the predicted probability of the second class (column 1 of predict_proba)
# for each test URL. `fit` returns the estimator itself, so it can be chained.
base_model_predictions = [
    estimator.fit(X_train_tfidf, y_train).predict_proba(X_test_tfidf)[:, 1]
    for estimator in base_models
]

# A disabled variant of this cell fitted the models on `X_train_combined` /
# `X_test_combined` (TF-IDF plus URL length); those arrays are not built by any
# executable code visible in this notebook, so only the TF-IDF path is active.

## **Meta Model**

### **Create Meta Model**

In [None]:
# Level-1 (meta) learner: a small fully connected network that maps the base
# models' probabilities plus the URL-length feature to a single sigmoid score.
# (An earlier, disabled version took only len(base_models) inputs, i.e. no
# URL-length column.)
n_meta_inputs = len(base_models) + 1  # one column per base model + URL length

meta_model = Sequential([
    Dense(64, activation='relu', input_shape=(n_meta_inputs,)),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid'),
])

# Binary cross-entropy pairs with the single sigmoid output; accuracy is the
# metric tracked during fitting.
meta_model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

### **Dataset for Meta Model**

In [None]:
# Build the meta-model input: one row per test URL, one column per base model
# (its predicted probability), plus the URL length as a final column.
meta_X = pd.DataFrame(base_model_predictions).T

# The URL length is computed directly from X_test so this cell does not rely
# on a helper array from another cell. `.to_numpy()` is deliberate: meta_X has
# a fresh 0..n-1 index while X_test keeps the row labels it had before the
# shuffle/split, so assigning a Series would align on labels and scramble rows.
meta_X['url_len'] = X_test.str.len().to_numpy()

# NOTE(review): the targets are the *test* labels, and the following cells both
# fit and score the meta-model on this same meta_X / meta_y pair. The reported
# accuracy is therefore measured on data the meta-model was trained on —
# consider out-of-fold base predictions on the training split instead.
meta_y = y_test

### **Training**

In [None]:
# Fit the meta-model for 10 epochs in mini-batches of 32 rows with Keras'
# progress output switched off. The call is left as the cell's final
# expression, so the returned History object is what the cell displays.
meta_model.fit(
    meta_X,
    meta_y,
    epochs=10,
    batch_size=32,
    verbose=0,
)


<keras.src.callbacks.History at 0x78ee0d1ae1d0>

## **Evaluate the model**

In [None]:
# Score every row of meta_X, turn the sigmoid outputs into hard 0/1 labels
# with a 0.5 cut-off, and report the share of labels that match y_test.
# NOTE(review): meta_X is the same data the meta-model was fitted on in the
# training cell, so this figure is not a held-out score.
y_pred = meta_model.predict(meta_X)
y_pred_class = (y_pred > 0.5).astype(int)
stacking_accuracy = accuracy_score(y_test, y_pred_class)
print('Stacking model accuracy:', stacking_accuracy)

Stacking model accuracy: 0.9082736219754017


### **Test the Model**

In [None]:
# Pair each test URL with the meta-model's predicted class and show a preview.
test_urls = X_test.tolist()

# Re-score meta_X so this cell works even if the evaluation cell was skipped.
y_pred = meta_model.predict(meta_X)
y_pred_class = (y_pred > 0.5).astype(int)

# Printing every test row floods the notebook output (the recorded run was
# truncated to its last 5000 lines), so only the first few rows are shown.
# Raise the limit to inspect more.
PREVIEW_ROWS = 20

# y_pred_class has one column per row; flatten it so each prediction prints
# as a plain 0/1 instead of a one-element array such as "[1]".
preview_labels = y_pred_class.ravel()[:PREVIEW_ROWS]

print("Predicted output:")
for url, pred in zip(test_urls[:PREVIEW_ROWS], preview_labels):
    print(f"URL: {url}, Predicted class: {pred}")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
URL: maxpreps.com/news/rtapp1yvq0u_msgwakeuxq/maxpreps-names-california-all-state-football-teams.htm, Predicted class: [1]
URL: albel.intnet.mu/file/file_doc.php?l=_jehfuq_vjoxjogydw_oxk0k0qwhtogydw_product-userid&userid_jehjoxk0idw_joxk0idd&userid=, Predicted class: [0]
URL: iueye.iu.edu/faculty/, Predicted class: [1]
URL: turfking.in, Predicted class: [0]
URL: rottentomatoes.com/celebrity/alexei_kravchenko/biography.php, Predicted class: [1]
URL: digilander.libero.it/dilucadanilo/, Predicted class: [0]
URL: tutorarabia.com/t67bg, Predicted class: [1]
URL: http://www.aosouuamsouuou.phsivca.presse.ci/au/page1.php, Predicted class: [0]
URL: www.alanmorgan.plus.com/g76gyui, Predicted class: [1]
URL: haqueandassociates.com, Predicted class: [1]
URL: websters-online-dictionary.org/definition/bowser, Predicted class: [1]
URL: '9d345009-a-62cb3a1a-s-sites.googlegroups.com/site/stickamcomlogindo/login.html?attachauth=anoy7cqlgji

### **Save Model**

In [None]:
# Persist the trained Keras meta-model to an .h5 file in the current working
# directory.
# NOTE(review): only the meta-model is written here; the fitted vectorizer and
# base models that produce its inputs are not saved by any cell visible in
# this notebook — confirm whether they are stored elsewhere.
STACKING_MODEL_FILE = 'stacking_model.h5'
meta_model.save(STACKING_MODEL_FILE)

  saving_api.save_model(
