<a href="https://colab.research.google.com/github/Stefi96/DetectingNFTs-Master/blob/main/Domains_NFT_Master.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [35]:
# @title Lets import the libraries and code
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split,GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier

In [2]:
# Load the newly uploaded datasets
scam_data = pd.read_csv('/content/drive/MyDrive/Master/Project/Domains/final_blocklist_domains.csv')
legit_data = pd.read_csv('/content/drive/MyDrive/Master/Project/Domains/final_legit_domains.csv')

In [4]:
# Display the first few rows of each dataset for verification
scam_data.head(), legit_data.head()

(                   url
 0  blogpost-opensea.io
 1       phantomweb.app
 2           aurory.app
 3         solvision.io
 4        staratlas.art,
              Unique Domains
 0             basis.markets
 1          skeletoncrew.rip
 2              flippies.art
 3  meerkatmillionaires.club
 4           stylishstuds.io)

In [11]:
# label the data
scam_data['label'] = 1
legit_data['label'] = 0

# Renaming the 'Unique Domains' column in legit dataset to 'url'
legit_data.rename(columns={"Unique Domains": "url"}, inplace=True)

# Combine the datasets
combined_data = pd.concat([scam_data, legit_data], ignore_index=True)

# Display the first few rows of the combined dataset
combined_data.head()

Unnamed: 0,url,label
0,blogpost-opensea.io,1
1,phantomweb.app,1
2,aurory.app,1
3,solvision.io,1
4,staratlas.art,1


In [17]:
# Check for missing values
combined_data.isna().sum()

# Fill NaN with placeholder
combined_data['url'] = combined_data['url'].fillna("missing")

# Drop NA
#combined_data = combined_data.dropna()

In [18]:
combined_data['url']

0           blogpost-opensea.io
1                phantomweb.app
2                    aurory.app
3                  solvision.io
4                 staratlas.art
                 ...           
2446                 koinkoi.io
2447            burritoboyz.wtf
2448                   t00b.app
2449            outcast.academy
2450    apesofwallstreetnft.com
Name: url, Length: 2451, dtype: object

In [None]:
# # Convert the 'url' column to string
# combined_data['url'] = combined_data['url'].astype(str)

In [21]:
# Label encoding the URLs
label_encoder = LabelEncoder()
combined_data['url_encoded'] = label_encoder.fit_transform(combined_data['url'])

In [23]:
# Extracting features from the URLs
combined_data['domain_length'] = combined_data['url'].apply(lambda x: len(x))
combined_data['num_special_chars'] = combined_data['url'].apply(lambda x: sum([1 for char in x if not char.isalnum()]))
combined_data['num_subdomains'] = combined_data['url'].apply(lambda x: x.count('.'))
combined_data['has_http'] = combined_data['url'].apply(lambda x: 1 if 'http' in x else 0)
combined_data['has_https'] = combined_data['url'].apply(lambda x: 1 if 'https' in x else 0)
combined_data['has_www'] = combined_data['url'].apply(lambda x: 1 if 'www' in x else 0)
combined_data['has_numbers'] = combined_data['url'].apply(lambda x: 1 if any(char.isdigit() for char in x) else 0)

In [24]:
combined_data

Unnamed: 0,url,label,url_encoded,domain_length,num_special_chars,num_subdomains,has_http,has_https,has_www,has_numbers
0,blogpost-opensea.io,1,184,19,2,1,0,0,0,0
1,phantomweb.app,1,1422,14,1,1,0,0,0,0
2,aurory.app,1,133,10,1,1,0,0,0,0
3,solvision.io,1,2101,12,1,1,0,0,0,0
4,staratlas.art,1,2130,13,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
2446,koinkoi.io,0,801,10,1,1,0,0,0,0
2447,burritoboyz.wtf,0,267,15,1,1,0,0,0,0
2448,t00b.app,0,2180,8,1,1,0,0,0,1
2449,outcast.academy,0,1337,15,1,1,0,0,0,0


In [28]:
# # NLP transformations on the URLs
# tfidf_vectorizer = TfidfVectorizer(max_features=100)
# url_tfidf = tfidf_vectorizer.fit_transform(combined_data['url'])
# url_tfidf_df = pd.DataFrame(url_tfidf.toarray(), columns=tfidf_vectorizer.get_feature_names_out())
# combined_data = pd.concat([combined_data, url_tfidf_df], axis=1)
combined_data.drop(columns=['url'], inplace=True)

In [29]:
# Splitting the data
X = combined_data.drop("label", axis=1)
y = combined_data["label"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y) # maybe without stratify

In [30]:
# Random Forest
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
rf_predictions = rf_model.predict(X_test)
rf_metrics = (accuracy_score(y_test, rf_predictions), precision_score(y_test, rf_predictions), recall_score(y_test, rf_predictions), f1_score(y_test, rf_predictions))

rf_metrics

(0.7576374745417516, 0.811377245508982, 0.8287461773700305, 0.8199697428139182)

In [None]:
# Hyperparameter tuning for RandomForest
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

grid_search = GridSearchCV(rf_model, param_grid, cv=3, scoring='accuracy', verbose=2, n_jobs=-1)
grid_search.fit(X_train, y_train)

best_rf_clf = grid_search.best_estimator_
y_pred_best_rf = best_rf_clf.predict(X_test)

accuracy_best_rf = accuracy_score(y_test, y_pred_best_rf)
precision_best_rf = precision_score(y_test, y_pred_best_rf)
recall_best_rf = recall_score(y_test, y_pred_best_rf)
f1_best_rf = f1_score(y_test, y_pred_best_rf)

print("Tuned RandomForest Results:", accuracy_best_rf, precision_best_rf, recall_best_rf, f1_best_rf)

Fitting 3 folds for each of 108 candidates, totalling 324 fits


In [36]:
# Random forrest with bagging
bagging_clf = BaggingClassifier(base_estimator=RandomForestClassifier(n_estimators=100, random_state=42),
                                n_estimators=10, random_state=42)
bagging_clf.fit(X_train, y_train)

# Predictions
y_pred_bagging = bagging_clf.predict(X_test)

# Evaluation
accuracy_bagging = accuracy_score(y_test, y_pred_bagging)
precision_bagging = precision_score(y_test, y_pred_bagging)
recall_bagging = recall_score(y_test, y_pred_bagging)
f1_bagging = f1_score(y_test, y_pred_bagging)

print("Accuracy:", accuracy_bagging)
print("Precision:", precision_bagging)
print("Recall:", recall_bagging)
print("F1-Score:", f1_bagging)



Accuracy: 0.7535641547861507
Precision: 0.8047337278106509
Recall: 0.8318042813455657
F1-Score: 0.8180451127819549


In [None]:
# # NLP preprocessing
# combined_data['url'].fillna("", inplace=True)
# combined_data['url'] = combined_data['url'].fillna("missing")
# combined_data['tokenized_url'] = combined_data['url'].str.split(r'\W+')
# combined_data['tokenized_url_str'] = combined_data['tokenized_url'].str.join(' ')
# vectorizer = TfidfVectorizer(ngram_range=(1, 2))
# X_tfidf = vectorizer.fit_transform(combined_data['tokenized_url_str'])
# X_train_tfidf, X_test_tfidf, y_train_tfidf, y_test_tfidf = train_test_split(X_tfidf, combined_data['label'], test_size=0.3, random_state=42)

In [None]:
# # Just in case, check for NAN or empty strings
# nan_string_count = combined_data[combined_data['url'] == "NAN"].shape[0]
# nan_string_count

In [32]:
# Random Forest with TF-IDF
rf_clf_tfidf = RandomForestClassifier(n_estimators=100, random_state=42)
rf_clf_tfidf.fit(X_train, y_train)
y_pred_rf_tfidf = rf_clf_tfidf.predict(X_test)
accuracy_rf_tfidf = accuracy_score(y_test, y_pred_rf_tfidf)
precision_rf_tfidf = precision_score(y_test, y_pred_rf_tfidf)
recall_rf_tfidf = recall_score(y_test, y_pred_rf_tfidf)
f1_rf_tfidf = f1_score(y_test, y_pred_rf_tfidf)

print("Random Forest with TF-IDF Metrics:")
print("Accuracy:", accuracy_rf_tfidf)
print("Precision:", precision_rf_tfidf)
print("Recall:", recall_rf_tfidf)
print("F1-Score:", f1_rf_tfidf)

Random Forest with TF-IDF Metrics:
Accuracy: 0.7576374745417516
Precision: 0.811377245508982
Recall: 0.8287461773700305
F1-Score: 0.8199697428139182


In [None]:
# # Random forrest with SMOTE balancer
# smote = SMOTE(random_state=42)
# X_train_smote, y_train_smote = smote.fit_resample(X_train_tfidf, y_train_tfidf)

# # Training Random Forest on the SMOTE balanced data
# rf_clf_smote = RandomForestClassifier(n_estimators=100, random_state=42)
# rf_clf_smote.fit(X_train_smote, y_train_smote)

# # Predictions
# y_pred_rf_smote = rf_clf_smote.predict(X_test_tfidf)

# # Evaluation
# accuracy_rf_smote = accuracy_score(y_test_tfidf, y_pred_rf_smote)
# precision_rf_smote = precision_score(y_test_tfidf, y_pred_rf_smote)
# recall_rf_smote = recall_score(y_test_tfidf, y_pred_rf_smote)
# f1_rf_smote = f1_score(y_test_tfidf, y_pred_rf_smote)

# accuracy_rf_smote, precision_rf_smote, recall_rf_smote, f1_rf_smote

In [31]:
# # LightGBM
# import lightgbm as lgb

# # Initializing the LightGBM model
# lgbm_model = lgb.LGBMClassifier(objective='binary', class_weight='balanced', random_state=42)

# # Training the model
# lgbm_model.fit(X_train, y_train)

# # Predictions
# y_pred_lgbm = lgbm_model.predict(X_test)

# # Evaluation
# accuracy_lgbm = accuracy_score(y_test, y_pred_lgbm)
# precision_lgbm = precision_score(y_test, y_pred_lgbm)
# recall_lgbm = recall_score(y_test, y_pred_lgbm)
# f1_lgbm = f1_score(y_test, y_pred_lgbm)

# accuracy_lgbm, precision_lgbm, recall_lgbm, f1_lgbm

[LightGBM] [Info] Number of positive: 1303, number of negative: 657
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 294
[LightGBM] [Info] Number of data points in the train set: 1960, number of used features: 5
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000


(0.7596741344195519,
 0.8827838827838828,
 0.7370030581039755,
 0.8033333333333333)

In [9]:
# XGBoost
xgb_model = XGBClassifier(random_state=42)
xgb_model.fit(X_train, y_train)
xgb_predictions = xgb_model.predict(X_test)
xgb_metrics = (accuracy_score(y_test, xgb_predictions), precision_score(y_test, xgb_predictions), recall_score(y_test, xgb_predictions), f1_score(y_test, xgb_predictions))

xgb_metrics

(0.845213849287169, 0.8885448916408669, 0.8776758409785933, 0.8830769230769231)

In [None]:
# SVM
svm_model = SVC(kernel='linear', probability=True, random_state=42)
svm_model.fit(X_train, y_train)
svm_predictions = svm_model.predict(X_test)
svm_metrics = (accuracy_score(y_test, svm_predictions), precision_score(y_test, svm_predictions), recall_score(y_test, svm_predictions), f1_score(y_test, svm_predictions))

svm_metrics

In [None]:
# Ensemble Method (RF + SVM)
ensemble_predictions = (rf_model.predict_proba(X_test)[:, 1] + svm_model.predict_proba(X_test)[:, 1]) / 2
ensemble_final_predictions = [1 if prob > 0.5 else 0 for prob in ensemble_predictions]