In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the whole /kaggle/ tree and print the full path of every file found
for root, _, files in os.walk('/kaggle/'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install tldextract

In [None]:
import math
from collections import Counter
import tldextract

### Data acquisition

First, we read the data we uploaded. It is split into two datasets: one coming from `Alexa`'s ranking and another made of various `DGA` domains.

Our goal is to build a classifier which can differentiate between a legitimate domain name and a potentially randomly generated one.

In [None]:
# Reading the DGA dataset: a whitespace-separated text file; column names are supplied here
columns_dga = ["source","domain_name","date","second","first", "time"]

# The separator is a regex, so it is written as a raw string ("\s" is not a valid string escape)
dga_domains = pd.read_csv("/kaggle/input/domain-generation-algorithm/dga_project_dga_domain_list_clean.txt", sep=r"\s+", names=columns_dga)
dga_domains.head()

In [None]:
# Keep only the domain name column and tag every row as DGA.
# Selecting + assigning (instead of dropping in place) keeps this cell safe to re-run:
# an in-place drop raises KeyError the second time because the columns are already gone.
dga_domains = dga_domains[["domain_name"]].assign(label="dga")

print("DGA Dataframe has {} rows".format(dga_domains.shape[0]))
dga_domains.head()

In [None]:
# Reading the Alexa data and tagging every row as legitimate
legit_columns = ["domain_name"]

# legit_domains = pd.read_csv("/kaggle/input/domain-generation-algorithm/dga_project_top-1m.csv", names=legit_columns)
legit_domains = pd.read_csv(
    "/kaggle/input/domain-generation-algorithm/top-1m.csv",
    names=legit_columns,
).assign(label="legit")

print("Legit Dataframe has {} rows".format(len(legit_domains)))
legit_domains.head()

In [None]:
# Truncate the DGA set to the size of the legit set so both classes are balanced.
# Positional slicing is used so the result does not depend on the index labels.
dga_domains = dga_domains.iloc[:legit_domains.shape[0]]

# The slice can only shrink the DGA set, so a mismatch means it had fewer rows to begin with
assert dga_domains.shape[0] == legit_domains.shape[0], (
    "DGA dataset has {} rows but the legit dataset has {}".format(dga_domains.shape[0], legit_domains.shape[0])
)

In [None]:
# Concatenating both datasets.
# ignore_index=True gives every row a fresh, unique label; without it each frame keeps
# its own row labels, which may collide and make label-based lookups ambiguous.
data = pd.concat([dga_domains, legit_domains], ignore_index=True)

print("Whole Dataset has {} rows".format(data.shape[0]))
assert data.shape[0] == legit_domains.shape[0] * 2
data.head()

In [None]:
def entropy(domain_name):
    """ Shannon entropy (base 2) of the character distribution of a domain name """
    char_counts = Counter(domain_name)
    total_chars = len(domain_name)

    # Accumulate p * log2(p) over every distinct character, then flip the sign
    accumulated = 0
    for occurrences in char_counts.values():
        accumulated += occurrences/total_chars * math.log(occurrences/total_chars, 2)

    return -accumulated

In [None]:
def get_domain_name(domain):
    """ Return the registered domain when it is longer than, or has higher entropy than, the subdomain; otherwise return the subdomain """
    parts = tldextract.extract(domain)
    registered, sub = parts.domain, parts.subdomain

    if len(registered) > len(sub):
        return registered
    # Entropy is only compared when the registered domain is not strictly longer
    if entropy(registered) > entropy(sub):
        return registered
    return sub


# One extracted name per row, computed from the raw domain_name column
data["domain"] = data["domain_name"].apply(get_domain_name)

In [None]:
# Baseline features computed on the extracted domain: character count and entropy
data["length"] = data["domain"].apply(len)
data["entropy"] = data["domain"].apply(entropy)
data.head()

### A few visual representations of those predictors by label

In [None]:
# Plot styling, then the mean entropy by class

import matplotlib.pyplot as plt
import seaborn as sns

sns.set_style("darkgrid")
plt.rcParams["figure.figsize"] = (12,8)
font = {"size"   : 11}

plt.rc('font', **font)


mean_entropy = data.groupby("label")["entropy"].agg("mean")
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally
sns.barplot(x=mean_entropy.index, y=mean_entropy.values, palette="viridis")

plt.ylabel("Mean of Entropy")
plt.title("Mean of Entropy by class")
plt.show()

In [None]:
# Plot of the mean length by class

mean_length = data.groupby("label")["length"].agg("mean")
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally
sns.barplot(x=mean_length.index, y=mean_length.values, palette="viridis")

plt.ylabel("Mean of Length")
# The title now matches the plotted quantity (it previously said "Entropy")
plt.title("Mean of Length by class")
plt.show()

### Preparing data for training: we'll start with a boosting model, then iterate on it

In [None]:
# Baseline feature matrix (length and entropy) and the target labels
X = data.loc[:, ["length", "entropy"]]
y = data.loc[:, "label"]

In [None]:
# Creating train and test datasets with a prior shuffle, stratified on the label

from sklearn.model_selection import train_test_split


# A fixed random_state makes the split (and every score below) reproducible across runs
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, test_size=0.2, shuffle=True, stratify=y, random_state=42
)

In [None]:
# Target variable encoding

from sklearn.preprocessing import LabelEncoder

lb = LabelEncoder()

# Learn the label -> integer mapping on the training labels, then apply it to both splits
lb.fit(y_train)
y_train = lb.transform(y_train)
y_test = lb.transform(y_test)

In [None]:
# Model training and prediction using LightGBM

import lightgbm as lgb


# Default hyper-parameters; fit() returns the fitted estimator itself
lgb_clf = lgb.LGBMClassifier().fit(X_train, y_train)

y_pred = lgb_clf.predict(X_test)

In [None]:
# f1 score metric

from sklearn.metrics import f1_score


# The 2 belongs to round() (two decimals of the percentage); it used to be passed to
# format(), which ignored it, so the score was rounded to a whole number.
print("Evaluation f1 score metric is {}%".format(round(f1_score(y_test, y_pred) * 100, 2)))

In [None]:
# Confusion matrix

from sklearn.metrics import plot_confusion_matrix


# LabelEncoder sorts its classes ("dga" -> 0, "legit" -> 1), so the tick labels are taken
# from the encoder; the hard-coded ["legit", "dga"] order labelled the two classes backwards.
# NOTE: plot_confusion_matrix was removed in scikit-learn 1.2; on newer versions use
# ConfusionMatrixDisplay.from_estimator with the same arguments.
plot_confusion_matrix(lgb_clf, X_test, y_test, display_labels=lb.classes_)

Well, clearly we can try to do better and work on some more complex feature engineering based on the lexical construction of each domain name.

We'll try to leverage how many occurrences of n-grams we can find for each domain name among the Alexa n-grams (since they are legitimate) and among the n-grams of the words of an English dictionary (we could have added other languages: French, German, etc.).

In [None]:
# A choice of 3 to 5 grams seems okay

import sklearn.feature_extraction


# Split the rows by class with a single boolean mask
split_condition = data["label"] == "legit"
legit = data.loc[split_condition]
dga = data.loc[~split_condition]


# Character n-gram counts learned from the legitimate domains only
alexa_vc = sklearn.feature_extraction.text.CountVectorizer(
    analyzer="char",
    ngram_range=(3,5),
    min_df=0.00001,
    max_df=1.0,
)
counts_matrix = alexa_vc.fit_transform(legit["domain"])

counts_matrix

In [None]:
# Total occurrences of each n-gram over all legit domains, on a log10 scale
alexa_counts = np.log10(np.ravel(np.asarray(counts_matrix.sum(axis=0))))
ngrams_list = alexa_vc.get_feature_names()
print(ngrams_list[100:200])

In [None]:
# Pair every n-gram with its log-count and rank them from most to least frequent
sorted_ngrams = sorted(
    zip(ngrams_list, alexa_counts),
    key=lambda pair: pair[1],
    reverse=True,
)
print("Alexa NGrams {}".format(len(sorted_ngrams)))
for top_ngram, top_count in sorted_ngrams[:10]:
    print(top_ngram, top_count)

In [None]:
# Score of a domain = sum of the Alexa log-counts of the n-grams it contains
# (n-gram count matrix of the domains times the per-n-gram weight vector)
data["alexa_grams"] = alexa_vc.transform(data["domain"]).dot(alexa_counts)
data.loc[data["label"] == "legit"].head()

In [None]:
# Dictionary words, one per line.
# - dtype uses the builtin str: the np.str alias was removed in NumPy 1.24.
# - keep_default_na=False stops pandas from turning words such as "null" or "nan"
#   into missing values while parsing.
words_df = pd.read_csv("/kaggle/input/domain-generation-algorithm/words.txt", names=["word"],
                             encoding="utf-8", header=None, dtype={"word": str},
                             keep_default_na=False)
words_df.head()

In [None]:
def clean_words_df(word):
    """ Convert the value to a string, trim surrounding whitespace and lower-case it """
    text = str(word)
    return text.strip().lower()

def keep_alphanumeric(word):
    """ Tell whether the string form of the value is non-empty and purely alphabetic (digits are rejected, despite the name) """
    text = str(word)
    return text.isalpha()



# Drop missing entries first: str() would otherwise turn NaN into the literal word "nan",
# after which dropna() has nothing left to remove.
words_df = words_df.dropna()

# Normalise (strip + lower-case) before filtering, so that words padded with whitespace
# are cleaned instead of being rejected by the alphabetic check.
words_df = words_df.assign(word=words_df["word"].map(clean_words_df))
words_df = words_df[words_df["word"].map(keep_alphanumeric)]

words_df = words_df.drop_duplicates()

In [None]:
# Preview the first rows of the cleaned word list
words_df.head(5)

In [None]:
# Same character n-gram setup as for the Alexa domains, fitted on the dictionary words
dict_cv = sklearn.feature_extraction.text.CountVectorizer(analyzer="char", ngram_range=(3,5), min_df=0.00001, max_df=1.0)
words_counts_matrix = dict_cv.fit_transform(words_df["word"])

# log10 (not the natural log) so this score is on the same scale as alexa_counts
dict_counts = np.log10(np.asarray(words_counts_matrix.sum(axis=0)).flatten())
words_ngrams_list = dict_cv.get_feature_names()

print(words_ngrams_list[100:200])

In [None]:
# Score of a domain = sum of the dictionary log-counts of the n-grams it contains
# (n-gram count matrix of the domains times the per-n-gram weight vector)
data["words_grams"] = dict_cv.transform(data["domain"]).dot(dict_counts)
data.head()

In [None]:
# First ten legitimate rows with all engineered features
data.loc[data["label"] == "legit"].head(10)

In [None]:
# Mean Alexa n-gram score by class
mean_alexa_grams = data.groupby("label")["alexa_grams"].mean()
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally
sns.barplot(x=mean_alexa_grams.index, y=mean_alexa_grams.values, palette="viridis")

In [None]:
# Mean dictionary n-gram score by class
mean_words_grams = data.groupby("label")["words_grams"].mean()
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally
sns.barplot(x=mean_words_grams.index, y=mean_words_grams.values, palette="viridis")

In [None]:
# Length against the Alexa n-gram score, coloured by class
sns.scatterplot(x="length", y="alexa_grams", hue="label", data=data)

In [None]:
# Entropy against the Alexa n-gram score, coloured by class
sns.scatterplot(x="entropy", y="alexa_grams", hue="label", data=data)

In [None]:
# Length against the dictionary n-gram score, coloured by class
sns.scatterplot(x="length", y="words_grams", hue="label", data=data)

In [None]:
# Entropy against the dictionary n-gram score, coloured by class
sns.scatterplot(x="entropy", y="words_grams", hue="label", data=data)

In [None]:
# Every column except the label and the raw/extracted names is used as a feature
X_dash = data.drop(columns=["label", "domain_name", "domain"])
y_dash = data["label"]

# A fixed random_state makes the split (and the scores derived from it) reproducible across runs
X_dash_train, X_dash_test, y_dash_train, y_dash_test = train_test_split(
    X_dash, y_dash, train_size=0.8, test_size=0.2, shuffle=True, stratify=y_dash, random_state=42
)

In [None]:
lb_dash = LabelEncoder()

# Learn the label -> integer mapping on the training labels, then apply it to both splits
lb_dash.fit(y_dash_train)
y_dash_train = lb_dash.transform(y_dash_train)
y_dash_test = lb_dash.transform(y_dash_test)

In [None]:
# Same default LightGBM classifier, trained on the extended feature set
lgb_clf_dash = lgb.LGBMClassifier().fit(X_dash_train, y_dash_train)

y_dash_pred = lgb_clf_dash.predict(X_dash_test)

In [None]:
# The 2 belongs to round() (two decimals of the percentage); it used to be passed to
# format(), which ignored it, so the score was rounded to a whole number.
print("Evaluation f1 score metric is {}%".format(round(f1_score(y_dash_test, y_dash_pred) * 100, 2)))

In [None]:
# LabelEncoder sorts its classes ("dga" -> 0, "legit" -> 1), so the tick labels are taken
# from the encoder; the hard-coded ["legit", "dga"] order labelled the two classes backwards.
plot_confusion_matrix(lgb_clf_dash, X_dash_test, y_dash_test, display_labels=lb_dash.classes_)