In [34]:
import pandas as pd
import numpy as np 
import sklearn
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
import nltk
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score
from sklearn.preprocessing import StandardScaler
import re
import warnings
warnings.filterwarnings('ignore')

In [35]:
# Load the labelled domain sample from the working directory.
# NOTE(review): the shape reported further down is (9999, 3) and the column
# names are overwritten in the next cell. If the CSV has no header row,
# read_csv has consumed the first record as the header -- confirm, and pass
# header=None with names=[...] if that is the case.
csv_path = 'dga_domains_sample.csv'
dataset = pd.read_csv(csv_path)

In [36]:
# Give the three columns descriptive names: class label, generator family,
# and the domain string itself.
column_names = ['Type', 'DGA_family', 'Domain']
dataset.columns = column_names

In [37]:
# Preview the first five rows to sanity-check the renamed columns.
dataset.head(n=5)

Unnamed: 0,Type,DGA_family,Domain
0,dga,corebot,cvyh1po636avyrsxebwbkn7.ddns.net
1,legit,alexa,plasticbags.sa.com
2,legit,alexa,mzltrack.com
3,legit,alexa,miss-slim.ru
4,dga,ranbyus,txumyqrubwutbb.cc


In [38]:
# (rows, columns) of the loaded frame.
dataset.shape

(9999, 3)

In [39]:
# Pull out the two descriptive columns and the label column by name.
# x is only displayed here; the modelling cells below build their own
# feature matrix, and y is reassigned after label encoding.
x = dataset[['DGA_family', 'Domain']]
y = dataset['Type']
x

Unnamed: 0,DGA_family,Domain
0,corebot,cvyh1po636avyrsxebwbkn7.ddns.net
1,alexa,plasticbags.sa.com
2,alexa,mzltrack.com
3,alexa,miss-slim.ru
4,ranbyus,txumyqrubwutbb.cc
...,...,...
9994,murofet,k37f12b28pza37kvfth54gydvgqayoujxdreu.biz
9995,padcrypt,bkfoebdlccafmfbe.org
9996,fobber,mdoqihhgij.com
9997,alexa,portinhola.com.br


In [40]:
# Discard every row that has a missing value in any column
# (row-wise, "any" are the dropna defaults).
dataset = dataset.dropna()

In [41]:
# Per-column count of missing values; all zeros confirms the frame is clean.
dataset.isna().sum()

Type          0
DGA_family    0
Domain        0
dtype: int64

In [42]:
def preprocess_column(col_value):
    """Reduce a domain string to lowercase, space-separated runs of ASCII letters.

    Every character outside a-z / A-Z (digits, dots, hyphens, ...) acts as a
    separator, so 'miss-slim.ru' becomes 'miss slim ru'. A string with no
    letters yields ''.
    """
    letter_runs = re.findall('[a-zA-Z]+', col_value)
    return ' '.join(letter_runs).lower()

In [43]:
# Bag-of-words vectorizer capped at the 200 most frequent tokens.
cv = CountVectorizer(max_features=200)

In [44]:
# Clean every domain string, preserving row order.
preprocessed_domain = [preprocess_column(raw_domain) for raw_domain in dataset['Domain']]

In [45]:
# Learn the vocabulary and count tokens per domain, then densify the
# resulting matrix into a regular NumPy array.
token_counts = cv.fit_transform(preprocessed_domain)
x_new = token_counts.toarray()

In [46]:
# Encode the target as integers. LabelEncoder assigns codes in sorted class
# order, so with the labels shown above 'dga' becomes 0 and 'legit' becomes 1.
# NOTE(review): this overwrites dataset['Type'] in place, so re-running the
# cell refits `le` on the integer codes rather than the original strings.
le = LabelEncoder()
dataset['Type'] = le.fit_transform(dataset['Type'])
dga_family = dataset['DGA_family']

# Wrap the token counts in a DataFrame that shares the dataset's row index.
# The second positional argument of pd.DataFrame is the row index, so passing
# dga_family there labelled each row with its generator family (heavily
# duplicated labels that do not match y's index) instead of adding a feature.
new_x = pd.DataFrame(x_new, index=dataset.index)

In [47]:
# Target vector: the integer-encoded class label.
y = dataset.loc[:, 'Type']
y

0       0
1       1
2       1
3       1
4       0
       ..
9994    0
9995    0
9996    0
9997    1
9998    0
Name: Type, Length: 9999, dtype: int32

In [48]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical between runs.
x_train, x_test, y_train, y_test = train_test_split(
    new_x,
    y,
    test_size=0.2,
    random_state=0,
)

In [49]:
# Report the shape of each split, in the order: train X, train y, test X, test y.
for split_part in (x_train, y_train, x_test, y_test):
    print(split_part.shape)

(7999, 200)
(7999,)
(2000, 200)
(2000,)


In [50]:
# Random Forest
# RandomForestClassifier is already imported in the notebook's first cell,
# so the duplicate in-cell import is not repeated here.
rfm = RandomForestClassifier(
    n_estimators=75,
    oob_score=True,        # also compute the out-of-bag score during fit
    n_jobs=-1,             # use every available core
    random_state=20,       # fixed seed for reproducible trees
    max_features=None,     # consider all features at each split
    min_samples_leaf=2,
)
rfm.fit(x_train, y_train)
rfm_ypred = rfm.predict(x_test)

In [51]:
# Logistic Regression
# LogisticRegression is already imported in the notebook's first cell,
# so the duplicate in-cell import is not repeated here.
# NOTE(review): warnings are silenced globally at the top of the notebook,
# so a convergence warning from this fit would not be visible -- confirm.
lr = LogisticRegression()
lr.fit(x_train, y_train)
lr_ypred = lr.predict(x_test)

In [52]:
# Decision Tree
# DecisionTreeClassifier is already imported in the notebook's first cell,
# so the duplicate in-cell import is not repeated here.
dtree = DecisionTreeClassifier(
    max_depth=40,
    random_state=18,       # fixed seed for reproducible splits
    max_features=None,     # consider all features at each split
    min_samples_leaf=8,
)
dtree.fit(x_train, y_train)
dtree_ypred = dtree.predict(x_test)

In [53]:
# Random Forest: score the held-out predictions.
# NOTE(review): precision_score / recall_score default to pos_label=1, and
# the encoded target shown earlier maps 'legit' to 1 -- so these two figures
# describe the legit class rather than DGA detection. Confirm that is intended.
cm_rfm = confusion_matrix(y_test, rfm_ypred)
acc_rfm = accuracy_score(y_test, rfm_ypred)
pre_rfm = precision_score(y_test, rfm_ypred)
rec_rfm = recall_score(y_test, rfm_ypred)

print('For Random Forest')
print('Confusion Matrix: ')
print(cm_rfm)
for metric_name, metric_value in (
    ('Accuracy Score: ', acc_rfm),
    ('Precision Score: ', pre_rfm),
    ('Recall Score: ', rec_rfm),
):
    print(metric_name, metric_value)

For Random Forest
Confusion Matrix: 
[[586 420]
 [178 816]]
Accuracy Score:  0.701
Precision Score:  0.6601941747572816
Recall Score:  0.8209255533199196


In [54]:
# Logistic Regression: confusion matrix and headline scores on the test split.
# NOTE(review): precision_score / recall_score default to pos_label=1, and
# the encoded target shown earlier maps 'legit' to 1 -- so these two figures
# describe the legit class rather than DGA detection. Confirm that is intended.
cm_lr = confusion_matrix(y_test, lr_ypred)
acc_lr = accuracy_score(y_test, lr_ypred)
pre_lr = precision_score(y_test, lr_ypred)
rec_lr = recall_score(y_test, lr_ypred)

print('For Logistic Regression')
print('Confusion Matrix: ')
print(cm_lr)
for metric_name, metric_value in (
    ('Accuracy Score: ', acc_lr),
    ('Precision Score: ', pre_lr),
    ('Recall Score: ', rec_lr),
):
    print(metric_name, metric_value)

For Logistic Regression
Confusion Matrix: 
[[594 412]
 [189 805]]
Accuracy Score:  0.6995
Precision Score:  0.6614626129827444
Recall Score:  0.8098591549295775


In [55]:
# Decision Tree: score the held-out predictions.
# NOTE(review): precision_score / recall_score default to pos_label=1, and
# the encoded target shown earlier maps 'legit' to 1 -- so these two figures
# describe the legit class rather than DGA detection. Confirm that is intended.
cm_dtree = confusion_matrix(y_test, dtree_ypred)
acc_dtree = accuracy_score(y_test, dtree_ypred)
pre_dtree = precision_score(y_test, dtree_ypred)
rec_dtree = recall_score(y_test, dtree_ypred)

print('For Decision Tree')
print('Confusion Matrix: ')
print(cm_dtree)
for metric_name, metric_value in (
    ('Accuracy Score: ', acc_dtree),
    ('Precision Score: ', pre_dtree),
    ('Recall Score: ', rec_dtree),
):
    print(metric_name, metric_value)

For Decision Tree
Confusion Matrix: 
[[539 467]
 [173 821]]
Accuracy Score:  0.68
Precision Score:  0.6374223602484472
Recall Score:  0.8259557344064387


In [56]:
# Classify a single domain typed in by the user with the tuned random forest.
domain_name = input("Enter a domain name: ")
preprocessed_input = preprocess_column(domain_name)
# Reuse the already-fitted vectorizer (transform, not fit_transform) so the
# feature columns line up with the training matrix.
x_input = cv.transform([preprocessed_input]).toarray()
y_pred = rfm.predict(x_input)

# predict() returns a one-element array here; compare its scalar explicitly
# rather than relying on the truthiness of an array comparison.
# Code 0 is the encoded 'dga' class, 1 is 'legit'.
if y_pred[0] == 0:
    print("The domain is a DGA domain.")
else:
    print("The domain is a legitimate domain.")

The domain is a legitimate domain.


# Pickling the Model file for Deployment

In [57]:
# Train the classifier that is pickled for deployment below.
# A fixed random_state makes the saved forest reproducible between runs;
# without it every run pickles a different model.
# NOTE(review): this is a default-hyperparameter forest, not the tuned `rfm`
# whose metrics are reported above -- confirm which one should be shipped.
model = RandomForestClassifier(random_state=0)
rfc = model.fit(x_train, y_train)   # fit() returns the estimator, so rfc is model
rfc_ypred = model.predict(x_test)

In [58]:
import pickle

In [59]:
# Serialise the trained classifier to disk. The `with` block guarantees the
# file is flushed and closed even if pickling raises; the bare open() call
# left the handle open.
# NOTE(review): only the classifier is saved. The fitted CountVectorizer `cv`
# is also needed to turn a raw domain into features at inference time --
# confirm it is persisted somewhere.
with open("dga_botnet.pkl", 'wb') as model_file:
    pickle.dump(model, model_file)

In [60]:
# Reload the classifier to verify the round trip. The `with` block closes the
# file handle that the bare open() call used to leak.
# Only unpickle files you trust: pickle.load can execute arbitrary code.
with open('dga_botnet.pkl', 'rb') as model_file:
    pickled_model = pickle.load(model_file)

In [61]:
# Smoke-test the reloaded model on the feature vector built from the
# domain entered earlier (x_input).
pickled_model.predict(x_input)

array([1])