In [1]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import pickle


# Metrics for evaluation.
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score, accuracy_score, roc_auc_score, make_scorer
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler

# Classification Models
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.model_selection import GridSearchCV

from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

  import pandas.util.testing as tm


In [None]:
# dataset_url = 'datasets/urlset.csv'
# data = pd.read_csv(dataset_url)
# data = shuffle(data).reset_index(drop=True)
# #data.shape
# labels = data.iloc[:, lambda df: [-1]]
# data = data.loc[:,lambda df:['domain', 'ranking']]
# #labels.shape
# #X_train, X_test, Y_train, Y_test = train_test_split(data, labels, test_size=0.33, random_state=42)
# #X_train.shape
# #X_test.shape

In [2]:

featureset = '/content/urlsdataset.csv'
data = pd.read_csv(featureset)
sscaler = StandardScaler()
#data = sscaler.fit_transform(data)
labels = data.iloc[:, lambda df: [-1]]
data = data.iloc[:,1:-1]

train_data = data.iloc[0:76000,:]
labels_train = labels.iloc[0:76000,:]

test_data = data.iloc[76001:,:]
labels_test = labels.iloc[76001:,:]
labels_test.shape

(29928, 1)

In [3]:
def create_features(data_frame):
    url = data_frame['domain']
    len_url = []
    for u in url:
        len_url.append(len(u))

    data_frame['URL_length'] = pd.Series(len_url)
 
    count_slashes = []
    for u in url:
        count_slashes.append(u.count('//') )
    data_frame['URL_slashes_count'] = pd.Series(count_slashes)
    
    count_dots = []
    for u in url:
        count_dots.append(u.count('.') )
    data_frame['URL_dots_count'] = pd.Series(count_dots)

    len_hostname = []
    start = '://'
    end = '/'

    for u in url:
        temp = u[u.find(start)+2*len(start) + 2: u.rfind(end)]
        temp = temp.replace('/','.')
        temp = temp.replace('-','.')
        len_hostname.append(len(temp.split('.')))

    data_frame['URL_hostname_len'] = pd.Series(len_hostname)
    data_frame = data_frame.iloc[:,1:]
    return data_frame
#     total_word_count = 5000
#     tokenizer = Tokenizer(num_words=total_word_count)
#     tokenizer.fit_on_texts(clean_url)

#     seq_length = 5 #Number of items in each sequence
#     sequences = tokenizer.texts_to_sequences(clean_url)
#     data = pad_sequences(sequences, maxlen=seq_length)

#     num_data = phish_data[['create_age(months)', 'expiry_age(months)', 'update_age(days)', 'URL_length', 'URL_slashes', 'URL_dots', 'URL_host']].values
#     num_lab = phish_data["Label"].values

#     sscaler = StandardScaler()
#     num_data_scaled = sscaler.fit_transform(num_data)
#     num_data = num_data_scaled

In [4]:
def get_best_clf_after_CrossVal(train_x, train_y, classifier_parameter_map):
    scoring_evals = {'AUC': 'roc_auc',
                     'Accuracy': make_scorer(accuracy_score),
                     'f1': make_scorer(f1_score)
                    }
    clf_list=[]
    for key in classifier_parameter_map:
        clf = GridSearchCV(classifier_parameter_map[key][0],
                     classifier_parameter_map[key][1],
                     #iid=False,
                     scoring = 'accuracy',
                     #refit = 'Accuracy',
                     cv=10, # no of validations
                     n_jobs = -1 # use full concurrency
                    )
        
        result = clf.fit(train_x, train_y)
        #print(clf.cv_results_)
        print(result.best_estimator_)
        print('score: ', result.best_score_)
        clf_list.append(result.best_estimator_)
    return clf_list

In [5]:
def classify(train_x, train_y):
           
    logreg_parameters = {
     'penalty': ['l1','l2']
    }
    
    knn_parameters = {
       'n_neighbors': np.arange(2,100,1),
       'weights': ['distance'],
       'metric': ['minkowski']
    }
    dt_parameters = {
        'min_samples_split' : range(2,50,2),
        #'max_depth': range(1,50,2)
    }
    
    rf_parameters = {
    'bootstrap': [True],
    'max_depth': range(1,20,1),
    'n_estimators': range(2,100,1)
    }
    
    classifier_parameter_map = {"Log-Regression": (LogisticRegression(), logreg_parameters),
                          "K-Nearest Neighbors": (KNeighborsClassifier(), knn_parameters),
                          "Decision Tree": (DecisionTreeClassifier(), dt_parameters),
                          "Random Forest": (RandomForestClassifier(), rf_parameters)}
                                            #"AdaBoost"}
        
    clf_list = get_best_clf_after_CrossVal(train_x, train_y, classifier_parameter_map)
    print(len(clf_list))
    return clf_list

In [None]:
clf_list = classify(train_data, np.ravel(labels_train))

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)
score:  0.7570263157894737


In [7]:

classifiers = ['Log-Regression', 'K-Nearest Neighbors', 'Decision Tree', 'Random Forest']
for clfname, clf in zip(classifiers, clf_list):
    filename = str('../model_dump/') + str(clfname) + (str('.sav'))
    pickle.dump(clf, open(filename, 'wb'))

NameError: ignored

In [None]:
accuracies = []
for clfname, clf in zip(classifiers, clf_list):
        labels_pred = clf.predict(test_data)
        accuracies.append(accuracy_score(labels_test, labels_pred))
        print(clfname)
        print('Accuracy: ', accuracy_score(labels_test, labels_pred))
        print('Classification report')
        print(classification_report(labels_test, labels_pred, target_names=['Spam','Legitimate']))

In [None]:

print(accuracies)


plt.figure(figsize=(12, 8))
plt.plot(classifiers, accuracies,'ro',markersize=12)
plt.plot(classifiers, accuracies,color = 'blue', linestyle = 'dashed',linewidth=2, markersize=12)
plt.show()

In [None]:
train_data.head()

In [None]:
## too slow net  speed