# Machine learning classification algorithms

Data source: https://www.kaggle.com/ang3loliveira/malware-analysis-datasets-top1000-pe-imports

It is required to predict the value of a column 'malware'

## Reading a comma-separated value (csv) file into a DataFrame

In [1]:
import pandas

dataframe = pandas.read_csv("top_1000_pe_imports.csv")
dataframe = dataframe[:1000]

## Printing the first 5 lines

In [2]:
dataframe.head()

Unnamed: 0,hash,GetProcAddress,ExitProcess,WriteFile,GetLastError,CloseHandle,FreeLibrary,Sleep,GetStdHandle,MultiByteToWideChar,...,bind,RegEnumKeyExA,WinHttpOpen,_controlfp,WinExec,GetSecurityDescriptorDacl,FindFirstFreeAce,GetTimeFormatW,LookupAccountSidW,malware
0,071e8c3f8922e186e57548cd4c703a5d,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,1
1,33f8e6d08a6aae939f25a8e0d63dd523,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,1
2,b68abd064e975e1c6d5f25e748663076,1,1,1,1,1,0,1,0,1,...,0,1,0,0,0,0,0,0,0,1
3,72049be7bd30ea61297ea624ae198067,1,1,1,1,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,1
4,c9b3700a77facf29172f32df6bc77f48,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,1


## Replacing strings with their hash value

In [3]:
dataframe['hash'] = [abs(hash(i)) % (10**8) for i in dataframe['hash']]
dataframe.head()

Unnamed: 0,hash,GetProcAddress,ExitProcess,WriteFile,GetLastError,CloseHandle,FreeLibrary,Sleep,GetStdHandle,MultiByteToWideChar,...,bind,RegEnumKeyExA,WinHttpOpen,_controlfp,WinExec,GetSecurityDescriptorDacl,FindFirstFreeAce,GetTimeFormatW,LookupAccountSidW,malware
0,7824973,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,1
1,56415367,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,1
2,91158083,1,1,1,1,1,0,1,0,1,...,0,1,0,0,0,0,0,0,0,1
3,90314087,1,1,1,1,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,1
4,13277978,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,1


## Data normalization

In [4]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

X = dataframe.drop("malware",axis=1).to_numpy()
X = MinMaxScaler().fit_transform(X) 

y = dataframe["malware"].copy().to_numpy()

## Function for fitting and evaluating machine learning models

In [5]:
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score

In [6]:
def metrics_calc(model, X, y, n_splits=10, average='binary'):
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True)
    precision = np.empty(n_splits, dtype=float)
    recall = np.empty(n_splits, dtype=float)
    accuracy = np.empty(n_splits, dtype=float)
    f1 = np.empty(n_splits, dtype=float)
    for i, (train_indexes, true_indexes) in enumerate(skf.split(X, y)):
        X_train = X[train_indexes]
        y_train = y[train_indexes]        
        X_true = X[true_indexes]
        y_true = y[true_indexes]
        
        model.fit(X_train, y_train)
        y_pred = model.predict(X_true)
        
        precision[i] = precision_score(y_true, y_pred, average=average)
        recall[i] = recall_score(y_true, y_pred, average=average)
        accuracy[i] = accuracy_score(y_true, y_pred)
        f1[i] = f1_score(y_true, y_pred, average=average)
    return [precision, recall, accuracy, f1]

## Import basic machine learning algorithms

In [7]:
from sklearn import linear_model # LogisticRegression
from sklearn import neighbors # KNeighborsClassifier
from sklearn import svm # SVC
from sklearn import tree # DecisionTreeClassifier
from sklearn import ensemble # RandomForestClassifier

## Implementing logistic regression

In [8]:
class LogisticRegression():
    def __init__(self, eta=1, max_iter=10, fit_intercept=True):
        self.eta = eta # learning rate
        self.max_iter = max_iter
        self.fit_intercept = fit_intercept
        
    def add_intercept(self, X):
        ones = np.ones((X.shape[0], 1))
        X = np.concatenate((ones, X), axis=1)
        return X
        
    def sigmoid(self, X):
        return 1.0 / (1.0 + np.exp(-X))
    
    def fit(self, X, y):   
        if self.fit_intercept:
            X = self.add_intercept(X)
        m = X.shape[0] # number of records
        n = X.shape[1] # number of features
        self.theta = np.random.randn(n) # weights
        for i in range(self.max_iter):
            h = self.sigmoid(np.dot(X, self.theta))
            grad = 2 * np.dot(X.T, (h - y)) / m
            self.theta -= self.eta * grad
    
    def predict(self, X):
        if self.fit_intercept:
            X = self.add_intercept(X)
        y_pred = np.dot(X, self.theta)
        y_pred = self.sigmoid(y_pred)
        y_pred = np.where(y_pred >= 0.5,1,0)
        return y_pred

## Fitting and evaluating logistic regression models

In [9]:
lr = linear_model.LogisticRegression(solver='lbfgs', max_iter=100)
metrics = metrics_calc(lr, X, y)
print("Logistic regression from sklearn.linear_model")
print("Precision score is %0.3f (+- %0.3f)"%(metrics[0].mean(), metrics[0].std()))
print("Recall score is %0.3f (+- %0.3f)"%(metrics[1].mean(), metrics[1].std()))
print("Accuracy score is %0.3f (+- %0.3f)"%(metrics[2].mean(), metrics[2].std()))
print("F1 score is %0.3f (+- %0.3f)\n"%(metrics[3].mean(), metrics[3].std()))

lr = LogisticRegression(max_iter=100)
metrics = metrics_calc(lr, X, y)
print("My implementation of logistic regression")
print("Precision score is %0.3f (+- %0.3f)"%(metrics[0].mean(), metrics[0].std()))
print("Recall score is %0.3f (+- %0.3f)"%(metrics[1].mean(), metrics[1].std()))
print("Accuracy score is %0.3f (+- %0.3f)"%(metrics[2].mean(), metrics[2].std()))
print("F1 score is %0.3f (+- %0.3f)\n"%(metrics[3].mean(), metrics[3].std()))

Logistic regression from sklearn.linear_model
Precision score is 0.966 (+- 0.012)
Recall score is 0.989 (+- 0.009)
Accuracy score is 0.957 (+- 0.013)
F1 score is 0.978 (+- 0.007)

My implementation of logistic regression
Precision score is 0.970 (+- 0.014)
Recall score is 0.975 (+- 0.014)
Accuracy score is 0.947 (+- 0.018)
F1 score is 0.972 (+- 0.009)



## Implementing k-Nearest Neighbors

In [10]:
class KNeighborsClassifier():
    def __init__(self, n_neighbors=20):
        self.n_neighbors = n_neighbors
    
    def fit(self, X, y):
        self.X = X
        self.y = y
        
    def distanceMetric(self, x, y):
        nx = x.size
        d = sum(abs(x[i] - y[i]) for i in range(nx))
        return d
    
    def predict(self, X_true):
        classes = list(set(self.y))
        m_train = self.X.shape[0] # X_train
        m_true = X_true.shape[0]
        n = X_true.shape[1]
        y_pred = np.empty(m_true)
        for i in range(m_true): # m
            D = [self.distanceMetric(X_true[i], x) for x in self.X] # m
            indices = np.argsort(D)
            # Относительное большинство
            indices = indices[:self.n_neighbors]
            num_objs_of_each_class = [np.sum(self.y[indices] == c) for c in classes]
            position = np.argmax(num_objs_of_each_class)
            y_pred[i] = classes[position]
        return y_pred
    

## Fitting and evaluating k-Nearest Neighbors models

In [11]:
knn = neighbors.KNeighborsClassifier(n_neighbors=3)
metrics = metrics_calc(knn, X, y)
print("K-nearest neighbors from sklearn.neighbors")
print("Precision score is %0.3f (+- %0.3f)"%(metrics[0].mean(), metrics[0].std()))
print("Recall score is %0.3f (+- %0.3f)"%(metrics[1].mean(), metrics[1].std()))
print("Accuracy score is %0.3f (+- %0.3f)"%(metrics[2].mean(), metrics[2].std()))
print("F1 score is %0.3f (+- %0.3f)\n"%(metrics[3].mean(), metrics[3].std()))

knn = KNeighborsClassifier(n_neighbors=3)
metrics = metrics_calc(knn, X, y)
print("My implementation of k-nearest neighbors")
print("Precision score is %0.3f (+- %0.3f)"%(metrics[0].mean(), metrics[0].std()))
print("Recall score is %0.3f (+- %0.3f)"%(metrics[1].mean(), metrics[1].std()))
print("Accuracy score is %0.3f (+- %0.3f)"%(metrics[2].mean(), metrics[2].std()))
print("F1 score is %0.3f (+- %0.3f)\n"%(metrics[3].mean(), metrics[3].std()))

K-nearest neighbors from sklearn.neighbors
Precision score is 0.978 (+- 0.011)
Recall score is 0.967 (+- 0.011)
Accuracy score is 0.948 (+- 0.006)
F1 score is 0.973 (+- 0.003)

My implementation of k-nearest neighbors
Precision score is 0.975 (+- 0.007)
Recall score is 0.966 (+- 0.023)
Accuracy score is 0.944 (+- 0.022)
F1 score is 0.970 (+- 0.012)



## Implementing support vector machine (SVM)

In [12]:
import math
class SVM:
    def __init__(self, alpha=0.001, eta=0.1, max_iter=50, tol=0.0001, fit_intercept=True):
        self.alpha = alpha
        self.eta = eta
        self.max_iter = max_iter
        self.tol = tol
        self.fit_intercept = fit_intercept
        
    def add_intercept(self, X):
        ones = np.ones((X.shape[0], 1))
        X = np.concatenate((ones, X), axis=1)
        return X
    
    def fit(self, X, y, verbose=False):
        if self.fit_intercept:
            X = self.add_intercept(X)
        y = np.where(y,1,-1)
        m = X.shape[0]
        n = X.shape[1]
        self.theta = np.random.randn(n)        
        prev_loss = math.inf
        for iteration in range(self.max_iter):
            loss = 0
            for i, x in enumerate(X):
                margin = y[i] * np.dot(x, self.theta)
                grad = self.alpha * self.theta / self.max_iter
                if margin < 1:
                    grad -= y[i] * x
                self.theta -= self.eta * grad
                loss += self.soft_margin_loss(x, y[i])
            if abs(prev_loss - loss) <= self.tol:
                return
            prLoss = loss
            
    def predict(self, X):
        if self.fit_intercept:
            X = self.add_intercept(X)
        y_pred = np.sign(np.dot(X, self.theta))
        y_pred = np.where(y_pred == 1,1,0)
        return y_pred   
    
    def hinge_loss(self, X, y):
        return max(0, 1 - np.dot(y, np.dot(X, self.theta)))
    
    def soft_margin_loss(self, X, y):
        return self.hinge_loss(X, y) + self.alpha * np.dot(self.theta, self.theta) / 2

## Fitting and evaluating support vector machine (SVM) models

In [13]:
sv = svm.SVC(kernel='poly')
metrics = metrics_calc(sv, X, y)
print("Support vector from sklearn.svm")
print("Precision score is %0.3f (+- %0.3f)"%(metrics[0].mean(), metrics[0].std()))
print("Recall score is %0.3f (+- %0.3f)"%(metrics[1].mean(), metrics[1].std()))
print("Accuracy score is %0.3f (+- %0.3f)"%(metrics[2].mean(), metrics[2].std()))
print("F1 score is %0.3f (+- %0.3f)\n"%(metrics[3].mean(), metrics[3].std()))

sv = SVM()
metrics = metrics_calc(sv, X, y)
print("My implementation of support vector")
print("Precision score is %0.3f (+- %0.3f)"%(metrics[0].mean(), metrics[0].std()))
print("Recall score is %0.3f (+- %0.3f)"%(metrics[1].mean(), metrics[1].std()))
print("Accuracy score is %0.3f (+- %0.3f)"%(metrics[2].mean(), metrics[2].std()))
print("F1 score is %0.3f (+- %0.3f)\n"%(metrics[3].mean(), metrics[3].std()))

Support vector from sklearn.svm
Precision score is 0.953 (+- 0.006)
Recall score is 0.992 (+- 0.017)
Accuracy score is 0.945 (+- 0.013)
F1 score is 0.972 (+- 0.007)

My implementation of support vector
Precision score is 0.980 (+- 0.016)
Recall score is 0.964 (+- 0.015)
Accuracy score is 0.947 (+- 0.017)
F1 score is 0.972 (+- 0.009)



## Implementing decision tree

In [14]:
class Node():
    def __init__(self, predicted_class):
        self.predicted_class = predicted_class
        self.column = 0
        self.threshold = 0
        self.left = None
        self.right = None

class DecisionTreeClassifier():
    def __init__(self, max_depth=20):
        self.max_depth = max_depth
        
    def split(self, X, y, num_parent_objs):
        m = X.shape[0]
        min_gini = 1.0 - sum((x / m) ** 2 for x in num_parent_objs)
        col_split, threshold_split = None, None
        for col in self.columns:
            indices = np.argsort(X[:, col])
            thresholds = X[:, col][indices]
            classes = y[indices]
            num_left_objs = np.zeros(len(self.classes))
            num_right_objs = num_parent_objs.copy()
            for i in range(1, m):
                c = classes[i - 1]
                num_left_objs[c] += 1
                num_right_objs[c] -= 1
                if thresholds[i] == thresholds[i - 1]:
                    continue
                gini_left = 1.0 - sum((x / i) ** 2 for x in num_left_objs)
                gini_right = 1.0 - sum((x / (m - i)) ** 2 for x in num_right_objs)
                gini = i * gini_left / m + (m - i) * gini_right / m
                if min_gini > gini:
                    min_gini = gini
                    col_split = col
                    threshold_split = (thresholds[i] + thresholds[i - 1]) / 2
        return col_split, threshold_split
        
    def build_tree(self, X, y, depth=0):
        num_objs = [np.sum(y == i) for i in self.classes]        
        predicted_class = np.argmax(num_objs)
        node = Node(predicted_class=predicted_class)
        if depth < self.max_depth:
            col_split, threshold_split = self.split(X, y, num_parent_objs=num_objs)
            if col_split:
                node.column = col_split
                node.threshold = threshold_split
                indices_left = np.where(X[:, col_split] < threshold_split)
                indices_right = np.where(X[:, col_split] >= threshold_split)
                X_left, y_left = X[indices_left], y[indices_left]
                X_right, y_right = X[indices_right], y[indices_right]
                node.left = self.build_tree(X_left, y_left, depth + 1)
                node.right = self.build_tree(X_right, y_right, depth + 1)
        return node
    
    def fit(self, X, y):
        self.classes = set(y)
        m = X.shape[0]
        n = X.shape[1]
        self.columns = np.random.randint(low=0, high=n-1, size=n)
        size = np.random.randint(low=2,high=m-1)
        indices = np.random.randint(low=0, high=m-1, size=size)
        X = X[indices]
        y = y[indices]
        self.root = self.build_tree(X, y)

    def iterative_tree_search(self, row):
        node = self.root
        while node.left:
            if row[node.column] < node.threshold:
                node = node.left
            else:
                node = node.right
        return node.predicted_class
    
    def predict(self, X):
        y_pred = np.array([self.iterative_tree_search(row) for row in X])
        return y_pred

## Fitting and evaluating decision tree models

In [15]:
dt = tree.DecisionTreeClassifier(max_depth=20)
metrics = metrics_calc(dt, X, y)
print("Decision tree from sklearn.tree")
print("Precision score is %0.3f (+- %0.3f)"%(metrics[0].mean(), metrics[0].std()))
print("Recall score is %0.3f (+- %0.3f)"%(metrics[1].mean(), metrics[1].std()))
print("Accuracy score is %0.3f (+- %0.3f)"%(metrics[2].mean(), metrics[2].std()))
print("F1 score is %0.3f (+- %0.3f)\n"%(metrics[3].mean(), metrics[3].std()))

dt = DecisionTreeClassifier(max_depth=20)
metrics = metrics_calc(dt, X, y)
print("My implementation of decision tree")
print("Precision score is %0.3f (+- %0.3f)"%(metrics[0].mean(), metrics[0].std()))
print("Recall score is %0.3f (+- %0.3f)"%(metrics[1].mean(), metrics[1].std()))
print("Accuracy score is %0.3f (+- %0.3f)"%(metrics[2].mean(), metrics[2].std()))
print("F1 score is %0.3f (+- %0.3f)\n"%(metrics[3].mean(), metrics[3].std()))

Decision tree from sklearn.tree
Precision score is 0.972 (+- 0.010)
Recall score is 0.972 (+- 0.019)
Accuracy score is 0.946 (+- 0.021)
F1 score is 0.972 (+- 0.011)

My implementation of decision tree
Precision score is 0.970 (+- 0.012)
Recall score is 0.975 (+- 0.021)
Accuracy score is 0.947 (+- 0.016)
F1 score is 0.972 (+- 0.008)



## Implementing random forest

In [16]:
class RandomForestClassifier():
    def __init__(self, max_depth=20, n_estimators=20):
        self.max_depth = max_depth
        self.n_estimators = n_estimators
        self.forest = [DecisionTreeClassifier(max_depth=self.max_depth) for i in range(self.n_estimators)]

    def fit(self, X, y):
        self.classes = set(y)
        for i in range(self.n_estimators):
            self.forest[i].fit(X, y)

    def predict(self, X):
        m = X.shape[0]
        n = X.shape[1]
        y_preds = np.array([self.forest[i].predict(X) for i in range(self.n_estimators)])
        y_pred = np.empty(m, dtype=int)
        for i in range(m):
            num_objs = [np.sum(y_preds.T[i] == j) for j in self.classes]
            predicted_class = np.argmax(num_objs)
            y_pred[i] = predicted_class
        return y_pred

## Fitting and evaluating random forest models

In [17]:
rf = ensemble.RandomForestClassifier(max_depth=20, n_estimators=5)
rf = metrics_calc(rf, X, y)
print("Random forest from sklearn.ensemble")
print("Precision score is %0.3f (+- %0.3f)"%(metrics[0].mean(), metrics[0].std()))
print("Recall score is %0.3f (+- %0.3f)"%(metrics[1].mean(), metrics[1].std()))
print("Accuracy score is %0.3f (+- %0.3f)"%(metrics[2].mean(), metrics[2].std()))
print("F1 score is %0.3f (+- %0.3f)\n"%(metrics[3].mean(), metrics[3].std()))

rf = RandomForestClassifier(max_depth=20, n_estimators=5)
metrics = metrics_calc(rf, X, y)
print("My implementation of random forest")
print("Precision score is %0.3f (+- %0.3f)"%(metrics[0].mean(), metrics[0].std()))
print("Recall score is %0.3f (+- %0.3f)"%(metrics[1].mean(), metrics[1].std()))
print("Accuracy score is %0.3f (+- %0.3f)"%(metrics[2].mean(), metrics[2].std()))
print("F1 score is %0.3f (+- %0.3f)\n"%(metrics[3].mean(), metrics[3].std()))

Random forest from sklearn.ensemble
Precision score is 0.970 (+- 0.012)
Recall score is 0.975 (+- 0.021)
Accuracy score is 0.947 (+- 0.016)
F1 score is 0.972 (+- 0.008)

My implementation of random forest
Precision score is 0.969 (+- 0.012)
Recall score is 0.976 (+- 0.013)
Accuracy score is 0.947 (+- 0.009)
F1 score is 0.972 (+- 0.005)

