In [12]:
#Machine learning / Anti-malware using Decision Tree classifier
#Source: https://www.youtube.com/watch?v=Kf9VD1os_pY&list=PL2iM-fIRjbTBFazzQ5uEzeASpmP8o40y1&index=58

import pandas as pd
import numpy as np
import pickle
import sklearn.ensemble as ske
from sklearn import tree, linear_model
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel
from sklearn.externals import joblib

#importing the dataset
data = pd.read_csv('https://raw.githubusercontent.com/akky2892/Machine-Learning-The-future/master/Anti-Malware%20Using%20Machine%20Learning/data.csv', sep = '|')
X = data.drop(['Name', 'md5', 'legitimate'], axis = 1).values
y = data['legitimate'].values


print('Researching important feature based on %i total features\n' % X.shape[1])

#Feature seelction using Trees Classifier
fsel = ske.ExtraTreesClassifier().fit(X, y)
model = SelectFromModel(fsel, prefit = True)
X_new = model.transform(X)
nb_features = X_new.shape[1]

#Splitting the dataset into the Training and Test set
X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size = 0.2)

features = []


print('\n%i features identified as important:' % nb_features)

indices = np.argsort(fsel.feature_importances_)[::-1][:nb_features]

for f in range(nb_features):
    print("%d. feature %s (%f)" % (f + 1, data.columns[2+indices[f]], fsel.feature_importances_[indices[f]]))

for f in sorted(np.argsort(fsel.feature_importances_)[::-1][:nb_features]):
    features.append(data.columns[2+f])
    
#Algorithm comparison
algorithms = {
    'DecisionTree': tree.DecisionTreeClassifier(max_depth = 10), 
    'RandomForest': ske.RandomForestClassifier(n_estimators = 50)
}

results = {}
print('Now testing algorithms...')

#Fitting Classification algorithms to the training set
for algo in algorithms:
    clf = algorithms[algo]
    clf.fit(X_train, y_train)
    score = clf.score(X_test, y_test)
    print("%s : %f %%" % (algo, score*100))
    results[algo] = score
    
winner = max(results, key = results.get)

print('\nWinner algorithm is %s with a %f %% success' % (winner, results[winner]*100))

# #Save the algorithm and the feature list for later predictions
# print('Saving the algorithm and feature list in classifier directory...')

# joblib.dump(algorithms[winner], 'classifier/classifier.pkl')
# open('classifier/features.pkl', 'w').write(pickle.dump(features))
# print('Saved')

#Predicting the test set results
y_pred = clf.predict(X_test)

#Making the confusion matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)


Researching important feature based on 54 total features






11 features identified as important:
1. feature DllCharacteristics (0.225751)
2. feature SectionsMaxEntropy (0.075424)
3. feature Subsystem (0.074557)
4. feature Machine (0.068546)
5. feature MajorSubsystemVersion (0.066613)
6. feature Characteristics (0.066214)
7. feature ResourcesMaxEntropy (0.066202)
8. feature SizeOfOptionalHeader (0.060112)
9. feature VersionInformationSize (0.049440)
10. feature MajorOperatingSystemVersion (0.033555)
11. feature ImageBase (0.026292)
Now testing algorithms...
DecisionTree : 99.029337 %
RandomForest : 99.340819 %

Winner algorithm is RandomForest with a 99.340819 % success
