In [66]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from io import StringIO
from sklearn.impute import SimpleImputer

In [67]:
malwareData=pd.read_csv("MalwareDataset.csv")

In [68]:
malwareData.head()

Unnamed: 0,hash,millisecond,classification,state,usage_counter,prio,static_prio,normal_prio,policy,vm_pgoff,...,nivcsw,min_flt,maj_flt,fs_excl_counter,lock,utime,stime,gtime,cgtime,signal_nvcsw
0,42fb5e2ec009a05ff5143227297074f1e9c6c3ebb9c914...,0,malware,0,0,3069378560,14274,0,0,0,...,0,0,120,0,3204448256,380690,4,0,0,0
1,42fb5e2ec009a05ff5143227297074f1e9c6c3ebb9c914...,1,malware,0,0,3069378560,14274,0,0,0,...,0,0,120,0,3204448256,380690,4,0,0,0
2,42fb5e2ec009a05ff5143227297074f1e9c6c3ebb9c914...,2,malware,0,0,3069378560,14274,0,0,0,...,0,0,120,0,3204448256,380690,4,0,0,0
3,42fb5e2ec009a05ff5143227297074f1e9c6c3ebb9c914...,3,malware,0,0,3069378560,14274,0,0,0,...,0,0,120,0,3204448256,380690,4,0,0,0
4,42fb5e2ec009a05ff5143227297074f1e9c6c3ebb9c914...,4,malware,0,0,3069378560,14274,0,0,0,...,0,0,120,0,3204448256,380690,4,0,0,0


In [69]:
malwareData.shape

(100000, 35)

In [70]:
malwareData.describe()

Unnamed: 0,millisecond,state,usage_counter,prio,static_prio,normal_prio,policy,vm_pgoff,vm_truncate_count,task_size,...,nivcsw,min_flt,maj_flt,fs_excl_counter,lock,utime,stime,gtime,cgtime,signal_nvcsw
count,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,...,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0
mean,499.5,157768.3,0.0,3069706000.0,18183.90007,0.0,0.0,0.0,15312.73951,0.0,...,32.99116,2.05313,117.92024,1.10919,3204448000.0,385415.45197,4.05931,1.66142,0.0,0.0
std,288.676434,936172.6,0.0,296306.1,4609.792765,0.0,0.0,0.0,3256.475008,0.0,...,52.730176,13.881382,3.116892,2.160466,0.0,10144.036494,0.822848,3.26304,0.0,0.0
min,0.0,0.0,0.0,3069190000.0,13988.0,0.0,0.0,0.0,9695.0,0.0,...,0.0,0.0,112.0,0.0,3204448000.0,371782.0,3.0,0.0,0.0,0.0
25%,249.75,0.0,0.0,3069446000.0,14352.0,0.0,0.0,0.0,12648.0,0.0,...,1.0,0.0,114.0,0.0,3204448000.0,378208.0,3.0,0.0,0.0,0.0
50%,499.5,0.0,0.0,3069698000.0,16159.0,0.0,0.0,0.0,15245.0,0.0,...,9.0,1.0,120.0,0.0,3204448000.0,383637.0,4.0,0.0,0.0,0.0
75%,749.25,4096.0,0.0,3069957000.0,22182.0,0.0,0.0,0.0,17663.0,0.0,...,46.0,1.0,120.0,1.0,3204448000.0,390324.0,5.0,1.0,0.0,0.0
max,999.0,43266050.0,0.0,3070222000.0,31855.0,0.0,0.0,0.0,27157.0,0.0,...,365.0,256.0,120.0,18.0,3204448000.0,421913.0,7.0,15.0,0.0,0.0


In [71]:
malwareData.columns

Index(['hash', 'millisecond', 'classification', 'state', 'usage_counter',
       'prio', 'static_prio', 'normal_prio', 'policy', 'vm_pgoff',
       'vm_truncate_count', 'task_size', 'cached_hole_size', 'free_area_cache',
       'mm_users', 'map_count', 'hiwater_rss', 'total_vm', 'shared_vm',
       'exec_vm', 'reserved_vm', 'nr_ptes', 'end_data', 'last_interval',
       'nvcsw', 'nivcsw', 'min_flt', 'maj_flt', 'fs_excl_counter', 'lock',
       'utime', 'stime', 'gtime', 'cgtime', 'signal_nvcsw'],
      dtype='object')

# Data Cleaning

In [72]:
import sklearn
from sklearn.feature_selection import SelectFromModel
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier,ExtraTreesClassifier,GradientBoostingClassifier,AdaBoostClassifier 
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score,accuracy_score,plot_confusion_matrix,auc,confusion_matrix

In [73]:
malware_mapping = {'malware':1, 'benign':0}
malwareData['classification'] = malwareData['classification'].map(malware_mapping)
malwareData['classification'].value_counts()

1    50000
0    50000
Name: classification, dtype: int64

In [74]:
data = malwareData.drop(['hash', 'classification', 'vm_truncate_count', 'shared_vm', 'exec_vm', 'nvcsw', 'maj_flt', 'utime'], axis=1)
data = data.dropna(how='any', axis=0)
target = malwareData['classification']
target.head()

0    1
1    1
2    1
3    1
4    1
Name: classification, dtype: int64

# Running Classifiers

In [75]:
X_train, X_test, y_train, y_test = train_test_split(data,target, test_size=0.2, random_state=42)

In [76]:
rf = RandomForestClassifier(n_estimators=50)
rf.fit(X_train, y_train)
score_rf = rf.score(X_test, y_test)

In [78]:
print(score_rf * 100)

100.0


In [79]:
featselect = ExtraTreesClassifier().fit(data,target)
model = SelectFromModel(featselect, prefit=True)
new_data = model.transform(data)



In [80]:
print(data.shape)

(100000, 27)


In [81]:
print(new_data.shape)

(100000, 12)
