## Feature selection using CMI

In [72]:
import pickle
import pandas as pd
import operator

In [83]:
with open(r"featureMIs.pickle", "rb") as input_file:
    MI = pickle.load(input_file)
    
with open(r"entropies.pickle", "rb") as input_file:
    entropy = pickle.load(input_file)
    
with open(r"MutualInfos.pickle", "rb") as input_file:
    MI_target = pickle.load(input_file)
# del MI_target['HasDetections']

In [74]:
print(len(MI),len(entropy))
nums = ['Census_ProcessorCoreCount',
        'Census_PrimaryDiskTotalCapacity',
        'Census_SystemVolumeTotalCapacity',
        'Census_TotalPhysicalRAM',
        'Census_InternalPrimaryDiagonalDisplaySizeInInches',
        'Census_InternalPrimaryDisplayResolutionHorizontal',
        'Census_InternalPrimaryDisplayResolutionVertical',
        'Census_InternalBatteryNumberOfCharges']
for var in nums:
    del MI_target[var]

3240 80


In [75]:
# In order to optimize memory usage it is useful to make certain features get certain types
# the dictionary below has been taken from https://www.kaggle.com/theoviel/load-the-totality-of-the-data
dtypes = {
        'MachineIdentifier':                                    'category',
        'ProductName':                                          'category',
        'EngineVersion':                                        'category',
        'AppVersion':                                           'category',
        'AvSigVersion':                                         'category',
        'IsBeta':                                               'int8',
        'RtpStateBitfield':                                     'float16',
        'IsSxsPassiveMode':                                     'int8',
        'DefaultBrowsersIdentifier':                            'float32',
        'AVProductStatesIdentifier':                            'float32',
        'AVProductsInstalled':                                  'float16',
        'AVProductsEnabled':                                    'float16',
        'HasTpm':                                               'int8',
        'CountryIdentifier':                                    'int16',
        'CityIdentifier':                                       'float32',
        'OrganizationIdentifier':                               'float16',
        'GeoNameIdentifier':                                    'float16',
        'LocaleEnglishNameIdentifier':                          'int16',
        'Platform':                                             'category',
        'Processor':                                            'category',
        'OsVer':                                                'category',
        'OsBuild':                                              'int16',
        'OsSuite':                                              'int16',
        'OsPlatformSubRelease':                                 'category',
        'OsBuildLab':                                           'category',
        'SkuEdition':                                           'category',
        'IsProtected':                                          'float16',
        'AutoSampleOptIn':                                      'int8',
        'PuaMode':                                              'category',
        'SMode':                                                'float16',
        'IeVerIdentifier':                                      'float16',
        'SmartScreen':                                          'category',
        'Firewall':                                             'float16',
        'UacLuaenable':                                         'float64', # was 'float32'
        'Census_MDC2FormFactor':                                'category',
        'Census_DeviceFamily':                                  'category',
        'Census_OEMNameIdentifier':                             'float32', # was 'float16'
        'Census_OEMModelIdentifier':                            'float32',
        'Census_ProcessorCoreCount':                            'float16',
        'Census_ProcessorManufacturerIdentifier':               'float16',
        'Census_ProcessorModelIdentifier':                      'float32', # was 'float16'
        'Census_ProcessorClass':                                'category',
        'Census_PrimaryDiskTotalCapacity':                      'float64', # was 'float32'
        'Census_PrimaryDiskTypeName':                           'category',
        'Census_SystemVolumeTotalCapacity':                     'float64', # was 'float32'
        'Census_HasOpticalDiskDrive':                           'int8',
        'Census_TotalPhysicalRAM':                              'float32',
        'Census_ChassisTypeName':                               'category',
        'Census_InternalPrimaryDiagonalDisplaySizeInInches':    'float32', # was 'float16'
        'Census_InternalPrimaryDisplayResolutionHorizontal':    'float32', # was 'float16'
        'Census_InternalPrimaryDisplayResolutionVertical':      'float32', # was 'float16'
        'Census_PowerPlatformRoleName':                         'category',
        'Census_InternalBatteryType':                           'category',
        'Census_InternalBatteryNumberOfCharges':                'float64', # was 'float32'
        'Census_OSVersion':                                     'category',
        'Census_OSArchitecture':                                'category',
        'Census_OSBranch':                                      'category',
        'Census_OSBuildNumber':                                 'int16',
        'Census_OSBuildRevision':                               'int32',
        'Census_OSEdition':                                     'category',
        'Census_OSSkuName':                                     'category',
        'Census_OSInstallTypeName':                             'category',
        'Census_OSInstallLanguageIdentifier':                   'float16',
        'Census_OSUILocaleIdentifier':                          'int16',
        'Census_OSWUAutoUpdateOptionsName':                     'category',
        'Census_IsPortableOperatingSystem':                     'int8',
        'Census_GenuineStateName':                              'category',
        'Census_ActivationChannel':                             'category',
        'Census_IsFlightingInternal':                           'float16',
        'Census_IsFlightsDisabled':                             'float16',
        'Census_FlightRing':                                    'category',
        'Census_ThresholdOptIn':                                'float16',
        'Census_FirmwareManufacturerIdentifier':                'float16',
        'Census_FirmwareVersionIdentifier':                     'float32',
        'Census_IsSecureBootEnabled':                           'int8',
        'Census_IsWIMBootEnabled':                              'float16',
        'Census_IsVirtualDevice':                               'float16',
        'Census_IsTouchEnabled':                                'int8',
        'Census_IsPenCapable':                                  'int8',
        'Census_IsAlwaysOnAlwaysConnectedCapable':              'float16',
        'Wdft_IsGamer':                                         'float16',
        'Wdft_RegionIdentifier':                                'float16',
        'HasDetections':                                        'int8'
        }
malware = pd.read_csv('../train.csv', dtype=dtypes)

In [77]:
MI_cols = [col for col in malware.columns if col not in nums and col != 'MachineIdentifier']
len(MI_cols)

74

## Forward selection algorithm (greedy feature selection)
Initialize set of selected variables $S={\emptyset}$ and $X$ the set of all variables being considered for analysis.

**Step 1.** *Selection of the first feature*

   Find feature $X^* \in X$ that maximizes $I(C,X_i)$;
   
   set $X = X\setminus \{X^*\}, S=\{X^*\}.$ Where $C$ is the target variable $C$.
   
**Step 2.** *greedy feature selection*

Find feature $X^+ \in X$ according to:

$$X^+ = \text{argmax}_{X_i\in X\setminus S}\left\{I(C,X_i) - \max_{X_s\in S}{CU_{X_i,X_s}I(C,X_s)}\right\}$$

Where $CU_{X_i,X_s}= \cfrac{I(X_i,X_s)}{H(X_s)}$

and finally, set $X = X\setminus \{X^+\}, S=S\cup\{X^+\}.$

**Step 3.** Go to step 2.

In [78]:
# utiliy functions
def get_MI(X_i,X_s):
    val = 0
    try:
        val =  MI[f"{X_i},{X_s}"]
    except:
        val =  MI[f"{X_s},{X_i}"]
    
    return val
def CU(X_i,X_s):
    '''
    X_i,X_s: labels of columns representing the variables, respectively.
    
    '''
    mi = get_MI(X_i,X_s)
    H  = entropy[X_s]
    
    return mi/H

In [84]:
Selected_features = []
features          = MI_cols[:-1]

# step 1
Selected_features.append(max(MI_target.items(), key=operator.itemgetter(1))[0])

# step 2

def get_inner_max(X_i,Selected):
    max_val=-1e10
    for X_s in Selected:
        value = CU(X_i,X_s)*MI_target[X_s]
        if value >max_val:
            max_val = value
            
    return max_val

def get_next_var(Selected):
    best_var = None
    max_val  = -1e10
    for X_i in features:
        if X_i in Selected:
            continue
        value = MI_target[X_i] - get_inner_max(X_i,Selected)
        if value >max_val:
            max_val  = value
            best_var = X_i
    
    return best_var

In [85]:
#to define threshold
criteria = True
n_vars_selected =1
while criteria:
    
    next_var = get_next_var(Selected_features)
    Selected_features.append(next_var)
    
    n_vars_selected+=1
    if n_vars_selected >10:
        break

In [86]:
Selected_features

['Census_SystemVolumeTotalCapacity',
 'Census_PowerPlatformRoleName',
 'Census_ChassisTypeName',
 'Census_InternalBatteryType',
 'SmartScreen',
 'AVProductStatesIdentifier',
 'EngineVersion',
 'AVProductsInstalled',
 'AppVersion',
 'Census_OEMModelIdentifier',
 'Processor']

# extra things


In [152]:
# lightgbm for classification
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from lightgbm import LGBMClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import train_test_split
from matplotlib import pyplot
import re


#using sample of dataframe
malware_sample = malware.sample(10000)
malware_sample.drop('MachineIdentifier', axis=1, inplace=True)
malware_sample.drop('Census_InternalBatteryType', axis=1, inplace=True)
# define dataset
malware_d = pd.get_dummies(malware_sample,prefix=malware_sample.select_dtypes(include='category').columns.values)
malware_d = malware_d.rename(columns={'OsVer_10.0.48.0': 'OsVer_10.0.48.00'})
malware_d = malware_d.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))

X, y = malware_d.loc[:, malware_d.columns != 'HasDetections'], malware_d["HasDetections"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)
# # evaluate the model
model = LGBMClassifier(n_estimators = 200)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
n_scores = cross_val_score(model, X_train, y_train, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
print('Accuracy: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))
# fit the model 
model = LGBMClassifier()
model.fit(X_train, y_train)
# # make predictions

yhat = model.predict(X_test)
print('accuracy on test: %f' % np.mean(yhat==y_test.values))

Accuracy: 0.610 (0.014)
error rate: 0.632500
