# Windows Malware Prediction
### Second round.

In [None]:
import random
import numpy as np
from datetime import datetime

import pandas as pd
from pandas.api.types import is_string_dtype
from pandas.api.types import is_numeric_dtype
import matplotlib.pyplot as plt
import seaborn as sns

from scipy import stats
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn import tree
from sklearn import metrics


# Plot styling and pandas display options used throughout the notebook.
plt.style.use('ggplot')
pd.set_option('display.max_rows', 100)

# Seed passed to every split/estimator, number of CSV rows to read, and the label column.
RANDOM_STATE = 42
ROWS = 500000
TARGET = "HasDetections"

# Settings for the bulk treatment of the categorical columns
limit = 300 # Number of unique values (columns after OHE)
start = 50 # Minimum number of samples per unique value.
step = 10 # Step used inside the loop to keep raising the number of samples.

In [None]:
# Column name -> dtype mapping handed to pd.read_csv.
# Entries marked "was ..." record the narrower dtype that was used previously.
dtypes = {
        'MachineIdentifier':                                    'category',
        'ProductName':                                          'category',
        'EngineVersion':                                        'category',
        'AppVersion':                                           'category',
        'AvSigVersion':                                         'category',
        'IsBeta':                                               'int8',
        'RtpStateBitfield':                                     'float16',
        'IsSxsPassiveMode':                                     'int8',
        'DefaultBrowsersIdentifier':                            'float32',
        'AVProductStatesIdentifier':                            'float32',
        'AVProductsInstalled':                                  'float16',
        'AVProductsEnabled':                                    'float16',
        'HasTpm':                                               'int8',
        'CountryIdentifier':                                    'int16',
        'CityIdentifier':                                       'float32',
        'OrganizationIdentifier':                               'float16',
        'GeoNameIdentifier':                                    'float16',
        'LocaleEnglishNameIdentifier':                          'int16',
        'Platform':                                             'category',
        'Processor':                                            'category',
        'OsVer':                                                'category',
        'OsBuild':                                              'int16',
        'OsSuite':                                              'int16',
        'OsPlatformSubRelease':                                 'category',
        'OsBuildLab':                                           'category',
        'SkuEdition':                                           'category',
        'IsProtected':                                          'float16',
        'AutoSampleOptIn':                                      'int8',
        'PuaMode':                                              'category',
        'SMode':                                                'float16',
        'IeVerIdentifier':                                      'float16',
        'SmartScreen':                                          'category',
        'Firewall':                                             'float16',
        'UacLuaenable':                                         'float64', # was 'float32'
        'Census_MDC2FormFactor':                                'category',
        'Census_DeviceFamily':                                  'category',
        'Census_OEMNameIdentifier':                             'float32', # was 'float16'
        'Census_OEMModelIdentifier':                            'float32',
        'Census_ProcessorCoreCount':                            'float16',
        'Census_ProcessorManufacturerIdentifier':               'float16',
        'Census_ProcessorModelIdentifier':                      'float32', # was 'float16'
        'Census_ProcessorClass':                                'category',
        'Census_PrimaryDiskTotalCapacity':                      'float64', # was 'float32'
        'Census_PrimaryDiskTypeName':                           'category',
        'Census_SystemVolumeTotalCapacity':                     'float64', # was 'float32'
        'Census_HasOpticalDiskDrive':                           'int8',
        'Census_TotalPhysicalRAM':                              'float32',
        'Census_ChassisTypeName':                               'category',
        'Census_InternalPrimaryDiagonalDisplaySizeInInches':    'float32', # was 'float16'
        'Census_InternalPrimaryDisplayResolutionHorizontal':    'float32', # was 'float16'
        'Census_InternalPrimaryDisplayResolutionVertical':      'float32', # was 'float16'
        'Census_PowerPlatformRoleName':                         'category',
        'Census_InternalBatteryType':                           'category',
        'Census_InternalBatteryNumberOfCharges':                'float64', # was 'float32'
        'Census_OSVersion':                                     'category',
        'Census_OSArchitecture':                                'category',
        'Census_OSBranch':                                      'category',
        'Census_OSBuildNumber':                                 'int16',
        'Census_OSBuildRevision':                               'int32',
        'Census_OSEdition':                                     'category',
        'Census_OSSkuName':                                     'category',
        'Census_OSInstallTypeName':                             'category',
        'Census_OSInstallLanguageIdentifier':                   'float16',
        'Census_OSUILocaleIdentifier':                          'int16',
        'Census_OSWUAutoUpdateOptionsName':                     'category',
        'Census_IsPortableOperatingSystem':                     'int8',
        'Census_GenuineStateName':                              'category',
        'Census_ActivationChannel':                             'category',
        'Census_IsFlightingInternal':                           'float16',
        'Census_IsFlightsDisabled':                             'float16',
        'Census_FlightRing':                                    'category',
        'Census_ThresholdOptIn':                                'float16',
        'Census_FirmwareManufacturerIdentifier':                'float16',
        'Census_FirmwareVersionIdentifier':                     'float32',
        'Census_IsSecureBootEnabled':                           'int8',
        'Census_IsWIMBootEnabled':                              'float16',
        'Census_IsVirtualDevice':                               'float16',
        'Census_IsTouchEnabled':                                'int8',
        'Census_IsPenCapable':                                  'int8',
        'Census_IsAlwaysOnAlwaysConnectedCapable':              'float16',
        'Wdft_IsGamer':                                         'float16',
        'Wdft_RegionIdentifier':                                'float16',
        'HasDetections':                                        'int8'
        }

We import only a subset of the columns, listed in `data_cols` below.

In [None]:
# Columns actually read from the CSV (passed to read_csv as `usecols`).
# Each name appears exactly once: the list previously carried
# 'IeVerIdentifier' twice.
data_cols = [
    'MachineIdentifier',
    'ProductName',
    'EngineVersion',
    'AppVersion',
    'AvSigVersion',
    'RtpStateBitfield',
    'AVProductStatesIdentifier',
    'AVProductsInstalled',
    'AVProductsEnabled',
    'HasTpm',
    'Census_OSVersion',
    'Census_OSArchitecture',
    'Census_OSBranch',
    'Census_OSBuildNumber',
    'Census_OSBuildRevision',
    'Census_OSEdition',
    'IsProtected',
    'IeVerIdentifier',
    'SmartScreen',
    "DefaultBrowsersIdentifier",
    'Firewall',
    "OsBuildLab",
    "OsPlatformSubRelease",
    'Census_ProcessorCoreCount',
    'Census_ProcessorManufacturerIdentifier',
    'Census_ProcessorModelIdentifier',
    'Census_PrimaryDiskTotalCapacity',
    'Census_PrimaryDiskTypeName',
    'Census_TotalPhysicalRAM',
    'HasDetections',
    'CountryIdentifier'
    ]


In [None]:
# Keep an untouched copy of the full mapping, then prune `dtypes` in place so
# it only describes the columns listed in `data_cols`.
dtypes_data = dict(dtypes)
for unused_name in [name for name in dtypes_data if name not in data_cols]:
    del dtypes[unused_name]

In [None]:
# Path of the training CSV that is loaded below.
datafile = "../input/microsoft-malware-prediction/train.csv"

In [None]:
# The skiprows callable is truthy for every row index that is not a multiple
# of 4, so only row 0 (the header) and every fourth data row are read, up to
# ROWS rows in total.
read_options = dict(
    nrows=ROWS,
    skiprows=lambda i: i % 4,
    low_memory=False,
    dtype=dtypes,
    usecols=data_cols,
)
df = pd.read_csv(datafile, **read_options)

In [None]:
# Compact (non-verbose) summary of the loaded frame.
df.info(verbose=False)

In [None]:
# Show the first row transposed, so every column is listed vertically.
df.head(1).T

Use MachineIdentifier as the index.

In [None]:
# Move MachineIdentifier out of the columns and use it as the row index (in place).
df.set_index("MachineIdentifier", inplace=True)

HasDetections is our Target.

In [None]:
# Bar chart with the number of rows per value of the target column.
plt.figure(figsize=(6, 5))
p = sns.countplot(x=TARGET, data=df)

In [None]:
# Number of rows for each value of the target.
df[TARGET].value_counts()

### Some good functions!!!

In [None]:
def isBool(nodel=False):
    """Return the names of the columns of the global ``df`` holding only 0/1 values.

    Nulls are dropped before the check, so a column with nulls still
    qualifies as long as every remaining value is 0 or 1.

    Args:
        nodel: Optional column name to leave out of the result (for example
            the target). A falsy value removes nothing.

    Returns:
        list: Column names, in the order they appear in ``df``.
    """
    bool_cols = [col for col in df if np.isin(df[col].dropna().unique(), [0, 1]).all()]
    # Only remove the excluded column when it was actually detected; calling
    # list.remove() on an absent name would raise ValueError.
    if nodel and nodel in bool_cols:
        bool_cols.remove(nodel)
    return bool_cols

def generate_dummies(dataframe, column_name):
    """One-hot encode ``column_name``.

    Returns a new frame in which ``column_name`` is replaced by its dummy
    columns (prefixed with the column name), appended after the remaining
    columns. The frame passed in is not modified.
    """
    encoded = pd.get_dummies(dataframe[column_name], prefix=column_name)
    widened = pd.concat([dataframe, encoded], axis=1)
    return widened.drop(columns=column_name)

def explore_cat_values(dataframe, column, target_column):
    """Summarise ``target_column`` for every distinct value of ``column``.

    Args:
        dataframe: Frame holding both columns.
        column: Column whose values become the index of the result.
        target_column: Column that is counted and averaged per value.

    Returns:
        DataFrame indexed by the values of ``column`` with the columns
        ``n_machines`` (rows with that value), ``pct_rows`` (share of all
        rows, nulls included in the denominator) and ``mean_detection``
        (mean of ``target_column``).
    """
    # "mean" is passed by name: handing the np.mean callable to pandas
    # aggregations is deprecated and only resolves to the same method anyway.
    _results_df = dataframe.pivot_table(index=column, values=target_column, aggfunc=[len, "mean"])
    _results_df.columns = ['n_machines', 'mean_detection']
    # Assignment aligns on the index, so each value picks up its own share.
    _results_df['pct_rows'] = dataframe[column].value_counts(normalize=True, dropna=False)
    _results_df = _results_df[['n_machines', 'pct_rows', 'mean_detection']]
    return _results_df

def setOthers(dataframe, column, num_values):
    """Keep the ``num_values`` most frequent values of ``column``; group the rest.

    Every value outside the top ``num_values`` (and every null) is replaced
    by the category ``'Others'``.

    Args:
        dataframe: Frame holding ``column``. As a side effect its column is
            overwritten with the reduced categorical, where the grouped
            values are still null.
        column: Name of the column to reduce.
        num_values: How many of the most frequent values to keep.

    Returns:
        Series: The categorical column with nulls filled with ``'Others'``.
    """
    top_categories = dataframe[column].value_counts().head(num_values)
    top_categories_list = top_categories.index.to_list()
    # A column that already contains the literal 'Others' among its top
    # values must not get it twice: categories have to be unique.
    if 'Others' not in top_categories_list:
        top_categories_list.append('Others')
    dataframe[column] = pd.Categorical(dataframe[column], categories=top_categories_list)
    return dataframe[column].fillna('Others')

def setOthersPataNegra(dataframe, column, target_column, num_rows_min, top_n):
    """Keep the values of ``column`` with the most extreme mean target.

    Only values with more than ``num_rows_min`` rows are considered. Among
    them, the ``top_n`` with the highest and the ``top_n`` with the lowest
    mean of ``target_column`` are kept; everything else (nulls included)
    becomes ``'Others'``.

    Args:
        dataframe: Frame holding both columns. As a side effect its column
            is overwritten with the reduced categorical, where the grouped
            values are still null.
        column: Name of the column to reduce.
        target_column: Column whose mean ranks the values.
        num_rows_min: A value needs strictly more rows than this to be kept.
        top_n: Number of values taken from each end of the ranking. Zero or
            negative keeps none.

    Returns:
        Series: The categorical column with nulls filled with ``'Others'``.
    """
    results_by_category = explore_cat_values(dataframe, column, target_column)
    # Filter and rank once; both ends of the ranking are sliced from it.
    eligible = results_by_category[results_by_category['n_machines'] > num_rows_min]
    ranked = eligible.sort_values(by='mean_detection').index.to_list()
    top_n = max(top_n, 0)
    last_categories = ranked[:top_n]
    first_categories = ranked[-top_n:] if top_n else []
    # The two ends overlap when fewer than 2 * top_n values are eligible, and
    # the column may already contain 'Others'. Categories must be unique, so
    # de-duplicate while preserving order.
    top_categories_list = list(dict.fromkeys(first_categories + last_categories + ['Others']))
    dataframe[column] = pd.Categorical(dataframe[column], categories=top_categories_list)
    return dataframe[column].fillna('Others')

def simpleNaN(column, filler):
    """Replace the nulls of ``column`` in the global ``df`` with ``filler``.

    Args:
        column: Name of the column of ``df`` to fill.
        filler: Value written in place of every null.

    Returns:
        None. ``df`` is updated in place.
    """
    # Assign the filled column back instead of calling fillna(inplace=True)
    # on df[column]: that chained form acts on an intermediate object and is
    # not guaranteed to reach df under pandas Copy-on-Write.
    df[column] = df[column].fillna(filler)
    return None

def MyImputer(dataframe, column, strategy):
    """Fill the nulls of ``dataframe[column]`` with the precomputed ``strategy`` value.

    The caller computes the replacement beforehand, for example
    ``strat_value = df[i].mode()[0]`` followed by
    ``MyImputer(df, i, strat_value)``. The frame is modified in place and
    nothing is returned.
    """
    filled = dataframe[column].fillna(value=strategy)
    dataframe[column] = filled

def minmaxScal(dataframe, column):
    """Rescale ``dataframe[column]`` in place with sklearn's MinMaxScaler.

    Returns the fitted scaler so the same transformation can be reused.
    """
    scaler = MinMaxScaler()
    # The scaler expects a 2-D input: one row per sample, a single feature.
    as_feature = dataframe[column].values.reshape(-1, 1)
    scaler.fit(as_feature)
    dataframe[column] = scaler.transform(as_feature)
    return scaler

def stScal(dataframe, column):
    """Standardise ``dataframe[column]`` in place with sklearn's StandardScaler.

    Returns the fitted scaler so the same transformation can be reused.
    """
    scaler = StandardScaler()
    # The scaler expects a 2-D input: one row per sample, a single feature.
    as_feature = dataframe[column].values.reshape(-1, 1)
    scaler.fit(as_feature)
    dataframe[column] = scaler.transform(as_feature)
    return scaler

def frequency(dataframe, column, new_column):
    """Add ``new_column`` holding, for each row, how many rows share its ``column`` value.

    The frame is modified in place and nothing is returned.
    """
    per_value = dataframe.groupby(by=column)[column]
    dataframe[new_column] = per_value.transform('count')

### Check numerical variables. We need to understand what kind of information is included in each one.

In [None]:
# Summary statistics of the numeric columns, one row per column.
df.describe(include=np.number).T

In [None]:
# Frequency-encode the country: CountryIdentifier_new holds how many rows share each row's CountryIdentifier.
frequency(df,"CountryIdentifier","CountryIdentifier_new")

In [None]:
# Show the five rows with the highest country frequency next to the raw identifier.
df[['CountryIdentifier_new', "CountryIdentifier"]].sort_values(by="CountryIdentifier_new", ascending=False).head(5)

In [None]:
# Drop the raw identifier; only its frequency encoding (CountryIdentifier_new) is kept.
df.drop(columns=["CountryIdentifier"], inplace=True)

In [None]:
# Names of all numeric columns except the target.
numeric_columns = df.select_dtypes(include=np.number).columns
num_vars = numeric_columns.drop("HasDetections")

In [None]:
# Display the collected numeric column names.
num_vars

In [None]:
# Peek at the first ten values of AvSigVersion.
df["AvSigVersion"].head(10)

### Some of the variables aren't truly numerical because they are clearly versions or boolean columns, so we will treat them as categorical ones
First of all we have the boolean variables; let's check them:

In [None]:
# Columns whose non-null values are all 0 or 1, leaving out the target.
bools_vars = isBool(TARGET)

### I will fill the nulls with 0 because a missing value means 0.

In [None]:
# Report the nulls of every boolean-like column, fill them with 0 and show
# the resulting value distribution.
for bool_col in bools_vars:
    n_missing = df[bool_col].isnull().sum()
    print("{} has {} nulls".format(bool_col, n_missing))
    if n_missing > 0:
        simpleNaN(bool_col, 0)
    print(df[bool_col].value_counts(),"\n")

In [None]:
# Summary statistics of the numeric columns after the boolean nulls were filled.
df.describe(include=np.number).T

In my view, just 4 columns of this limited dataset are truly numerical:

In [None]:
# Columns treated as genuinely numeric; the remaining numeric columns are
# later handled together with the categorical ones.
true_num=[
    "Census_ProcessorCoreCount",
    "Census_PrimaryDiskTotalCapacity",
    "Census_TotalPhysicalRAM",
    "CountryIdentifier_new"
    ]

In [None]:
# Fill the nulls of each truly numeric column with that column's mode.
for num_col in true_num:
    if df[num_col].isnull().any():
        MyImputer(df, num_col, df[num_col].mode()[0])

### Numerical columns don't have a normal distribution, but for trees that is not so important, so we will keep them as is

In [None]:
# One boxen plot per truly numeric column, laid out on a 3x2 grid.
fig, grid = plt.subplots(nrows=3, ncols=2, figsize=(16, 10))
axes = grid.flat
for colum, panel in zip(true_num, grid.ravel()):
    sns.boxenplot(data=df, x=colum, ax=panel)
    panel.set_title(colum, fontsize=12)
    panel.tick_params(labelsize=6)
    panel.set_xlabel("")

# Leave room above the grid for the figure-level title.
fig.tight_layout()
plt.subplots_adjust(top=0.9)
fig.suptitle('Distribución variables numéricas', fontsize=16, fontweight="bold");

### Now check what to do with false numerical

#### Change the dtype of those variables that I think aren't truly numerical; we will deal with them later

In [None]:
#for x in [i for i in num_vars if i not in true_num]:
#    df[x] = df[x].astype("str")        

### At this point we have numerical variables without nulls

In [None]:
# Summary statistics of the numeric columns after imputation.
df.describe(include=np.number).T

### From this point on we will deal with the categorical variables plus the not-truly-numerical ones

In [None]:
# Summary of the non-numeric columns, one row per column.
df.describe(exclude=np.number).T

In [None]:
# Names of every non-numeric column, as a plain Python list.
non_numeric_columns = df.select_dtypes(exclude=np.number).columns
cat_vars = list(non_numeric_columns.values)

In [None]:
# non_num_vars was referenced here before the cell that defines it, which
# raises NameError on a top-to-bottom run; build it first.
non_num_vars = [i for i in num_vars if i not in true_num]
print(type(cat_vars), type(non_num_vars))

In [None]:
# Numeric columns that are not truly numeric, followed by the non-numeric
# columns: everything that will be treated as categorical.
non_num_vars = [name for name in num_vars if name not in true_num]
all_cat_vars = [*non_num_vars, *cat_vars]
all_cat_vars

In [None]:
# Reduce every categorical-like column to at most `limit` distinct values:
#   1 - no nulls, few values    -> only cast numeric columns to str
#   2 - nulls, few values       -> keep the most frequent values, rest 'Others'
#   3 - too many distinct values -> keep the values with the most extreme
#                                   mean detection, rest 'Others'
for x in all_cat_vars:
    limiti = limit
    # Per-column copy of the minimum-rows threshold. Raising the global
    # `start` inside the loop made every later column inherit the threshold
    # reached by earlier ones, so results depended on the column order.
    min_rows = start
    nnan = df[x].isnull().sum()
    nuniq = df[x].nunique()

    if nnan == 0 and nuniq <= limiti:
        print("1 - {} Nulls: {} Uniq: {}".format(x ,nnan, nuniq))

        if is_numeric_dtype(df[x]):
            print("{} from {} to STR".format(x, df[x].dtypes))
            df[x] = df[x].astype('str')

    elif nnan > 0 and nuniq <= limiti:
        print("2 - {} Nulls: {} Uniq: {}".format(x ,nnan, nuniq))

        if is_numeric_dtype(df[x]):
            print("{} from {} to STR".format(x, df[x].dtypes))
            # NOTE(review): where astype('str') renders nulls as the literal
            # 'nan', they are no longer nulls for setOthers — confirm this is intended.
            df[x] = df[x].astype('str')
        # NOTE(review): the minimum-rows threshold is passed as the number of
        # top values to keep, as in the original code — confirm this is intended.
        df[x] = setOthers(df, x, min_rows)

    else:
        _df_x = explore_cat_values(df, x, TARGET)
        # Number of values that have more than `min_rows` rows.
        n_eligible = int((_df_x["n_machines"] > min_rows).sum())

        if n_eligible > limiti:
            # Raise the threshold until few enough values remain.
            while n_eligible > limiti:
                min_rows = min_rows + step
                n_eligible = int((_df_x["n_machines"] > min_rows).sum())
            top_n = n_eligible // 2 - 2
        else:
            top_n = n_eligible // 2 - 1
        print("3 - {} Nulls: {} Uniq: {} TopN: {} Categories: {}".format(x ,nnan, nuniq, top_n, n_eligible))

        if is_numeric_dtype(df[x]):
            print("{} from {} to STR".format(x, df[x].dtypes))
            df[x] = df[x].astype('str')

        df[x] = setOthersPataNegra(df, x, TARGET, min_rows, top_n)

        

In [None]:
# Summary of the non-numeric columns after the category reduction.
df.describe(exclude=np.number).T

### EDA Complete. Our DataFrame is ready to OHE!
We will create a new DataFrame called **df_final**. Maybe we will need to return to this point :(

In [None]:
# Work on an independent deep copy so `df` stays available as a checkpoint.
df_final = df.copy(deep=True)

In [None]:
# One-hot encode every non-numeric column. The column list is taken once,
# before df_final starts being replaced by its widened versions.
columns_to_encode = df_final.select_dtypes(exclude=np.number).columns
for column in columns_to_encode:
    print("Doing {} dtype: {}".format(column, df_final[column].dtypes))
    df_final = generate_dummies(df_final, column)

In [None]:
# Rows and columns after one-hot encoding.
df_final.shape

### Let's start with the model
1. Split our data in development and validation
1. Split our development in train and test
1. Model instance
1. Fit (train) the model
1. Evaluate the chosen model (predict)
1. Check against validation


In [None]:
# Hold out 20% of the rows as the validation set; the rest is the development set.
df_dev, df_val = train_test_split(df_final, test_size = 0.20, random_state = RANDOM_STATE)

In [None]:
# Print the shapes of the development and validation sets.
print("Desarrollo df_dev: {}\nValidación df_val: {}".format(df_dev.shape, df_val.shape ))

In [None]:
# Separate the development set into features and a single-column target frame.
df_dev_X = df_dev.drop(columns=TARGET)
df_dev_y = df_dev[[TARGET]]

In [None]:
# Separate the validation set into features and a single-column target frame.
df_val_X = df_val.drop(columns=TARGET)
df_val_y = df_val[[TARGET]]

In [None]:
# Split the development set again: 80% to train on, 20% to test on.
X_train, X_test, y_train, y_test = train_test_split(
    df_dev_X, df_dev_y, test_size=0.20, random_state=RANDOM_STATE
)

In [None]:
# Shallow decision tree (depth 4) with a fixed seed.
dt = DecisionTreeClassifier(max_depth=4,
                            random_state=RANDOM_STATE
                            )

In [None]:
# Train the tree on the training split.
dt.fit(
    X_train, 
    y_train
)

In [None]:
# Draw the fitted tree. feature_names is passed as a plain list because
# some scikit-learn releases reject a pandas Index for this parameter.
plt.figure(figsize=(30,10))
_ = tree.plot_tree(decision_tree = dt,
               feature_names = list(X_train.columns),
               class_names = ['Safe', 'Unsafe'],
               filled = True)

In [None]:
# Predicted class for every test row, indexed like X_test so it can be joined to y_test.
y_test_prediction = pd.DataFrame(dt.predict(X_test), index=X_test.index, columns=['Prediction'])

In [None]:
# Actual target and prediction side by side, joined on the index.
results_df = y_test.join(y_test_prediction)

In [None]:
# 1 when the prediction matches the actual target, 0 otherwise.
results_df['Success'] = (results_df[TARGET] == results_df['Prediction']).astype(int)

In [None]:
# Report the number of correct predictions, the number of rows and the hit rate in percent.
print('Hemos acertado {} registros de {}, con una media de acierto de {}'.format(
    results_df['Success'].sum(),
    results_df['Success'].count(),
    results_df['Success'].mean() * 100
))

In [None]:
# Cross-tabulation with the actual target on the rows and the prediction on the columns.
confusion_matrix = pd.crosstab(results_df[TARGET],results_df['Prediction'])
confusion_matrix

In [None]:
# Read the four cells of the table: rows are the actual class, columns the
# predicted class, both in position order.
TN, FP = confusion_matrix.iloc[0, 0], confusion_matrix.iloc[0, 1]
FN, TP = confusion_matrix.iloc[1, 0], confusion_matrix.iloc[1, 1]

In [None]:
# Accuracy: share of correct predictions over all predictions.
accuracy = (TP + TN) / (TP + TN + FP + FN)
print("Accuracy: {}".format(accuracy))

In [None]:
# Score of the estimator on the test split, to compare with the manual accuracy.
dt.score(X_test, y_test)

In [None]:
# Score of the estimator on the held-out validation set.
dt.score(df_val_X, df_val_y)

In [None]:
# Precision: correct positives over predicted positives.
Precision = TP / (TP + FP)
# Recall: correct positives over actual positives.
Recall = TP / (TP + FN)

In [None]:
# F1 as the harmonic mean of precision and recall.
f1_score = 2 / ( 1/Precision + 1/Recall )
f1_score

In [None]:
# Same metric computed by scikit-learn, as a cross-check.
metrics.f1_score(y_test, y_test_prediction)

In [None]:
# ROC AUC ranks samples by a continuous score, so feed it the predicted
# probability of the positive class rather than the hard 0/1 prediction
# (this matches how the model comparison scores its models). Computed once
# and reused for the print.
rocauc_dtree = metrics.roc_auc_score(y_test[TARGET], dt.predict_proba(X_test)[:, 1])
print(rocauc_dtree)

In [None]:
# Hyper-parameters shared by the models being compared.
RANDOM_STATE = 42
n_estimators = 150
max_depth = 4

# (name, estimator) pairs; every estimator uses the same depth and seed, and
# the two ensembles additionally share the number of estimators.
_tree_kwargs = dict(max_depth=max_depth, random_state=RANDOM_STATE)
models = [
    ('DecisionTree', DecisionTreeClassifier(**_tree_kwargs)),
    ('RandomForest', RandomForestClassifier(n_estimators=n_estimators, **_tree_kwargs)),
    ('GradientBoosting', GradientBoostingClassifier(n_estimators=n_estimators, **_tree_kwargs)),
]

In [None]:
# Fit every model, score it with ROC AUC on the test split and overlay the
# ROC curves on a single figure.
plt.figure(figsize=(8,8))
plt.clf()
for model_name, model_instance in models:
    # np.ravel turns the single-column target frame into a 1-D array.
    model_instance.fit(X_train, np.ravel(y_train))
    # Probability of the positive class, used as the ranking score.
    predictions = model_instance.predict_proba(X_test)[:,1]
    auc_score = metrics.roc_auc_score(y_test, predictions)
    print('ROC AUC Score for {}: {}'.format(model_name, auc_score))
    fpr, tpr, _ = metrics.roc_curve(y_test, predictions)
    # '{:.2f}' rounds the area to two decimals; the previous '{:2f}' only set
    # a minimum width and printed six decimals.
    plt.plot(fpr, tpr, label='ROC Curve for {} - Area: {:.2f}'.format(model_name, auc_score))
# Diagonal reference line.
plt.plot([0, 1], [0, 1], color='gray', linestyle='--')
plt.xlabel('FPR')
plt.ylabel('TPR')
plt.legend(loc="lower right")
plt.title('ROC curve')
plt.show()