In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
# Print the names of the entries found in the ../input directory.
print(os.listdir("../input"))
import gc
# Switch automatic garbage collection on (explicit gc.collect() calls are made in later cells).
gc.enable()

In [None]:
# Explicit per-column dtypes handed to pd.read_csv below, so each column is
# parsed straight into the listed type instead of being inferred.
# NOTE(review): float16 can represent integers exactly only up to 2048. Any
# *Identifier column stored as float16 whose values exceed that would have
# distinct IDs rounded onto the same value -- confirm the value ranges.
# NOTE(review): the target column HasDetections is read as float16, so it is
# treated as a numeric (non-int8) column by later dtype-based selections.
dtypes = {
        'MachineIdentifier':                                    'category',
        'ProductName':                                          'category',
        'EngineVersion':                                        'category',
        'AppVersion':                                           'category',
        'AvSigVersion':                                         'category',
        'IsBeta':                                               'int8',
        'RtpStateBitfield':                                     'float16',
        'IsSxsPassiveMode':                                     'int8',
        'DefaultBrowsersIdentifier':                            'float16',
        'AVProductStatesIdentifier':                            'float32',
        'AVProductsInstalled':                                  'float16',
        'AVProductsEnabled':                                    'float16',
        'HasTpm':                                               'int8',
        'CountryIdentifier':                                    'int16',
        'CityIdentifier':                                       'float32',
        'OrganizationIdentifier':                               'float16',
        'GeoNameIdentifier':                                    'float16',
        'LocaleEnglishNameIdentifier':                          'int16',
        'Platform':                                             'category',
        'Processor':                                            'category',
        'OsVer':                                                'category',
        'OsBuild':                                              'int16',
        'OsSuite':                                              'int16',
        'OsPlatformSubRelease':                                 'category',
        'OsBuildLab':                                           'category',
        'SkuEdition':                                           'category',
        'IsProtected':                                          'float16',
        'AutoSampleOptIn':                                      'int8',
        'PuaMode':                                              'category',
        'SMode':                                                'float16',
        'IeVerIdentifier':                                      'float16',
        'SmartScreen':                                          'category',
        'Firewall':                                             'float16',
        'UacLuaenable':                                         'float32',
        'Census_MDC2FormFactor':                                'category',
        'Census_DeviceFamily':                                  'category',
        'Census_OEMNameIdentifier':                             'float16',
        'Census_OEMModelIdentifier':                            'float32',
        'Census_ProcessorCoreCount':                            'float16',
        'Census_ProcessorManufacturerIdentifier':               'float16',
        'Census_ProcessorModelIdentifier':                      'float16',
        'Census_ProcessorClass':                                'category',
        'Census_PrimaryDiskTotalCapacity':                      'float32',
        'Census_PrimaryDiskTypeName':                           'category',
        'Census_SystemVolumeTotalCapacity':                     'float32',
        'Census_HasOpticalDiskDrive':                           'float16',
        'Census_TotalPhysicalRAM':                              'float32',
        'Census_ChassisTypeName':                               'category',
        'Census_InternalPrimaryDiagonalDisplaySizeInInches':    'float16',
        'Census_InternalPrimaryDisplayResolutionHorizontal':    'float16',
        'Census_InternalPrimaryDisplayResolutionVertical':      'float16',
        'Census_PowerPlatformRoleName':                         'category',
        'Census_InternalBatteryType':                           'category',
        'Census_InternalBatteryNumberOfCharges':                'float32',
        'Census_OSVersion':                                     'category',
        'Census_OSArchitecture':                                'category',
        'Census_OSBranch':                                      'category',
        'Census_OSBuildNumber':                                 'int16',
        'Census_OSBuildRevision':                               'int32',
        'Census_OSEdition':                                     'category',
        'Census_OSSkuName':                                     'category',
        'Census_OSInstallTypeName':                             'category',
        'Census_OSInstallLanguageIdentifier':                   'float16',
        'Census_OSUILocaleIdentifier':                          'int16',
        'Census_OSWUAutoUpdateOptionsName':                     'category',
        'Census_IsPortableOperatingSystem':                     'int8',
        'Census_GenuineStateName':                              'category',
        'Census_ActivationChannel':                             'category',
        'Census_IsFlightingInternal':                           'float16',
        'Census_IsFlightsDisabled':                             'float16',
        'Census_FlightRing':                                    'category',
        'Census_ThresholdOptIn':                                'float16',
        'Census_FirmwareManufacturerIdentifier':                'float16',
        'Census_FirmwareVersionIdentifier':                     'float32',
        'Census_IsSecureBootEnabled':                           'float16',
        'Census_IsWIMBootEnabled':                              'float16',
        'Census_IsVirtualDevice':                               'float16',
        'Census_IsTouchEnabled':                                'int8',
        'Census_IsPenCapable':                                  'int8',
        'Census_IsAlwaysOnAlwaysConnectedCapable':              'float16',
        'Wdft_IsGamer':                                         'float16',
        'Wdft_RegionIdentifier':                                'float16',
        'HasDetections':                                        'float16'
        }
# Load the training set with the dtypes above (low_memory=True is also the pandas default).
train = pd.read_csv('../input/train.csv', dtype=dtypes,low_memory=True)

# Force a collection pass; as the last expression of the cell, its integer
# return value is what the cell displays.
gc.collect()


In [None]:
# Per-column percentage of missing values and number of distinct values.
perc_nan = train.isnull().sum() / len(train) * 100
unique = train.nunique()

# Summary table indexed by column name, columns with the most missing data first.
nan_df = pd.DataFrame({'NAN_perc': perc_nan, 'unique': unique})
nan_df = nan_df.sort_values('NAN_perc', ascending=False)
nan_df

### Data Description
- PuaMode: Pua Enabled mode from the service
- Census_ProcessorClass: A classification of processors into high/medium/low. Initially used for Pricing Level SKU. No longer maintained and updated
- DefaultBrowsersIdentifier: ID for the machine's default browser
- Census_IsFlightingInternal: NA
- Census_InternalBatteryType: NA
- Census_ThresholdOptIn: NA
- Census_IsWIMBootEnabled: NA
- SmartScreen: This is the SmartScreen enabled string value from registry. This is obtained by checking in order, HKLM\SOFTWARE\Policies\Microsoft\Windows\System\SmartScreenEnabled and HKLM\SOFTWARE\Microsoft\Windows\CurrentVersion\Explorer\SmartScreenEnabled. If the value exists but is blank, the value "ExistsNotSet" is sent in telemetry
- OrganizationIdentifier: ID for the organization the machine belongs to. Organization IDs are mapped to both specific companies and broad industries

In [None]:
# Keep only the columns that have at least 60% non-null values;
# `thresh` is the minimum number of non-null entries a column needs to survive.
min_non_null = int(0.6 * len(train))
train.dropna(axis='columns', thresh=min_non_null, inplace=True)

In [None]:
# Remove the three internal-display measurement columns from the training frame.
# errors="ignore" makes the cell idempotent: re-running it (or running it after
# the columns were already removed by another step) no longer raises KeyError.
display_cols = [
    "Census_InternalPrimaryDiagonalDisplaySizeInInches",
    "Census_InternalPrimaryDisplayResolutionHorizontal",
    "Census_InternalPrimaryDisplayResolutionVertical",
]
train.drop(columns=display_cols, errors="ignore", inplace=True)

In [None]:
# Run a garbage-collection pass after the column drops; the integer it returns
# is displayed as the cell output.
gc.collect()

In [None]:
# nan_df was computed before columns were dropped from `train`, so on its own
# it still lists columns that no longer exist. Restrict the view to the columns
# that are still present, then order by missing percentage and distinct count.
nan_df[nan_df.index.isin(train.columns)].sort_values(['NAN_perc', 'unique'], ascending=False)

- IsProtected: A calculated field, derived from the Spynet Report's AV Products field, that indicates whether a machine is protected. It is (a) TRUE if there is at least one active and up-to-date antivirus product running on the machine; (b) FALSE if there is no active AV product on the machine, or if the AV is active but is not receiving the latest updates; (c) null if there are no antivirus products in the report.
- Firewall: True (1) on Windows 8.1 and above if Windows Firewall is enabled, as reported by the service.
- Census_IsSecureBootEnabled: Indicates whether Secure Boot mode is enabled.

# Univariate analysis

In [None]:
# Histogram grid over the numeric columns of the training frame.
hist_style = dict(
    bins=15,
    color='steelblue',
    edgecolor='black',
    linewidth=1.0,
    xlabelsize=10,
    ylabelsize=10,
    xrot=45,
    yrot=0,
    figsize=(20, 18),
)
# Despite the name, DataFrame.hist returns the Axes objects, not a Figure.
fig = train.hist(**hist_style)

plt.tight_layout()

In [None]:
# Other quick looks that were tried in this cell (kept disabled):
#   train.IsBeta.value_counts()
#   train.HasTpm.value_counts()
#   train.SMode.value_counts()
#   train.groupby('Processor').HasDetections.plot.kde()
#   train.Census_ProcessorModelIdentifier.value_counts()

# Frequency of each CountryIdentifier value, most common first.
train['CountryIdentifier'].value_counts()

In [None]:
# Summary statistics for the categorical ProductName column.
train['ProductName'].describe()

# Bivariate analysis

In [None]:
target = ['HasDetections']
columns = train.columns

# Column names grouped by dtype.
features_cat = list(train.select_dtypes(include=['category']).columns)
features_int8 = list(train.select_dtypes(include=['int8']).columns)
features_num = list(train.select_dtypes(exclude=['int8', 'category']).columns)

# Column names grouped by prefix: Census* columns versus everything else.
# NOTE(review): the target column is not filtered out here, so `features`
# (and `features_num`, since the target is read as float16) still contain it.
features_census = [name for name in columns if name.startswith('Census')]
features = [name for name in columns if not name.startswith('Census')]

# Display both counts. Previously `len(features_census)` sat on its own line
# and was silently discarded, because a cell only shows its last expression.
len(features_census), len(features)

## Categorical features

In [None]:
# Display the list of categorical column names.
features_cat

In [None]:
# Preview of the categorical columns. Taking head() first avoids materialising
# a full-length copy of every categorical column just to keep five rows;
# the resulting frame is the same.
df_cat = train.head()[features_cat]
# Show only the first 15 of those columns.
df_cat.iloc[:, :15]

In [None]:
# (Census_OSArchitecture was another candidate inspected in this cell.)
# Frequency of each Census_OSSkuName value.
train['Census_OSSkuName'].value_counts()

In [None]:
# First rows of the numeric (non-int8, non-category) columns. head() is taken
# before the column selection so the full-length frame is never copied.
x = train.head()[features_num]

In [None]:
# Show all rows of `x`, skipping its first 20 columns.
x.iloc[:,20:]

In [None]:
# Columns looked at with this same snippet:
#   Census_IsFlightingInternal, Census_ThresholdOptIn, Census_IsWIMBootEnabled,
#   Census_IsVirtualDevice, Census_IsAlwaysOnAlwaysConnectedCapable,
#   Wdft_IsGamer, Firewall, IsProtected, SMode

# The five most frequent Firewall values (nlargest defaults to n=5).
firewall_counts = train['Firewall'].value_counts()
firewall_counts.nlargest(n=5)

In [None]:
# Number of non-null IsBeta entries for each
# (AVProductStatesIdentifier, HasDetections) combination.
av_state_keys = ['AVProductStatesIdentifier', 'HasDetections']
train.groupby(av_state_keys)['IsBeta'].count()

In [None]:
# Non-null HasDetections count for each (AvSigVersion, IsProtected) pair.
# AvSigVersion is categorical; observed=True restricts the result to pairs that
# actually occur instead of emitting a zero-count row for every unobserved
# category combination.
train.groupby(['AvSigVersion', 'IsProtected'], observed=True)['HasDetections'].count()
#train.AvSigVersion.value_counts().nlargest()

In [None]:
# Scratch list of column names written as bare string expressions; they have no
# effect, and only the last one ('IsBeta') is echoed as the cell output.
'ProductName'
'EngineVersion'
'AppVersion'
'AvSigVersion'
'IsBeta'

In [None]:
# Row counts per Platform, split by the HasDetections value.
ax = plt.gca()
sns.countplot(data=train, x="Platform", hue="HasDetections", ax=ax)

# Tilt the category labels so they do not overlap.
tick_labels = ax.get_xticklabels()
ax.set_xticklabels(tick_labels, rotation=40, ha="right")

plt.tight_layout()
plt.show()

In [None]:
# Row counts per Processor, split by the HasDetections value.
ax = sns.countplot(data=train, x="Processor", hue="HasDetections")

# Re-apply the existing tick labels rotated and right-aligned.
processor_labels = ax.get_xticklabels()
ax.set_xticklabels(processor_labels, rotation=40, ha="right")

plt.tight_layout()
plt.show()

In [None]:
# Earlier experiments, kept disabled:
#   train.groupby(['Processor','Census_ProcessorClass']).count()
#   train['Census_ProcessorClass'] = train.Census_ProcessorClass.apply(lambda x: 'amd' if x == 'NaN' else x)
#   train.Census_ProcessorClass.isna().sum()
#
# NOTE(review): if the earlier non-null-threshold column filter already removed
# Census_ProcessorClass, this lookup fails on a fresh top-to-bottom run -- confirm.

# Fraction (0..1) of rows whose Census_ProcessorClass is missing.
missing_class_rows = train['Census_ProcessorClass'].isnull().sum()
missing_class_rows / len(train)

In [None]:
# Rebind `x` to the rows of `train` that have no Census_ProcessorClass value.
x = train.loc[train['Census_ProcessorClass'].isna()]

In [None]:
# `x` contains only rows where Census_ProcessorClass is missing, and groupby
# discards rows whose key is NaN, so grouping by that column can never produce
# a populated group. Count the rows per Processor instead, which shows how the
# missing-class rows are distributed across processor types.
x.groupby('Processor', observed=True).size()

In [None]:
# Imputation variants that were tried and left disabled:
#   train.Census_ProcessorClass.fillna('amd', inplace=True)
#   train.Census_ProcessorClass = train.Census_ProcessorClass.cat.add_categories("amd").fillna("amd")

# Bar chart of how often each Census_ProcessorClass value occurs.
processor_class = train['Census_ProcessorClass']
sns.countplot(x=processor_class)

In [None]:
# Processor counts split by HasDetections.
countplot_args = {"x": "Processor", "hue": "HasDetections", "data": train}
ax = sns.countplot(**countplot_args)

# Rotate the x tick labels for legibility.
ax.set_xticklabels(
    ax.get_xticklabels(),
    rotation=40,
    ha="right",
)
plt.tight_layout()
plt.show()

In [None]:
# (train.SkuEdition.unique() was used to inspect the categories.)
# Row counts per SkuEdition, split by the HasDetections value.
ax = plt.gca()
sns.countplot(data=train, x="SkuEdition", hue="HasDetections", ax=ax)

sku_labels = ax.get_xticklabels()
ax.set_xticklabels(sku_labels, rotation=40, ha="right")

plt.tight_layout()
plt.show()

# Selecting features

In [None]:
# Pairwise Pearson correlation between the numeric columns.
# The frame also holds categorical columns; recent pandas versions no longer
# drop non-numeric columns silently in DataFrame.corr() and raise instead, so
# the numeric columns are selected explicitly (works on old and new pandas).
corr = train.select_dtypes(include='number').corr()

In [None]:
# Canvas for the correlation heatmap.
f, ax = plt.subplots(figsize=(15, 11))

# Render the correlation matrix on that Axes; colours saturate at 0.8.
heatmap_opts = {"cmap": "coolwarm", "vmax": .8, "square": False}
sns.heatmap(corr, ax=ax, **heatmap_opts)
# (a title call, sns.plt.title('MMP - Pearson Correlation'), was left disabled)

In [None]:
# (Census_ProcessorManufacturerIdentifier was inspected the same way.)
# Frequency of each Census_ProcessorModelIdentifier value.
train['Census_ProcessorModelIdentifier'].value_counts()

In [None]:
# Columns previously inspected with this snippet:
#   Census_OSBuildNumber, OsBuild, Census_OSBuildRevision, Census_OSSkuName

# Frequency of each UacLuaenable value.
train['UacLuaenable'].value_counts()

In [None]:
# Frequency of each OsSuite value.
train['OsSuite'].value_counts()

In [None]:
# Frequency of each RtpStateBitfield value (missing values are not counted).
# (IsSxsPassiveMode was inspected the same way.)
train['RtpStateBitfield'].value_counts()

In [None]:
# Row counts per RtpStateBitfield value, split by the HasDetections value.
ax = sns.countplot(data=train, hue="HasDetections", x="RtpStateBitfield")

rtp_labels = ax.get_xticklabels()
ax.set_xticklabels(rtp_labels, rotation=40, ha="right")

plt.tight_layout()
plt.show()

In [None]:
# Impute missing RtpStateBitfield values with the column median.
# The result is assigned back to the frame rather than calling
# fillna(inplace=True) on `train.RtpStateBitfield`: that chained in-place form
# may operate on a temporary object (it warns on recent pandas and has no effect
# under Copy-on-Write), which would leave `train` unchanged.
rtp_median = train['RtpStateBitfield'].median()
train['RtpStateBitfield'] = train['RtpStateBitfield'].fillna(rtp_median)

In [None]:
# Summary statistics of RtpStateBitfield.
train['RtpStateBitfield'].describe()