# Final Project Notebook
Group: 9
Group Members: Shawn Ericksen (ericksen@uark.edu), Garret Fulghum (gmfulghu@uark.edu), Wesley Parker (wgparker@uark.edu)

This practice project focuses on the Microsoft Malware Prediction dataset. This can be accessed from: https://www.kaggle.com/competitions/microsoft-malware-prediction/data

Below is all of our specified imports

In [1]:
import numpy as np
import pandas as pd
from dask import dataframe as dd
from sklearn.preprocessing import OneHotEncoder, LabelEncoder


In [2]:
def clean(df):
    for col in df.columns:
        if df[col].dtype == 'float16' or df[col].dtype == 'Int64':
            df[col] = (df[col] >= df.mean(numeric_only=True)[col])
    
    return df

In [3]:
def encode(train_data):
    df = clean(train_data)

    X_train = df.drop(columns=['HasDetections'])
    encoder = OneHotEncoder(sparse=False).fit(X_train)
    X_train = encoder.transform(X_train).toarray()
    y_train = df['HasDetections'].values
    y_enc = LabelEncoder().fit(y_train)
    y_train = y_enc.transform(y_train)
    
#     df = clean(test_data, include_continuous)
    
#     X_test = df.drop(columns=['income'])
#     X_test = encoder.transform(X_test).toarray()
#     y_test = y_enc.transform(df['income'].values)
    
    return X_train, y_train, y_enc

In [4]:
dtypes={'MachineIdentifier': 'category', 'ProductName': 'category', 'EngineVersion': 'category', 'AppVersion': 'category', 'AvSigVersion': 'category', 'IsBeta': 'category', 'RtpStateBitfield': 'category', 'IsSxsPassiveMode': 'category', 'DefaultBrowsersIdentifier': 'float16', 'AVProductStatesIdentifier': 'float16', 'AVProductsInstalled': 'Int64', 'AVProductsEnabled': 'Int64', 'HasTpm': 'category', 'CountryIdentifier': 'category', 'CityIdentifier': 'float16', 'OrganizationIdentifier': 'float16', 'GeoNameIdentifier': 'category', 'LocaleEnglishNameIdentifier': 'category', 'Platform': 'category', 'Processor': 'category', 'OsVer': 'category', 'OsBuild': 'category', 'OsSuite': 'category', 'OsPlatformSubRelease': 'category', 'OsBuildLab': 'category', 'SkuEdition': 'category', 'IsProtected': 'category', 'AutoSampleOptIn': 'category', 'PuaMode': 'category', 'SMode': 'category', 'IeVerIdentifier': 'float16', 'SmartScreen': 'category', 'Firewall': 'category', 'UacLuaenable': 'float16', 'Census_MDC2FormFactor': 'category', 'Census_DeviceFamily': 'category', 'Census_OEMNameIdentifier': 'category', 'Census_OEMModelIdentifier': 'category', 'Census_ProcessorCoreCount': 'category', 'Census_ProcessorManufacturerIdentifier': 'category', 'Census_ProcessorModelIdentifier': 'category', 'Census_ProcessorClass': 'category', 'Census_PrimaryDiskTotalCapacity': 'float64', 'Census_PrimaryDiskTypeName': 'category', 'Census_SystemVolumeTotalCapacity': 'float16', 'Census_HasOpticalDiskDrive': 'category', 'Census_TotalPhysicalRAM': 'float16', 'Census_ChassisTypeName': 'category', 'Census_InternalPrimaryDiagonalDisplaySizeInInches': 'float16', 'Census_InternalPrimaryDisplayResolutionHorizontal': 'float16', 'Census_InternalPrimaryDisplayResolutionVertical': 'float16', 'Census_PowerPlatformRoleName': 'category', 'Census_InternalBatteryType': 'category', 'Census_InternalBatteryNumberOfCharges': 'float16', 'Census_OSVersion': 'category', 'Census_OSArchitecture': 'category', 'Census_OSBranch': 'category', 'Census_OSBuildNumber': 'category', 'Census_OSBuildRevision': 'category', 'Census_OSEdition': 'category', 'Census_OSSkuName': 'category', 'Census_OSInstallTypeName': 'category', 'Census_OSInstallLanguageIdentifier': 'float16', 'Census_OSUILocaleIdentifier': 'category', 'Census_OSWUAutoUpdateOptionsName': 'category', 'Census_IsPortableOperatingSystem': 'category', 'Census_GenuineStateName': 'category', 'Census_ActivationChannel': 'category', 'Census_IsFlightingInternal': 'category', 'Census_IsFlightsDisabled': 'category', 'Census_FlightRing': 'category', 'Census_ThresholdOptIn': 'category', 'Census_FirmwareManufacturerIdentifier': 'category', 'Census_FirmwareVersionIdentifier': 'category', 'Census_IsSecureBootEnabled': 'category', 'Census_IsWIMBootEnabled': 'category', 'Census_IsVirtualDevice': 'category', 'Census_IsTouchEnabled': 'category', 'Census_IsPenCapable': 'category', 'Census_IsAlwaysOnAlwaysConnectedCapable': 'category', 'Wdft_IsGamer': 'category', 'Wdft_RegionIdentifier': 'category', 'HasDetections': 'category'}

In [5]:
cols = list(pd.read_csv("train.csv", nrows=1))

chunk = pd.read_csv("train.csv", dtype=dtypes, na_values=['UNKNOWN', 'NOT_SET', 'nan'], usecols =[i for i in cols if i != 'MachineIdentifier'], chunksize=100000)

df_train = pd.concat(chunk)

# for column in df_train.columns:
#     print(pd.api.types.infer_dtype(df[column]))
#     df_train[column] = df_train[column].apply(lambda x: pd.to_numeric(x, errors = 'ignore'))
#     df_train[column][df_train[column].apply(lambda x: isinstance(x, type))]

In [None]:
print("- - Adult.Data - -")
print("Number of lines present: ", len(df_train))
print("Number of Columns: ", len(df_train.columns))

In [None]:
topCount = 10
print("Top ", topCount, " dataFrames:")
print(df_train.head(topCount))

In [None]:
for x in range(0, len(df_train.dtypes)):
    print(str(df_train.columns[x]) + ": " + str(df_train.dtypes[x]) + ": " + str(df_train.to_numpy()[1][x]))

# s = "{"
# for x in range(0, len(df_train.dtypes)):
#     if str(df_train.dtypes[x]) == "Int16":
#         s += "'" + str(df_train.columns[x]) + "': 'Int64"
#     elif str(df_train.dtypes[x]) == "float64":
#         s += "'" + str(df_train.columns[x]) + "': 'float16"
#     else:
#         s += "'" + str(df_train.columns[x]) + "': '" + str(df_train.dtypes[x])
#     if x != len(df_train.dtypes) - 1:
#         s += "', "
#     else:
#         s += "'}"

# print(s)

In [None]:
# df_train = pd.read_csv("train.csv", header=0)

In [6]:
X0_train, y0_train, y0_enc = encode(df_train)

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return umr_sum(a, axis, dtype, out, keepdims, initial, where)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return umr_sum(a, axis, dtype, out, keepdims, initial, where)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return umr_sum(a, axis, dtype, out, keepdims, initial, where)


TypeError: Encoders require their input to be uniformly strings or numbers. Got ['NAType', 'bool']