In [64]:
# Name: dog-tor
# Date: March 9th, 2021
# 
# This program uses the panda library modules and will read and manipulate csv data in multiple ways.

# Import the required modules.
import pandas as pd # series and dataframe
import numpy as np

# Change the view to allow more columns and rows.
pd.options.display.max_colwidth = 1000
pd.options.display.max_rows = 1000

# Load the first 1000 rows of the file.
pandaData=pd.read_csv("train.csv", nrows = 1000) 

# Show the total number of missing values in all variables
print("The total number of missing values in all variables is:", pandaData.isnull().sum().sum(),"\n\n")

# Print summary statistics for numerical attributes, and print value counts for categorical attributes.
for var_name in pandaData.columns:
    print("-"*50)
    print(var_name)
    if pandaData[var_name].dtype == np.int32:
        print(pandaData[var_name].value_counts())
    if pandaData[var_name].dtype == np.int64:
        print(pandaData[var_name].describe())

# Columns of dtype object are imputed with the most frequent value in the column. Columns of other types are imputed with mean of the column.
# Missing values imputation
from sklearn.base import TransformerMixin

class DataFrameImputer(TransformerMixin):

    def __init__(self):
        """Impute missing values.

        Columns of dtype object are imputed with the most frequent value 
        in column.

        Columns of other types are imputed with mean of column.

        """
    def fit(self, pandaData, y=None):

        self.fill = pd.Series([pandaData[c].value_counts().index[0]
            if pandaData[c].dtype == np.dtype('O') else pandaData[c].mean() for c in pandaData],
            index=pandaData.columns)

        return self

    def transform(self, pandaData, y=None):
        return pandaData.fillna(self.fill)
pandaData = DataFrameImputer().fit_transform(pandaData)

# Show the total number of missing values again in all variables. Now, after missing value imputation, it should be 0
print("\n\nThe total number of missing values in all variables is now:", pandaData.isnull().sum().sum(),"\n\n")

# Do string encoding (i.e, encode string values to integers)
# encode string values to integers
from sklearn.preprocessing import LabelEncoder
# maintain a dict for string to integer mappings for each column
label_dict = dict()
for var_name in pandaData.columns:
    # only map string values
    if pandaData[var_name].dtype == np.object:
        le = LabelEncoder()
        # map the string values
        pandaData[var_name] = le.fit_transform(pandaData[var_name])
        # store the mapping in the dict
        label_dict[var_name] = dict()
        for cls, label in zip(le.classes_, le.transform(le.classes_)):
            label_dict[var_name][label] = cls

# Second, modeling:
# Split the data into training and testing sets (80-20)
from sklearn.model_selection import train_test_split
attributes = [col for col in pandaData.columns if col != "HasDetections"]
train_x, test_x, train_y, test_y = train_test_split(pandaData[attributes], pandaData["HasDetections"], test_size=0.3, random_state=123)

# Building a predictive model like Decision Tree. Show the performance of the model (f1, accuracy, precision, and recall) 
from sklearn import tree
clf = tree.DecisionTreeClassifier()
# train/build model
clf = clf.fit(train_x, train_y)
# make prediction
pred_y = clf.predict(test_x) 
# evaluate the prediction results
from sklearn.metrics import classification_report
for line in classification_report(test_y, pred_y).split("\n"):
    print (line)


The total number of missing values in all variables is: 6741 


--------------------------------------------------
MachineIdentifier
--------------------------------------------------
ProductName
--------------------------------------------------
EngineVersion
--------------------------------------------------
AppVersion
--------------------------------------------------
AvSigVersion
--------------------------------------------------
IsBeta
count    1000.0
mean        0.0
std         0.0
min         0.0
25%         0.0
50%         0.0
75%         0.0
max         0.0
Name: IsBeta, dtype: float64
--------------------------------------------------
RtpStateBitfield
--------------------------------------------------
IsSxsPassiveMode
count    1000.000000
mean        0.018000
std         0.133018
min         0.000000
25%         0.000000
50%         0.000000
75%         0.000000
max         1.000000
Name: IsSxsPassiveMode, dtype: float64
--------------------------------------------------
Defa