# Final Project Notebook
Group: 9
Group Members: Shawn Ericksen (ericksen@uark.edu), Garret Fulghum (gmfulghu@uark.edu), Wesley Parker (wgparker@uark.edu)

This practice project focuses on the Microsoft Malware Prediction dataset. This can be accessed from: https://www.kaggle.com/competitions/microsoft-malware-prediction/data

Below is all of our specified imports

In [1]:
import numpy as np
import pandas as pd
import csv

# from scipy import sparse

from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import CategoricalNB
from sklearn.svm import SVC

import joblib

from sklearn import set_config
set_config(display='diagram')

In [2]:
dtypes={'id': 'string', 'date_account_created': 'string', 'timestamp_first_active': 'string', 'date_first_booking': 'string', 'gender': 'category', 'age': 'float64', 'signup_method': 'category', 'signup_flow': 'category', 'language': 'category', 'affiliate_channel': 'category', 'affiliate_provider': 'category', 'first_affiliate_tracked': 'category', 'signup_app': 'category', 'first_device_type': 'category', 'first_browser': 'category', 'country_destination': 'category'}
parse_dates = ['date_account_created', 'timestamp_first_active', 'date_first_booking']

In [3]:
filepath = "airbnb-recruiting-new-user-bookings/train_users_2.csv"
cols = list(pd.read_csv(filepath, nrows=1))[1:]
df = pd.read_csv(filepath, dtype=dtypes, na_values=['-unknown-', 'NDF', '<NA>'], usecols =[i for i in cols if i != 'id'], parse_dates=parse_dates, infer_datetime_format=True)
df['date_first_booking'] = pd.to_datetime(pd.Series(df['date_first_booking']), format='%Y-%m-%d', errors='coerce')
for i in list(df):
    if df.dtypes[i] == 'datetime64[ns]':
        print(i)
        df[i].fillna(pd.NaT)

date_account_created
timestamp_first_active
date_first_booking


In [None]:
print("- - Adult.Data - -")
print("Number of lines present: ", len(df))
print("Number of Columns: ", len(df.columns))

In [None]:
topCount = 5
print("Top ", topCount, " dataFrames:")
print(df.head(topCount))

In [4]:
df.memory_usage(deep=True, index=False).sort_values(ascending=False)

date_account_created       1707608
timestamp_first_active     1707608
date_first_booking         1707608
age                        1707608
first_browser               218965
language                    215482
affiliate_provider          215157
signup_flow                 215002
country_destination         214403
first_device_type           214373
affiliate_channel           214264
first_affiliate_tracked     214206
signup_app                  213869
signup_method               213749
gender                      213745
dtype: int64

In [5]:
df.memory_usage(deep=True, index=False).sum()

9193647

## This section is for developing the codebase since true lables for test.csv are not given.

In [6]:
X = df.drop(columns=['country_destination'])
X = X.drop(columns=parse_dates)
y = df['country_destination']

In [7]:
# This block can be commented out when doing prediction on the Kaggle test.csv
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.37, random_state=0, shuffle=True)

In [8]:
numeric_transformer = make_pipeline(
    SimpleImputer(strategy='mean'),
    StandardScaler()
)

encoder = make_column_transformer(
    (numeric_transformer, ['age']),
    (OneHotEncoder(sparse=False, handle_unknown='ignore'), make_column_selector(dtype_include='category')),
    remainder='passthrough'
)
encoder

In [9]:
encoder.fit_transform(X_train)
X_train.shape

(134474, 11)

In [12]:
y_enc = LabelEncoder().fit(y_train.values)
y_train = y_enc.transform(y_train)
y_test = y_enc.transform(y_test.values)

## Test Section

In [15]:
clf1 = DecisionTreeClassifier(random_state=0)

pipeline = make_pipeline(encoder, clf1)
pipeline

In [16]:
# Fit the pipeline on the training data
pipeline.fit(X_train, y_train)

# Score the pipeline on the testing data
pipeline.score(X_test, y_test)

0.5816250300720462

In [None]:
# Pickle model and write to hard drive
joblib.dump(pipeline, "models/CategoricalNB.pkl")

### Classifiers

In [None]:
clf1 = CategoricalNB()
clf1.fit(X_train, y_train)

In [None]:
y_pred = clf1.predict(X_test)
print(classification_report(y_test, y_pred, target_names=X.columns[:-2]))

In [None]:
clf2 = SVC().fit(X_train, y_train)

In [None]:
y_pred = clf2.predict(X_test)
print(classification_report(y_test, y_pred, target_names=X.columns[:-2]))

In [None]:
# clf3 = KMeans(n_clusters=3, max_iter=100, random_state=0)
# clf3.fit(X_train, y_train)

## Predictions for Kaggle's test.csv

In [None]:
# filepath = "test.csv"
# cols = list(pd.read_csv(filepath, nrows=1))
# ddf = dd.read_csv(filepath, dtype=dtypes, blocksize="16MB", na_values=['NA', 'UNKNOWN', 'NOT_SET', 'nan'])

In [None]:
# ddf_test = ddf.categorize()

In [None]:
# id_list = list(ddf['MachineIdentifier'])
# X_kaggle = encoder.transform(ddf.drop(columns=['MachineIdentifier']))

In [None]:
# y_pred = clf1.predict(X_kaggle)

In [None]:
# with open("submission.csv", "w", newline="") as csvfile:
#     csv_writer = csv.writer(csvfile, delimiter=',', quotechar='"')
#     csv_writer.writerows(zip(id_list, y_pred))