In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
pd.options.display.float_format = '{:,.2f}'.format

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Gather every file path below the Kaggle input mount, then list them one per line.
input_paths = [
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
]
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

import warnings
warnings.filterwarnings("ignore")

import matplotlib.pyplot as plt
%matplotlib inline

## Predicting Red Hat Business Value

### How Can We Identify a Potential Customer?

In [None]:
# Training activity log; the 'date' column is parsed to datetime while reading.
ACTIVITY_TRAIN_CSV = "/kaggle/input/predicting-red-hat-business-value/act_train.csv.zip"
df = pd.read_csv(ACTIVITY_TRAIN_CSV, parse_dates=['date'])
print(df.shape)

In [None]:
# Customer attribute table; the 'date' column is parsed to datetime while reading.
PEOPLE_CSV = "/kaggle/input/predicting-red-hat-business-value/people.csv.zip"
people = pd.read_csv(PEOPLE_CSV, parse_dates=['date'])
print(people.shape)

## Explore the contents of the 'activity' dataset

In [None]:
# Preview the first rows of the raw activity data.
df.head()

In [None]:
# Five randomly chosen activity rows; random_state=16 keeps the draw repeatable.
df.sample(5, random_state=16)

In [None]:
# Preview the last rows of the raw activity data.
df.tail()

### Missing Values 

Around nine features have more than 90% null values

In [None]:
# Share of missing values in each column of the activity data.
# The column is labelled 'Null Value %', so scale the fraction to 0-100
# (previously the raw 0-1 fraction was shown under a percent label).
pd.DataFrame(df.isnull().mean() * 100, columns=['Null Value %']).T

Around **nine** features have more than 90% null values. We can't do much to fix these features.

In [None]:
# Keep only the activity columns used downstream; .copy() detaches the result from `df`.
kept_columns = ['people_id', 'activity_id', 'date', 'activity_category', 'char_10', 'outcome']
activity_df = df[kept_columns].copy()

# Re-type every object (string) column as a pandas categorical.
# NOTE(review): depending on the pandas version, a `.loc[:, mask] = ...`
# assignment may keep the existing object dtype — confirm with `.info()`.
is_object = activity_df.dtypes == 'object'
as_category = activity_df.select_dtypes('object').apply(lambda col: col.astype('category'))
activity_df.loc[:, is_object] = as_category

In [None]:
# Column dtypes and non-null counts of the trimmed activity frame.
activity_df.info()

In [None]:
# Give 'date' and 'char_10' activity-specific names so they do not clash with
# the same-named customer columns once the two tables are merged.
activity_renames = {'date': 'activity_date', 'char_10': 'activity_type'}
activity_df = activity_df.rename(columns=activity_renames)

In [None]:
# Missing activity types are imputed with the most frequent value of the column.
most_common_type = activity_df.activity_type.mode()[0]
activity_df.activity_type = activity_df.activity_type.fillna(most_common_type)

In [None]:
# Report the (rows, columns) shape of the cleaned activity frame.
print("Shape of Activity DF:", activity_df.shape)

We can now join the two datasets to create a consolidated activity and customer attributes dataset

## Explore the contents of the 'customer' dataset

In [None]:
# First customer rows, transposed so that each attribute gets its own line.
people.head().T

### Missing Values

None of the columns in the customer dataset has missing values.

In [None]:
# Share of missing values in each column of the customer data, summed over
# all columns: a total of 0 means no column has any missing value.
# The denominator is the customer row count (previously the activity frame's
# row count was used by mistake).
pd.DataFrame(people.isnull().sum()/people.shape[0], columns=['Null Value %']).sum()

In [None]:
# Re-type string and boolean customer columns as pandas categoricals, one dtype at a time.
# NOTE(review): depending on the pandas version, a `.loc[:, mask] = ...`
# assignment may keep the existing dtype — confirm with `.info()`.
for source_dtype in ('object', 'bool'):
    dtype_mask = people.dtypes == source_dtype
    people.loc[:, dtype_mask] = people.select_dtypes(source_dtype).apply(lambda col: col.astype('category'))

In [None]:
# Column dtypes and non-null counts of the customer frame after re-typing.
people.info()

In [None]:
# Attach the customer attributes to every activity row via the shared 'people_id' key.
df_new = activity_df.merge(people, on=["people_id"], how="inner")

# Compare against the frame that actually went into the merge (activity_df),
# not the raw `df`, so the column counts before/after are meaningful.
print("Shape before merging:",activity_df.shape)
print("Shape after merging :",df_new.shape)

We can see that there is a good mix in the distribution of potential customers, as around 45% are potential customers

In [None]:
# Distinct outcome labels, followed by the fraction of rows carrying each label.
outcome = df_new["outcome"]
print("Unique values for outcome:", outcome.unique())
print("\nPercentage of distribution for outcome-")
outcome_share = outcome.value_counts() / df_new.shape[0]
print(outcome_share)

## Data Engineering

In [None]:
# Summary statistics for the customer date and the activity date.
# NOTE(review): `datetime_is_numeric` is a pandas 1.x keyword; newer pandas
# releases reportedly no longer accept it — confirm before upgrading pandas.
df_new[['date', 'activity_date']].describe(datetime_is_numeric=True)

In [None]:
# Express both date columns as a whole number of days relative to a fixed reference date.
date_ref = np.datetime64("2020-01-01")
for source_col, day_col in (('date', 'date_Day'), ('activity_date', 'activity_date_Day')):
    df_new[day_col] = (df_new[source_col] - date_ref).dt.days

# Show the original dates next to their day offsets.
df_new[['date','date_Day', 'activity_date', 'activity_date_Day']]

Let us now have a look at the remaining categorical columns, which have very high numbers of distinct values.

It seems that we can convert all of the preceding categorical columns into numeric by extracting the relevant numeric ID from each of them, 

since each of these columns has values in the form of *someText_someNumber*. 

Rather than converting these categorical columns into a bloated one-hot encoded dataset, we can temporarily use them as numeric features. 

However, if the performance of the model doesn’t reach our desired expectations after several experiments, we might have to revisit these features 

and try our best to incorporate them differently. But for now, we can consider them as numeric features

In [None]:
# Look at the raw text form of the ID-like columns before converting them.
print(df_new[["people_id","activity_type","activity_id", "group_1"]].head())

In [None]:
# Each of these columns holds text of the form <prefix><separator><number>.
# Keep the second separator-delimited field and store it as a number.
# people_id / activity_id are split on '_', group_1 / activity_type on ' '.
id_columns = (
    ("people_id", "_"),
    ("activity_id", "_"),
    ("group_1", " "),
    ("activity_type", " "),
)
for column, separator in id_columns:
    number_text = df_new[column].apply(lambda value, sep=separator: value.split(sep)[1])
    df_new[column] = pd.to_numeric(number_text)

# Double check the new values in the dataframe
print(df_new[["people_id","activity_type","activity_id", "group_1"]].head())

##  One Hot Encoding

In [None]:
# Columns to one-hot encode: char_1 .. char_37 plus the activity category.
categorical = ["char_%d" % idx for idx in range(1, 38)]
categorical.append('activity_category')

# Columns that are fed to the model as plain numbers.
numerical = [
    'people_id', 'activity_id', 'activity_type', 'group_1',
    'char_38', 'date_Day', 'activity_date_Day',
]

In [None]:
# Raw feature matrix (categorical columns first, then numerical ones).
# Nothing has been encoded at this point, so the message says "before".
X = df_new[categorical  + numerical].values
print("\nShape of final df before onehot encoding:",X.shape)

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Column order matters: the first len(categorical) positions are one-hot
# encoded, the remaining len(numerical) positions are passed through as-is.
X = df_new[categorical  + numerical].values

# Derive the positional ranges from the feature lists instead of hard-coding
# 38 / 45, so they stay correct if either list is edited.
n_categorical = len(categorical)
n_features = n_categorical + len(numerical)

color_ohe = OneHotEncoder(categories='auto', drop='first')
c_tranf = ColumnTransformer([
    ('onehot', color_ohe, np.arange(0, n_categorical)),
    ('nothing', 'passthrough', np.arange(n_categorical, n_features))
])
X_tranf = c_tranf.fit_transform(X).astype(float)
print("\nShape of final df after onehot encoding:",X_tranf.shape)

## Split Datasets 60:20:20

Finally, before we begin with the model development, we need to split our datasets into train, validation, and test

In [None]:
from sklearn.model_selection import train_test_split

X = X_tranf
y = df_new[['outcome']].values

# Step 1: hold out 20% of all rows as the test set.
# Step 2: take 25% of the remainder (20% of the total) as the validation set.
# Both splits are stratified on the outcome and use the same seed.
split_seed = 16
X_train_full, X_test, y_train_full, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=split_seed
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full, y_train_full, stratify=y_train_full, test_size=0.25, random_state=split_seed
)

# Check the shape of each new dataset created
partition_shapes = (
    ("Shape of X_train: ", X_train),
    ("Shape of y_train: ", y_train),
    ("\nShape of X_val: ", X_val),
    ("Shape of y_val: ", y_val),
    ("\nShape of X_test: ", X_test),
    ("Shape of y_test: ", y_test),
)
for caption, part in partition_shapes:
    print(caption, part.shape)

## Defining Model Baseline Accuracy

We can see that there is a good mix in the distribution of potential customers, as around 45% are potential customers in each partition like in the population.

We can say that if we do not have any model and make all predictions as 0 (the largest class)

—that is, predicting that none of the customers are potential high-value customers— then we would end up with at least 55.6% accuracy either way. 

This is our baseline accuracy. If we build a model that delivers us an overall accuracy anywhere below our benchmark, then it would be of practically no use.

In [None]:
# Class labels present in the training target.
unique, counts = np.unique(y_train, return_counts=True)
print("Unique values for outcome:",unique)

# For every partition, print one row per class: [label, fraction of rows].
# The third partition is the *test* set (it was previously mislabelled
# "validation" by a copy-paste slip).
for partition_name, y_part in (("training", y_train), ("validation", y_val), ("test", y_test)):
    unique, counts = np.unique(y_part, return_counts=True)
    print("\nPercentage of distribution for outcome in the " + partition_name)
    result = np.column_stack((unique, counts/y_part.shape[0])).round(3)
    print (result)

## Designing the DNN for Classification

The following code snippet builds a *DNN* with just one layer and 256 neurons. 

We choose *binary_crossentropy* (since this is a binary classification problem) as the loss function and *accuracy* as the metric to monitor

In [None]:
from keras.models import Sequential
from keras.layers import Dense

# Deep neural network [Small + 1 layer]: one hidden ReLU layer of 256 units,
# then a single sigmoid unit for the binary outcome.
model = Sequential([
    Dense(256, input_dim=X_train.shape[1], activation="relu"),
    Dense(1, activation="sigmoid"),
])

model.compile(optimizer="Adam", loss="binary_crossentropy", metrics=["accuracy"])
history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=3,
    batch_size=64,
    verbose=2,
)

If you closely observe the results from the training output, you will see that the overall accuracy for training as well as validation datasets was around 0.556 (56%), which is identical to our baseline accuracy. 

We can therefore conclude that training this model further might not be a fruitful idea.

Let’s try a deeper network for the same number of neurons. So, we keep everything the same but add one more layer with the same number of 
neurons

In [None]:
# Deep neural network [Small + 2 layer]: two hidden ReLU layers of 256 units
# each, then a single sigmoid unit for the binary outcome.
model = Sequential([
    Dense(256, input_dim=X_train.shape[1], activation="relu"),
    Dense(256, activation="relu"),
    Dense(1, activation="sigmoid"),
])

model.compile(optimizer="Adam", loss="binary_crossentropy", metrics=["accuracy"])
history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=3,
    batch_size=64,
    verbose=2,
)

Again, as we can see, the initial results are not at all promising. The training and validation accuracy from the deeper network are nowhere near what we would expect.

Instead of trying another deeper network with, say, three to five layers, let us try training with a bigger (medium-sized) network. 

We shall use a new architecture with just one layer but 512 neurons this time. 

Let us again train for three epochs and have a look at the metrics to check whether it is in line with what we would expect.

In [None]:
# Deep neural network [Medium + 1 layer]: one hidden ReLU layer of 512 units,
# then a single sigmoid unit for the binary outcome.
model = Sequential([
    Dense(512, input_dim=X_train.shape[1], activation="relu"),
    Dense(1, activation="sigmoid"),
])

model.compile(optimizer="Adam", loss="binary_crossentropy", metrics=["accuracy"])
history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=3,
    batch_size=64,
    verbose=2,
)

Let’s now try increasing the depth for the medium-sized network to see if the results improve more.

In [None]:
# Deep neural network [Medium + 2 layer]: two hidden ReLU layers of 512 units
# each, then a single sigmoid unit for the binary outcome.
model = Sequential([
    Dense(512, input_dim=X_train.shape[1], activation="relu"),
    Dense(512, activation="relu"),
    Dense(1, activation="sigmoid"),
])

model.compile(optimizer="Adam", loss="binary_crossentropy", metrics=["accuracy"])
history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=3,
    batch_size=64,
    verbose=2,
)

## Revisiting the Data

### Standardize, Normalize, or Scale the Data

In standardization, we transform the data into a form where the mean is 0 and the standard deviation is 1. 

The distribution of the data in this form is a great input candidate for our neuron’s activation function and therefore improves the ability to learn more appropriately

### Transforming the Input Data

To transform the input data for the development of the model, we should only use the training data to fit the scaler transformation and use the same fitted object to transform the validation and test input data

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training partition only, then reuse that fitted
# object for every partition. with_mean=False turns off mean-centering.
scaler = StandardScaler(with_mean=False).fit(X_train)

X_train_scaled, X_val_scaled, X_test_scaled = (
    scaler.transform(partition) for partition in (X_train, X_val, X_test)
)

## DNNs for Classification with Improved Data

In [None]:
# Deep neural network [Medium + 1 layer], this time trained on the scaled
# features: one hidden ReLU layer of 512 units and a sigmoid output unit.
model = Sequential([
    Dense(512, input_dim=X_train_scaled.shape[1], activation="relu"),
    Dense(1, activation="sigmoid"),
])

model.compile(optimizer="Adam", loss="binary_crossentropy", metrics=["accuracy"])
history = model.fit(
    X_train_scaled, y_train,
    validation_data=(X_val_scaled, y_val),
    epochs=3,
    batch_size=64,
    verbose=2,
)

Now, there we go!

We can see the drastic improvement in the performance of the network after providing the standardized datasets. We have an almost 94% accuracy 
on the training and validation datasets. 

Let’s use this model to evaluate the model performance on the test datasets we created earlier

In [None]:
# Score the current model on the held-out test partition and print each
# reported metric rounded to two decimals.
result = model.evaluate(X_test_scaled,y_test)

for metric_name, metric_value in zip(model.metrics_names, result):
    print("Metric ", metric_name, ":", str(round(metric_value, 2)))

We see great results on the test dataset. Let’s try improving the architecture a bit and see. 

We can try a medium-sized deeper network to see if the results are better than with the medium-sized network

In [None]:
# Deep neural network [Medium + 2 layer] on the scaled features: two hidden
# ReLU layers of 512 units each and a sigmoid output unit.
model = Sequential([
    Dense(512, input_dim=X_train_scaled.shape[1], activation="relu"),
    Dense(512, activation="relu"),
    Dense(1, activation="sigmoid"),
])

model.compile(optimizer="Adam", loss="binary_crossentropy", metrics=["accuracy"])
history = model.fit(
    X_train_scaled, y_train,
    validation_data=(X_val_scaled, y_val),
    epochs=3,
    batch_size=64,
    verbose=2,
)

The training and validation accuracy has improved even further to 95%. This small increase with just 3 epochs is awesome. 

We can now be confident of the performance for the model with the architecture. 

We can definitely try many more architectures and check the results, but let’s take a final shot with a larger and deeper network and see the results with 3 
epochs. 

In case we see only small improvements, we will use the same architecture for 25 epochs and use the model for our final predictions.

In [None]:
# Deep neural network [Large + 2 layer] on the scaled features: two hidden
# ReLU layers of 1024 units each and a sigmoid output unit.
model = Sequential([
    Dense(1024, input_dim=X_train_scaled.shape[1], activation="relu"),
    Dense(1024, activation="relu"),
    Dense(1, activation="sigmoid"),
])

model.compile(optimizer="Adam", loss="binary_crossentropy", metrics=["accuracy"])
history = model.fit(
    X_train_scaled, y_train,
    validation_data=(X_val_scaled, y_val),
    epochs=3,
    batch_size=64,
    verbose=2,
)

We see an overall accuracy on the validation dataset as 95% and a similar score for the training dataset. 

So, there really isn’t a lot of improvement in the performance of the model due to increasing the size from a medium (512-neuron) to a larger (1024-neuron) architecture. 

With these results validating our experiments, let’s train a medium-sized (512-neuron) deep network with two layers for 25 epochs, look at the final training and validation accuracy, and then use the trained model to evaluate the test datasets.

In [None]:
# Final model [Medium + 2 layer] on the scaled features, trained for 25 epochs:
# two hidden ReLU layers of 512 units each and a sigmoid output unit.
model = Sequential([
    Dense(512, input_dim=X_train_scaled.shape[1], activation="relu"),
    Dense(512, activation="relu"),
    Dense(1, activation="sigmoid"),
])

model.compile(optimizer="Adam", loss="binary_crossentropy", metrics=["accuracy"])
history = model.fit(
    X_train_scaled, y_train,
    validation_data=(X_val_scaled, y_val),
    epochs=25,
    batch_size=64,
    verbose=2,
)

The final model with a medium-size architecture of 512 neurons and two layers gave great performance results on the training and validation datasets. 

We have an accuracy of ~98% for both datasets. 

Let us now validate the model performance on the test dataset.

In [None]:
# Score the final model on the held-out test partition and print each
# reported metric rounded to two decimals.
result = model.evaluate(X_test_scaled,y_test)

for metric_name, metric_value in zip(model.metrics_names, result):
    print("Metric ", metric_name, ":", str(round(metric_value, 2)))

The performance on the unseen test dataset is also great and consistent. 

Our model is performing really well on the test dataset. 

Let us have a look at the loss curve for the model. 

We will plot the loss in each epoch (25 in total for this model) for the training and validation datasets

In [None]:
# Show which per-epoch series the last fit() call recorded in its history.
print(history.history.keys())

In [None]:
# Training vs. validation loss per epoch, drawn on an explicit Axes object.
fig, ax = plt.subplots()
for series_key in ('loss', 'val_loss'):
    ax.plot(history.history[series_key])
ax.set_title("Model's Training & Validation loss across epochs")
ax.set_ylabel('Loss')
ax.set_xlabel('Epoch')
ax.legend(['Train', 'Validation'], loc='upper right')
plt.show()

In [None]:
# Training vs. validation accuracy per epoch, drawn on an explicit Axes object.
fig, ax = plt.subplots()
for series_key in ('accuracy', 'val_accuracy'):
    ax.plot(history.history[series_key])
ax.set_title("Model's Training & Validation Accuracy across epochs")
ax.set_ylabel('Accuracy')
ax.set_xlabel('Epochs')
ax.legend(['Train', 'Validation'], loc='lower right')
plt.show()


## Using the model

Now that we have a model, we can use it to create a submission

### Preprocessing 

In [None]:
act_test = pd.read_csv("/kaggle/input/predicting-red-hat-business-value/act_test.csv.zip", parse_dates=['date'])

# Same column selection and renames as for the training activities (no
# 'outcome' column is selected here). .copy() makes the slice an independent
# frame so the column assignment below does not write into a view of act_test.
act_test_df = act_test[['people_id', 'activity_id', 'date', 'activity_category', 'char_10']].copy()
act_test_df = act_test_df.rename(columns={'date':'activity_date', 'char_10':'activity_type'})

# Impute missing activity types with the mode of the *training* activities,
# so the test data goes through exactly the transformation the model was
# trained with (previously the test set's own mode was used).
train_activity_type_mode = activity_df.activity_type.mode()[0]
act_test_df.activity_type = act_test_df.activity_type.fillna(train_activity_type_mode)

act_test_df_new = act_test_df.merge(people, on=["people_id"], how="inner")
# The join must keep exactly one row per test activity; otherwise the number
# of predictions no longer matches the rows of the submission file.
assert len(act_test_df_new) == len(act_test_df), (
    "merge with `people` changed the number of test rows: "
    f"{len(act_test_df)} -> {len(act_test_df_new)}"
)

# Day offsets relative to `date_ref`, the reference date defined when the
# training features were built — it must be the same value here.
act_test_df_new['date_Day'] = (act_test_df_new.date - date_ref).dt.days
act_test_df_new['activity_date_Day'] = (act_test_df_new.activity_date - date_ref).dt.days

# Each of these columns holds text of the form <prefix><separator><number>:
# keep the second separator-delimited field and store it as a number.
# people_id / activity_id are split on '_', group_1 / activity_type on ' '.
for column, separator in (("people_id", "_"), ("activity_id", "_"), ("group_1", " "), ("activity_type", " ")):
    number_text = act_test_df_new[column].apply(lambda value, sep=separator: value.split(sep)[1])
    act_test_df_new[column] = pd.to_numeric(number_text)

# Reuse the encoder and scaler fitted on the training data; only transform here.
act_test_X = act_test_df_new[categorical  + numerical].values
act_test_X_tranf = c_tranf.transform(act_test_X).astype(float)

act_test_X_tranf_scaled = scaler.transform(act_test_X_tranf)

### Prediction

In [None]:
# One sigmoid output per test row, i.e. an array with a single column.
prediction = model.predict(act_test_X_tranf_scaled)

sample_submission = pd.read_csv("/kaggle/input/predicting-red-hat-business-value/sample_submission.csv.zip")

# Turn the sigmoid outputs into 0/1 labels with a 0.5 cut-off.
# (The previous int(x[0]) truncated towards zero, so e.g. 0.97 became 0 and
# practically every row was predicted as class 0.)
# NOTE(review): if the competition scores ranked probabilities, consider
# writing prediction.ravel() itself instead of hard labels — confirm.
# NOTE(review): values are assigned by position; this assumes the submission
# template lists activities in the same order as act_test — TODO confirm.
sample_submission['outcome'] = (prediction.ravel() > 0.5).astype(int)
sample_submission.to_csv("submission.csv", index=False)
sample_submission

In [None]:
# Count how many submission rows were assigned to each outcome value.
sample_submission.outcome.value_counts()