# Import libraries

In [None]:
# Familiar imports
import numpy as np
import pandas as pd

# For scaling numeric features, encoding categorical variables, and splitting data
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

# For training tree-based models, scoring predictions, and ranking features by mutual information
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
from sklearn.feature_selection import mutual_info_regression

# Visualization imports
import matplotlib.pyplot as plt
import seaborn as sns

#Neural Networks
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.callbacks import EarlyStopping

# Reaching this line means every import in this cell resolved without error
print('Import Finished')

In [None]:
# Set Matplotlib defaults
# Matplotlib 3.6 renamed its bundled seaborn styles to "seaborn-v0_8-*" and
# 3.8 dropped the old names, so use whichever spelling this install provides.
_grid_style = next(
    (
        name
        for name in ("seaborn-v0_8-whitegrid", "seaborn-whitegrid")
        if name in plt.style.available
    ),
    "default",  # neither spelling is bundled: keep Matplotlib's stock look
)
plt.style.use(_grid_style)
plt.rc("figure", autolayout=True)
# Bold, slightly larger axis labels and titles on every figure
plt.rc(
    "axes",
    labelweight="bold",
    labelsize="large",
    titleweight="bold",
    titlesize=14,
    titlepad=10,
)

# Load the data

In [None]:
# Product Info
# Read the product table; the CSV's first column becomes the row index.
# NOTE(review): later cells use `train` and `test`, which no visible cell
# creates -- confirm where those frames are supposed to be loaded.
products_df = pd.read_csv(
    "../input/learnplatform-covid19-impact-on-digital-learning/products_info.csv",
    index_col=0,
)
products_df.head()

The next code cell separates the target (which we assign to `y`) from the training features (which we assign to `features`).

In [None]:
# Split the training frame into the model inputs and the prediction target
features = train.drop(columns=['target'])
y = train['target']

# Show the first rows of the inputs
features.head()

# Mutual Information Analysis

In [None]:
# Utility Functions

def make_mi_scores(X, y):
    """Score each column of ``X`` by its mutual information with ``y``.

    Object and category columns are replaced by integer codes on a copy of
    ``X`` (the caller's frame is not modified). Every integer-typed column is
    then passed to ``mutual_info_regression`` as a discrete feature.

    Returns a Series named "MI Scores", indexed by the columns of ``X`` and
    sorted from the highest score to the lowest.
    """
    encoded = X.copy()
    for name in encoded.select_dtypes(["object", "category"]).columns:
        codes, _uniques = encoded[name].factorize()
        encoded[name] = codes

    # After factorizing, an integer dtype is what marks a column as discrete
    is_discrete = [pd.api.types.is_integer_dtype(dtype) for dtype in encoded.dtypes]

    raw_scores = mutual_info_regression(
        encoded,
        y,
        discrete_features=is_discrete,
        random_state=0,  # fixed seed so repeated runs give the same scores
    )
    ranked = pd.Series(raw_scores, name="MI Scores", index=encoded.columns)
    return ranked.sort_values(ascending=False)


def plot_mi_scores(scores):
    """Draw a horizontal bar chart of ``scores`` on the current Matplotlib figure.

    ``scores`` is a Series; its index supplies the bar labels. Bars are placed
    in ascending score order, so the largest score gets the highest y position.
    """
    ordered = scores.sort_values(ascending=True)
    positions = np.arange(len(ordered))  # one y slot per score
    plt.barh(positions, ordered)
    plt.yticks(positions, list(ordered.index))
    plt.title("Mutual Information Scores")

In [None]:
# Plot from a copy of the training frame
df = train.copy()

mi_features = ["cont1","cont2","cont3","cont4"]

# Long format: one (target, variable, value) row per feature reading,
# so each feature can be drawn in its own panel against the target
melted = df.melt(id_vars="target", value_vars=mi_features)
sns.relplot(
    data=melted,
    x="value",
    y="target",
    col="variable",
    facet_kws={"sharex": False},  # each panel keeps its own x range
);

In [None]:
mi_features = ["cont5","cont6","cont7","cont8"]

# One scatter panel per feature, each plotted against the target
melted = df.melt(id_vars="target", value_vars=mi_features)
sns.relplot(
    data=melted,
    x="value",
    y="target",
    col="variable",
    facet_kws={"sharex": False},  # each panel keeps its own x range
);

In [None]:
mi_features = ["cont9","cont10", "cont11", "cont12"]

# One scatter panel per feature, each plotted against the target
melted = df.melt(id_vars="target", value_vars=mi_features)
sns.relplot(
    data=melted,
    x="value",
    y="target",
    col="variable",
    facet_kws={"sharex": False},  # each panel keeps its own x range
);

In [None]:
mi_features = ["cat1","cat2", "cat3", "cat4"]

# One panel per categorical feature: category value on x, target on y
melted = df.melt(id_vars="target", value_vars=mi_features)
sns.relplot(
    data=melted,
    x="value",
    y="target",
    col="variable",
    facet_kws={"sharex": False},  # each panel keeps its own x categories
);

In [None]:
mi_features = ["cat5","cat6", "cat7", "cat8", "cat9"]

# One panel per categorical feature: category value on x, target on y
melted = df.melt(id_vars="target", value_vars=mi_features)
sns.relplot(
    data=melted,
    x="value",
    y="target",
    col="variable",
    facet_kws={"sharex": False},  # each panel keeps its own x categories
);

In [None]:
# Rank every feature by its mutual information with the target
mi_scores = make_mi_scores(features, y)
top_scores = mi_scores.head(20)

print(top_scores)
# print(mi_scores.tail(20))  # uncomment to see bottom 20

# Bar chart of the same 20 highest-scoring features
plt.figure(dpi=100, figsize=(8, 5))
plot_mi_scores(top_scores)
# plot_mi_scores(mi_scores.tail(20))  # uncomment to see bottom 20

In [None]:
# Boxen plot of the target distribution for each level of cat5
sns.catplot(
    data=train,
    x="cat5",
    y="target",
    kind="boxen",
);

In [None]:
# Boxen plot of the target distribution for each level of cat8
sns.catplot(
    data=train,
    x="cat8",
    y="target",
    kind="boxen",
);

In [None]:
# Boxen plot of the target distribution for each level of cat1
sns.catplot(
    data=train,
    x="cat1",
    y="target",
    kind="boxen",
);

# Prepare the data


## Checking cardinality of the categorical columns

In [None]:
# Columns whose name contains 'cat' are treated as categorical
object_cols = [name for name in features.columns if 'cat' in name]

# Count the distinct values in each of those columns
object_nunique = [features[name].nunique() for name in object_cols]
d = dict(zip(object_cols, object_nunique))

# Show (column, distinct-count) pairs from lowest to highest cardinality
sorted(d.items(), key=lambda pair: pair[1])

## One Hot Encoding

In [None]:
#One Hot features
OH_features = ['cat6','cat7','cat8','cat9']

X = features.copy()
X_test = test.copy()

# scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the old
# keyword, so try the new spelling first and fall back for older releases.
# handle_unknown='ignore' keeps categories that appear only in the test rows
# from raising at transform time.
try:
    OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

# Fit on the training rows only, then reuse the fitted categories for the
# test rows. The encoder returns a bare array, so re-attach each frame's
# index so the concat below lines the rows up.
OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(X[OH_features]), index=X.index)
OH_cols_valid = pd.DataFrame(OH_encoder.transform(X_test[OH_features]), index=X_test.index)

# Remove categorical columns (will replace with one-hot encoding)
num_X = X.drop(OH_features, axis=1)
num_X_test = X_test.drop(OH_features, axis=1)

# Add one-hot encoded columns to the remaining features
X = pd.concat([num_X, OH_cols_train], axis=1)
X_test = pd.concat([num_X_test, OH_cols_valid], axis=1)

X.head()

## Ordinal Encoder Features

In [None]:
#Ordinal encoder features
OE_features = ['cat0','cat1','cat2','cat3','cat4','cat5']

# Learn the category-to-code mapping from the training rows only,
# then apply that same mapping to both frames
ordinal_encoder = OrdinalEncoder()
ordinal_encoder.fit(X[OE_features])
for frame in (X, X_test):
    frame[OE_features] = ordinal_encoder.transform(frame[OE_features])

# Preview the ordinal-encoded features
X.head()

In [None]:
# (rows, columns) of the encoded training features; the column count is the
# number of inputs the network below receives per row
print(X.shape)

In [None]:
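# Optional feature scaling (disabled). If re-enabled, fit the scaler on the
# training features only and reuse the fitted scaler for the test features.
#scaler = StandardScaler()

#X = pd.DataFrame(scaler.fit_transform(X), index=X.index, columns=X.columns)
#X_test = pd.DataFrame(scaler.transform(X_test), index=X_test.index, columns=X_test.columns)

#X.head()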

Next, we break off a validation set from the training data.

In [None]:
# Hold out part of the rows for validation; random_state=0 makes the split repeatable
X_train, X_valid, y_train, y_valid = train_test_split(X, y, random_state=0)

# Train a model

In [None]:
# Callback that ends training once the monitored loss stops improving
early_stopping = EarlyStopping(
    restore_best_weights=True,  # finish with the weights from the best epoch
    patience=20,  # how many epochs to wait before stopping
    min_delta=0.001,  # minimum amount of change to count as an improvement
)

In [None]:
# Nine identical hidden stages: Dense(256, relu) -> Dropout(0.3) -> BatchNormalization
hidden_stack = []
for _ in range(9):
    hidden_stack.extend([
        layers.Dense(256, activation='relu'),
        layers.Dropout(0.3),
        layers.BatchNormalization(),
    ])

model = keras.Sequential([
    # A Sequential model only honours input_shape on its first layer, so it is
    # declared here, and taken from the training frame instead of a literal so
    # it follows any change to the encoded feature set.
    layers.BatchNormalization(input_shape=[X_train.shape[1]]),
    *hidden_stack,
    # Single linear unit: one regression output per row
    layers.Dense(1),
])

In [None]:
model.compile(
    # 'AdaBelief' is not one of Keras' built-in optimizer identifiers, so
    # compile() rejects that string. Adam is a built-in adaptive optimizer
    # and is used instead.
    optimizer='adam',
    # Train on mean absolute error
    loss='mae',
)

In [None]:
# Reload model from file
#model = keras.models.load_model('my_model.h5')

# Train on the training split and score each epoch on the validation split.
# The epoch count is an upper bound: the early-stopping callback can end
# training sooner.
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=512,
    epochs=1000,
    validation_data=(X_valid, y_valid),
    callbacks=[early_stopping], # put your callbacks in a list
)

# Per-epoch training and validation loss as a table, then as a line plot
history_df = pd.DataFrame(history.history)
history_df[['loss', 'val_loss']].plot();
best_val_loss = history_df['val_loss'].min()
print("Minimum validation loss: {}".format(best_val_loss))

In [None]:
# Write the trained network to disk, then load it back from the same file
model_path = 'my_model.h5'
model.save(model_path)
model = keras.models.load_model(model_path)

In [None]:
# Score the held-out rows with root-mean-squared error.
# The `squared=False` shortcut was deprecated in scikit-learn 1.4 and removed
# in later releases, so take the square root of the MSE explicitly; this form
# works on every scikit-learn version.
preds_valid = model.predict(X_valid)
print(np.sqrt(mean_squared_error(y_valid, preds_valid)))

# Submit to the competition

In [None]:
# Use the model to generate predictions
# The network ends in a single-unit Dense layer, so predict() returns an
# (n_rows, 1) array; flatten it because a DataFrame column must be
# one-dimensional.
predictions = model.predict(X_test).ravel()

# Save the predictions to a CSV file, one row per test index value
output = pd.DataFrame({'Id': X_test.index,
                       'target': predictions})
output.to_csv('submission.csv', index=False)