In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split #will be used to generate a train, validation, and test set
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from mlxtend.preprocessing import minmax_scaling
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import GridSearchCV

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        full_path = os.path.join(root, fname)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Seed NumPy's legacy global random number generator so that any later use of
# np.random produces the same sequence on every run of the notebook.
np.random.seed(0)

Let's look at our data! To keep things simple, this first test uses only the data in bottle.csv. Our goal is to predict "T_degC" from the compositional elements of the water: anything that is "in" the water counts as a valid independent variable for the purposes of this test. Other columns, such as quality measurements, time indicators, and depth, are dropped (you'll see this happen later).

We're going to be using all of the relevant compositional data from 1949 to 2016 as our training & validation set. I retrieved a set of data from 2017 to 2019 from CalCOFI's website to use as a test set.

In [None]:
# Load the training/validation table. low_memory=False makes pandas infer each
# column's dtype from the whole file instead of chunk by chunk.
bottle_csv = '../input/calcofi/bottle.csv'
X_full = pd.read_csv(bottle_csv, low_memory=False)
X_full.head()

In [None]:
# Load the separately downloaded 2017-2019 table that serves as the hold-out test set.
test_bottle_csv = '../input/2017-to-2019-bottle/2017 to 2019 bottle.csv'
X_test_full = pd.read_csv(test_bottle_csv, low_memory=False)
X_test_full.head()

Wow, that's a good amount of data. Let's try to get a sense of how much of it is usable.

In [None]:
# Summarise how much of the training table is empty: count the missing entries
# in every column, then compare their total with the total number of cells.
missing_values_count = X_full.isnull().sum()
# DataFrame.size is rows * columns. It replaces np.product(X_full.shape):
# np.product was deprecated in NumPy 1.25 and removed in NumPy 2.0.
total_cells = X_full.size
total_missing = missing_values_count.sum()

percent_missing = (total_missing/total_cells * 100)
print("Total cells:", total_cells)
print("Total cells missing a value:", total_missing)
print(percent_missing)

Yikes -- almost half of our cells are empty!

In [None]:
# Rows without a recorded T_degC cannot be used for fitting or scoring, so
# remove them (in place) from both the training table and the test table.
for frame in (X_full, X_test_full):
    frame.dropna(axis=0, subset=['T_degC'], inplace=True)

In [None]:
# Separate the target from the predictors and carve out a validation set.
#
# The excluded columns are removed in a single, non-mutating drop that produces
# a new frame (X_features). X_full itself is left untouched, so this cell can be
# re-run without failing on an already-dropped 'T_degC' column, and the earlier
# cells that inspect X_full keep describing the same data.
TARGET_COL = 'T_degC'
y = X_full[TARGET_COL]
y_test = X_test_full[TARGET_COL]

# Columns kept out of the feature set, grouped by the reason given for each group.
COLUMNS_TO_DROP = (
    # our dependent variable
    [TARGET_COL]
    # all temperature-based variables
    + ['T_prec', 'T_qual', 'R_TEMP', 'R_POTEMP']
    # precision and quality metrics
    + ['Sta_ID', 'Depth_ID', 'DIC Quality Comment', 'S_prec', 'S_qual', 'P_qual',
       'O_qual', 'SThtaq', 'O2Satq', 'Chlqua', 'Phaqua', 'PO4q', 'SiO3qu', 'NO2q',
       'NO3q', 'NH3q', 'C14A1p', 'C14A1q', 'C14A2p', 'C14A2q', 'DarkAp', 'DarkAq',
       'MeanAp', 'MeanAq']
    # R_ columns
    + ['R_Depth', 'R_SALINITY', 'R_SIGMA', 'R_SVA', 'R_DYNHT', 'R_O2', 'R_O2Sat',
       'R_SIO3', 'R_PO4', 'R_NO3', 'R_NO2', 'R_NH4', 'R_CHLA', 'R_PHAEO', 'R_SAMP',
       'R_PRES']
    # date-time related factors
    + ['IncTim', 'Cst_Cnt', 'Btl_Cnt']
    # depth, bottle number and record indicator
    + ['Depthm', 'BtlNum', 'RecInd']
)

# DataFrame.drop raises KeyError if any listed column is absent, before anything changes.
X_features = X_full.drop(columns=COLUMNS_TO_DROP)

# 80/20 train/validation split with a fixed random_state so the partition is repeatable.
X_train_full, X_valid_full, y_train, y_valid = train_test_split(
    X_features, y, train_size=0.8, test_size=0.2, random_state=0)

In [None]:
# Sort the training columns into two groups by dtype: 'object' columns are
# treated as categorical, 'int64'/'float64' columns as numerical. Columns of any
# other dtype end up in neither list and are therefore not used by the model.
categorical_cols = []
numerical_cols = []
for cname in X_train_full.columns:
    col_dtype = X_train_full[cname].dtype
    if col_dtype == 'object':
        categorical_cols.append(cname)
    elif col_dtype in ['int64', 'float64']:
        numerical_cols.append(cname)

# Apply the same column selection (categorical first, then numerical) to the
# training, validation and test tables, working on copies.
my_cols = categorical_cols + numerical_cols
X_train = X_train_full[my_cols].copy()
X_valid = X_valid_full[my_cols].copy()
X_test = X_test_full[my_cols].copy()

print(X_train.shape)
print(X_test.shape)
X_train.head()

In [None]:
# Numerical columns: fill each missing value with that column's most frequent value.
numerical_transformer = SimpleImputer(strategy='most_frequent')

# Categorical columns: fill missing values the same way, then one-hot encode.
# handle_unknown='ignore' means categories not seen during fit do not raise an error.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each column group to its transformer; the numerical output comes first,
# followed by the encoded categorical output.
column_transformers = [
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
]
preprocessor = ColumnTransformer(transformers=column_transformers)

In [None]:
# Hyper-parameter candidates for the grid search (one value each, so the "grid"
# currently contains a single combination).
n_estimators = [600]
max_depth = [6]
learning_rate = [0.125]

# Base regressor. The three tuned arguments given here are replaced by the grid
# values above for every candidate the search fits.
model = XGBRegressor(n_estimators=300, max_depth=5, learning_rate=0.135, random_state=0)

hyperF = {
    'n_estimators': n_estimators,
    'max_depth': max_depth,
    'learning_rate': learning_rate,
}
gridF = GridSearchCV(estimator=model, param_grid=hyperF, cv=2, verbose=1, n_jobs=-1)

# The grid search is the final step of the pipeline, so it is fitted on the
# output of the preprocessor.
pipeline_steps = [
    ('preprocessor', preprocessor),
    ('model', gridF),
]
grid_test = Pipeline(steps=pipeline_steps)

# Fit preprocessing + search on the training split.
grid_test.fit(X_train, y_train)

# Score the refitted best model on the validation split.
preds = grid_test.predict(X_valid)
valid_mae = mean_absolute_error(y_valid, preds)

print('MAE:', valid_mae)
print(grid_test.named_steps["model"].best_params_)
#0.081 MAE with learning_rate = 0.125, max_depth = 6, n_estimators = 600

In [None]:
from xgboost import plot_importance

# Per-feature importance of the best model found by the grid search, measured as
# "gain". The result is a dict keyed by the booster's feature identifiers.
feature_importance = grid_test.named_steps["model"].best_estimator_.get_booster().get_score(importance_type="gain")

# plot_importance draws the dict values as given, so label the plot explicitly:
# the default labels do not say that these numbers are gain values.
fig, ax = plt.subplots()
plot_importance(feature_importance, ax=ax, title='Feature importance (gain)', xlabel='Gain')
plt.show()
print(len(feature_importance))
print(feature_importance)

In [None]:
# Build a table of feature name / booster feature key / importance score, sorted
# from most to least important.
#
# The booster keys have the form 'f<idx>', where <idx> is the column position in
# the matrix produced by the fitted preprocessor. Each key is decoded through
# that index instead of being paired positionally with X_train.columns, because
# get_score() leaves out features that were never used in a split and the
# preprocessor's output order need not match X_train.columns. A positional zip
# would therefore attach names to the wrong scores.
fitted_preprocessor = grid_test.named_steps["preprocessor"]
transformed_names = [
    # ColumnTransformer prefixes names with '<transformer>__'; keep the column part only.
    name.split("__", 1)[-1]
    for name in fitted_preprocessor.get_feature_names_out()
]

results = {}  # feature name -> booster feature key
rows = []
for key, score in feature_importance.items():
    position = key[1:]
    if key.startswith("f") and position.isdigit() and int(position) < len(transformed_names):
        label = transformed_names[int(position)]
    else:
        # Not an 'f<idx>' key (e.g. the booster already carries real feature names).
        label = key
    results[label] = key
    rows.append({'Feature name': label, 'Feature number': key, 'F-score': score})

# The 'F-score' column name is kept for continuity; it holds the scores from feature_importance.
merged = pd.DataFrame(rows, columns=['Feature name', 'Feature number', 'F-score']).sort_values(by=['F-score'], ascending=False)
print(merged)

In [None]:
# Run the fitted pipeline on the 2017-2019 hold-out table and compare the
# predictions with the recorded temperatures.
preds_test = grid_test.predict(X_test)

print(y_test)
print(preds_test)

test_mae = mean_absolute_error(y_test, preds_test)
print('MAE for 2017 to 2019 test:', test_mae)