# Load Data

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
%matplotlib inline

In [None]:
# Read the CalCOFI bottle table, keep the four columns used in this notebook
# and give them short, readable names.
bottle = (
    pd.read_csv("/kaggle/input/calcofi/bottle.csv")
    .loc[:, ["Depthm", "T_degC", "O2ml_L", "Salnty"]]
    .rename(
        columns={
            "Depthm": "Depth",
            "T_degC": "Temp",
            "O2ml_L": "O2 Level",
            "Salnty": "Salinity",
        }
    )
)

# Exploring the data

In [None]:
# Print a structural summary of the frame (columns, dtypes, memory usage).
bottle.info()

In [None]:
# Show summary statistics for each of the selected columns.
bottle.describe()

In [None]:
# Visualise the pairwise correlation matrix of the columns as a heatmap.
sns.heatmap(bottle.corr(), cmap = "coolwarm")

In [None]:
# Plot every column against every other column to eyeball their relationships.
sns.pairplot(bottle)

In [None]:
# Plot the distribution of the Depth column.
# NOTE(review): distplot is deprecated in recent seaborn releases in favour of
# histplot/displot -- confirm the installed seaborn version before switching.
sns.distplot(bottle["Depth"])


# Create Test Set

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set; the fixed seed makes the split repeatable.
bottle_train, bottle_test = train_test_split(bottle, test_size = 0.2, random_state = 42)



# Data Cleaning

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

class NaRemover(BaseEstimator, TransformerMixin):
    """Stateless transformer that discards every row containing a missing value.

    The input must provide a pandas-style ``dropna()`` method.
    """

    def __init__(self):
        """Take no hyper-parameters."""
        # Deliberately empty: there is nothing to configure.

    def fit(self, X, y=None):
        """Learn nothing from ``X``/``y`` and return the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X`` without the rows that hold at least one NA entry."""
        return X.dropna(axis=0, how="any")


In [None]:
from sklearn.pipeline import Pipeline

# Pre-processing pipeline; its single step drops rows with missing values.
full_pipeline = Pipeline(
    steps=[
        ("remove_na", NaRemover()),
    ]
)


In [None]:
# Clean the training rows, then separate the regression target ("O2 Level")
# from the remaining feature columns.
bottle_prepared = full_pipeline.fit_transform(bottle_train)
bottle_labels = bottle_prepared.loc[:, "O2 Level"].copy()
bottle_prepared = bottle_prepared.drop(columns=["O2 Level"])

In [None]:
from sklearn.preprocessing import StandardScaler 
scaler = StandardScaler()
# NOTE(review): the scaled array is only shown as this cell's output; it is not
# assigned back, so bottle_prepared stays unscaled for the models below. If
# scaling is wanted, the result must be stored here and the same fitted scaler
# applied to the test features before predicting.
scaler.fit_transform(bottle_prepared)

In [None]:
# Sanity check: summarise the cleaned features and the target after the NA rows
# were dropped.
bottle_prepared.info()
bottle_labels.to_frame().info()

# Training

### Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression
# Linear regression estimator, created with its default settings.
lin_reg = LinearRegression()

In [None]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation of the linear model. The scorer reports the negated
# MSE, so the sign is flipped before taking the square root to obtain RMSE.
scores = cross_val_score(
    lin_reg,
    bottle_prepared,
    bottle_labels,
    scoring="neg_mean_squared_error",
    cv=10,
)
lin_rmse_scores = np.sqrt(np.negative(scores))

def display_scores(scores):
    """Print the given scores followed by their mean and standard deviation.

    ``scores`` must provide ``mean()`` and ``std()`` methods.
    """
    # Each statistic is computed right before it is printed, in this order.
    report = (
        ("Scores:", lambda values: values),
        ("Mean:", lambda values: values.mean()),
        ("Standard deviation:", lambda values: values.std()),
    )
    for label, compute in report:
        print(label, compute(scores))

# Report the per-fold RMSE of the linear model with its mean and spread.
display_scores(lin_rmse_scores)



In [None]:
# Fit the linear model on the whole cleaned training set.
lin_reg.fit(bottle_prepared, bottle_labels)

### Decision Tree Regressor

In [None]:
from sklearn.tree import DecisionTreeRegressor
# A fixed seed keeps the tree (and therefore its cross-validation scores and
# the final test RMSE) reproducible between runs, matching the seeded
# train/test split.
tree_reg = DecisionTreeRegressor(random_state=42)

In [None]:
# 10-fold cross-validation of the decision tree. The scorer reports the negated
# MSE, so the sign is flipped before taking the square root to obtain RMSE.
scores = cross_val_score(
    tree_reg,
    bottle_prepared,
    bottle_labels,
    scoring="neg_mean_squared_error",
    cv=10,
)
tree_rmse_scores = np.sqrt(np.negative(scores))

def display_scores(scores):
    """Print the given scores followed by their mean and standard deviation.

    ``scores`` must provide ``mean()`` and ``std()`` methods.
    """
    # Each statistic is computed right before it is printed, in this order.
    report = (
        ("Scores:", lambda values: values),
        ("Mean:", lambda values: values.mean()),
        ("Standard deviation:", lambda values: values.std()),
    )
    for label, compute in report:
        print(label, compute(scores))

# Report the per-fold RMSE of the decision tree with its mean and spread.
display_scores(tree_rmse_scores)

In [None]:
# Fit the decision tree on the whole cleaned training set.
tree_reg.fit(bottle_prepared, bottle_labels)

# Final Result

In [None]:
# Clean the held-out set with the pipeline that was already fitted on the
# training data. transform() is used instead of fit_transform() so that nothing
# is ever re-learned from test rows.
# NOTE: this cell overwrites bottle_test, so run it only once per split.
bottle_test = full_pipeline.transform(bottle_test)
bottle_test_labels = bottle_test["O2 Level"].copy()

# Keep only the feature columns for prediction.
bottle_test = bottle_test.drop("O2 Level", axis=1)

In [None]:
# mean_squared_error was used without ever being imported in this notebook.
from sklearn.metrics import mean_squared_error

# Predict the target for the held-out features with the fitted tree.
final_predictions = tree_reg.predict(bottle_test)
# scikit-learn metrics take their arguments as (y_true, y_pred).
final_mse = mean_squared_error(bottle_test_labels, final_predictions)
final_rmse = np.sqrt(final_mse)
print(final_rmse)


In [None]:
from scipy import stats
# 95% t-interval for the mean squared test error, converted to RMSE units by
# taking the square root of both interval bounds.
confidence = 0.95
squared_errors = (final_predictions - bottle_test_labels) ** 2
np.sqrt(
    stats.t.interval(
        confidence,
        df=len(squared_errors) - 1,
        loc=squared_errors.mean(),
        scale=stats.sem(squared_errors),
    )
)