In [17]:
import pandas as pd
pd.set_option("display.max_columns",500)
pd.set_option("display.max_rows",1000)
import os
import logging
import seaborn as sns
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn import model_selection
from sklearn import linear_model
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, OneHotEncoder, QuantileTransformer, RobustScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import TransformedTargetRegressor, make_column_transformer
from sklearn.utils import shuffle
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from scipy import stats
import category_encoders as ce
sns.set_style("darkgrid")
%matplotlib inline

In [18]:
# Logger for this notebook: DEBUG and above goes to a stream handler, with
# each record stamped with time, logger name and level.
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')

ch = logging.StreamHandler()
ch.setLevel(logging.DEBUG)
ch.setFormatter(formatter)

logger = logging.getLogger('crab_analysis')
logger.setLevel(logging.DEBUG)
# getLogger returns the same object every time this cell is run, so drop any
# handlers attached by an earlier run before adding the new one; otherwise
# every message would be emitted once per run of the cell.
logger.handlers = []
logger.addHandler(ch)

In [19]:
# Column names used by the analysis, one per line.
columns = [
    "length",
    "diameter",
    "height",
    "weight",
    "shucked_weight",
    "viscera_weight",
    "shell_weight",
    "age",
    "sex",
]
# Same names in the same order, minus "sex" (the only one not treated as continuous).
continuous_var_columns = [name for name in columns if name != "sex"]


In [20]:
# Load the raw data and fill missing numeric values with their column mean.
# numeric_only=True restricts the mean to numeric columns: with a non-numeric
# column in the frame (presumably "sex" holds string labels -- confirm against
# the CSV), a plain DataFrame.mean() raises TypeError on pandas >= 2.0.
# Missing values in non-numeric columns are left untouched.
raw_df = pd.read_csv("crab_data.csv")
raw_df = raw_df.fillna(raw_df.mean(numeric_only=True))

# Outlier removal on the continuous columns: `x` holds every row in which at
# least one continuous value has an absolute z-score of 3 or more; those rows
# are dropped to build crab_df_woo. raw_df itself keeps all rows.
# Note that crab_df_woo keeps raw_df's index labels, so its index has gaps
# wherever a row was dropped.
raw_df_cont = raw_df[continuous_var_columns]
x = raw_df_cont[~(np.abs(stats.zscore(raw_df_cont)) < 3).all(axis=1)]
crab_df_woo = raw_df.drop(x.index)

In [21]:
def reverse_ohe(row):
    """
    Collapse the one-hot ``sex_*`` indicators of a row back into one label.

    The indicators are checked in the order ``sex_F``, ``sex_M``, ``sex_I``
    and the first one that equals 1 decides the result.

    :param row: object indexable by ``"sex_F"``, ``"sex_M"`` and ``"sex_I"``
        (e.g. a DataFrame row)
    :return: ``'F'``, ``'M'`` or ``'I'``; ``'na'`` when no indicator equals 1
    """
    for label in ("F", "M", "I"):
        if row["sex_" + label] == 1:
            return label
    # no indicator set -- not expected for a properly encoded row
    return 'na'

In [29]:
# Linear regression on the outlier-free data (crab_df_woo).
# The target looked skewed, so TransformedTargetRegressor maps it through a
# quantile transform to a normal distribution before fitting and maps the
# predictions back afterwards. Only the target (age) is transformed this way;
# the feature columns are passed to the regressor as they are.
transformer = QuantileTransformer(output_distribution='normal')
reg = linear_model.LinearRegression()
t_reg = TransformedTargetRegressor(regressor=reg, transformer=transformer)

# One-hot encode the categorical column(s); use_cat_names=True names the new
# columns after the category values (sex_F / sex_M / sex_I are relied on below).
ohe = ce.OneHotEncoder(handle_unknown='ignore', use_cat_names=True, drop_invariant=True)
crab_df_woo_enc = ohe.fit_transform(crab_df_woo)
X = crab_df_woo_enc.drop("age", axis=1)
y = crab_df_woo_enc[["age"]]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=100)
t_reg.fit(X_train, y_train)
s1 = t_reg.score(X_test, y_test)
y_pred = t_reg.predict(X)

# Rebuild a frame with the features, the true age and the OLS prediction.
# The arrays are assigned directly so that values are matched to rows by
# position. Wrapping them in pd.Series would give them a fresh 0..n-1 index,
# and pandas aligns on index labels when assigning a Series as a column; the
# outlier removal leaves gaps in the row index, so that alignment would put
# values on the wrong rows and leave NaN where labels do not match.
crab_df = X.copy()
crab_df["age"] = y.values.ravel()
crab_df["age_ols"] = y_pred.ravel()
crab_df['sex'] = crab_df.apply(reverse_ohe, axis=1)
crab_df = crab_df.drop(["sex_I", "sex_M", "sex_F"], axis=1)

# NOTE(review): MAE and RMSE are computed over every row of X (training rows
# included), while the R-squared (s1) is scored on the held-out test split
# only, so the three numbers are not directly comparable.
# `mse` holds the ROOT mean squared error; the name is kept as it was.
mse = np.sqrt(mean_squared_error(y, y_pred))
mae = mean_absolute_error(y, y_pred)
logger.debug("MAE: {0}".format(mae))
logger.debug("RMSE: {0}".format(mse))
logger.debug("R-squared: {0}".format(s1))


2019-12-04 22:24:19,052 - crab_analysis - DEBUG - MAE: 1.4528987918744949
2019-12-04 22:24:19,058 - crab_analysis - DEBUG - RMSE: 2.0374213086105413
2019-12-04 22:24:19,058 - crab_analysis - DEBUG - R-squared: 0.4600900592575342


In [30]:
# Random forest on the full data set (raw_df, outlier rows included).
X = raw_df.drop("age", axis=1)
y = raw_df[["age"]]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=100)

# Boolean masks over the feature columns: float columns are scaled, every
# other column is one-hot encoded.
numerical_features = X_train.dtypes == 'float'
categorical_features = ~numerical_features
# The encoder is left at its default output and sparse_threshold=0 makes the
# column transformer always return a dense array. This replaces
# OneHotEncoder(sparse=False): that keyword was renamed to `sparse_output` in
# scikit-learn 1.2 and later removed, so neither spelling works on every version.
preprocess = make_column_transformer(
    (RobustScaler(), numerical_features),
    (OneHotEncoder(), categorical_features),
    sparse_threshold=0,
)
# NOTE(review): 5000 trees are fitted on a single core here; passing
# n_jobs=-1 would be worth considering if the run time matters.
forest = RandomForestRegressor(n_estimators=5000, max_depth=20, min_samples_leaf=2,
                               min_samples_split=4, random_state=100)
f_reg = Pipeline(steps=[('preprocess', preprocess), ('model', forest)])
# No transformer or func is given, so the target is passed through unchanged.
f_reg_ttr = TransformedTargetRegressor(regressor=f_reg)
f_reg_ttr.fit(X_train, y_train)
s = f_reg_ttr.score(X_test, y_test)
y_pred = f_reg_ttr.predict(X)

# Recreate the original dataset with the forest prediction next to the true
# age. The arrays are assigned directly so rows are matched by position and
# the result does not depend on raw_df having a default 0..n-1 index.
crab_df = X.copy()
crab_df["age"] = y.values.ravel()
crab_df["age_forest"] = y_pred.ravel()

# NOTE(review): MAE and RMSE are computed over every row of X (training rows
# included), while the R-squared (s) is scored on the held-out test split
# only, so the three numbers are not directly comparable.
# `mse` holds the ROOT mean squared error; the name is kept as it was.
mse = np.sqrt(mean_squared_error(y, y_pred))
mae = mean_absolute_error(y, y_pred)
logger.debug("MAE: {0}".format(mae))
logger.debug("RMSE: {0}".format(mse))
logger.debug("R-squared: {0}".format(s))


2019-12-04 22:25:37,166 - crab_analysis - DEBUG - MAE: 0.8578217836058949
2019-12-04 22:25:37,167 - crab_analysis - DEBUG - RMSE: 1.3054290811288343
2019-12-04 22:25:37,168 - crab_analysis - DEBUG - R-squared: 0.5495520636895386
