# Regression with an Abalone Dataset
Input files - [Downloaded here](https://www.kaggle.com/competitions/playground-series-s4e4/data)

Original dataset - [here](https://archive.ics.uci.edu/dataset/1/abalone)
## Development Notes/Ideas
-  Example of a good notebook from a similar regression competition: https://www.kaggle.com/code/oscarm524/ps-s3-ep16-eda-modeling-submission/notebook
- Can incorporate original data with train data, and flag as original data
## Libraries

In [15]:
### libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import boxcox
from sklearn.preprocessing import RobustScaler
from sklearn.impute import KNNImputer

## Load and Preview Data
### Data Description
**Name / Data Type / Measurement Unit / Description**
- Sex / nominal / -- / M, F, and I (infant)
- Length / continuous / mm / Longest shell measurement
- Diameter / continuous / mm / Perpendicular to length
- Height / continuous / mm / with meat in shell
- Whole weight / continuous / grams / whole abalone
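- Shucked weight / continuous / grams / weight of meat (appears to correspond to the `Whole weight.1` column in the competition files)
- Viscera weight / continuous / grams / gut weight (after bleeding) (appears to correspond to the `Whole weight.2` column in the competition files)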
- Shell weight / continuous / grams / after being dried
- Rings / integer / -- / +1.5 gives the age in years

In [2]:
### load the train/test splits from the local download folder
train_raw, test_raw = (
    pd.read_csv(csv_path)
    for csv_path in ('Data_Download/train.csv', 'Data_Download/test.csv')
)

### structure of each frame (dtypes, non-null counts), separated by blank lines
train_raw.info()
print("\n")
test_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 90615 entries, 0 to 90614
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              90615 non-null  int64  
 1   Sex             90615 non-null  object 
 2   Length          90615 non-null  float64
 3   Diameter        90615 non-null  float64
 4   Height          90615 non-null  float64
 5   Whole weight    90615 non-null  float64
 6   Whole weight.1  90615 non-null  float64
 7   Whole weight.2  90615 non-null  float64
 8   Shell weight    90615 non-null  float64
 9   Rings           90615 non-null  int64  
dtypes: float64(7), int64(2), object(1)
memory usage: 6.9+ MB


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60411 entries, 0 to 60410
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              60411 non-null  int64  
 1   Sex             60411 non-null  object 

In [3]:
### peek at the first five training rows
train_raw.iloc[:5]

Unnamed: 0,id,Sex,Length,Diameter,Height,Whole weight,Whole weight.1,Whole weight.2,Shell weight,Rings
0,0,F,0.55,0.43,0.15,0.7715,0.3285,0.1465,0.24,11
1,1,F,0.63,0.49,0.145,1.13,0.458,0.2765,0.32,11
2,2,I,0.16,0.11,0.025,0.021,0.0055,0.003,0.005,6
3,3,M,0.595,0.475,0.15,0.9145,0.3755,0.2055,0.25,10
4,4,I,0.555,0.425,0.13,0.782,0.3695,0.16,0.1975,9


In [4]:
### summarise data: include='all' also reports the non-numeric column (Sex: unique/top/freq)
### alongside the numeric statistics; statistics that do not apply to a column show as NaN
train_raw.describe(include='all')

Unnamed: 0,id,Sex,Length,Diameter,Height,Whole weight,Whole weight.1,Whole weight.2,Shell weight,Rings
count,90615.0,90615,90615.0,90615.0,90615.0,90615.0,90615.0,90615.0,90615.0,90615.0
unique,,3,,,,,,,,
top,,I,,,,,,,,
freq,,33093,,,,,,,,
mean,45307.0,,0.517098,0.401679,0.135464,0.789035,0.340778,0.169422,0.225898,9.696794
std,26158.441658,,0.118217,0.098026,0.038008,0.457671,0.204428,0.100909,0.130203,3.176221
min,0.0,,0.075,0.055,0.0,0.002,0.001,0.0005,0.0015,1.0
25%,22653.5,,0.445,0.345,0.11,0.419,0.1775,0.0865,0.12,8.0
50%,45307.0,,0.545,0.425,0.14,0.7995,0.33,0.166,0.225,9.0
75%,67960.5,,0.6,0.47,0.16,1.0675,0.463,0.2325,0.305,11.0


## Pre-processing

In [5]:
## working tables without the row identifier; the raw frames stay untouched
train_clean = train_raw.drop(columns='id')
test_clean = test_raw.drop(columns='id')

### Fill NULLS

In [6]:
## list the columns holding at least one NaN in each table (an empty Index means none)
for label, frame in (('train: ', train_clean), ('test: ', test_clean)):
    print(label, frame.columns[frame.isna().any()])

train:  Index([], dtype='object')
test:  Index([], dtype='object')


In [7]:
## training rows recorded with a Height of exactly 0
train_clean.loc[train_clean['Height'].eq(0)]

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Whole weight.1,Whole weight.2,Shell weight,Rings
7025,I,0.315,0.23,0.0,0.134,0.0545,0.0245,0.3505,8
14208,I,0.43,0.34,0.0,0.344,0.188,0.0705,0.105,8
32078,M,0.47,0.355,0.0,0.5785,0.2965,0.155,0.136,7
52324,I,0.315,0.23,0.0,0.134,0.053,0.0375,0.3505,7
70295,I,0.43,0.34,0.0,0.406,0.2035,0.096,0.123,7
90295,M,0.43,0.325,0.0,0.3335,0.135,0.0585,0.105,10


In [8]:
## for height=0, use KNN imputer as height is highly correlated with other variables (seen in EDA)
## NOTE: missing_values=0 treats a 0 in ANY column passed to the imputer as missing, not only Height
## add_indicator=True appends a missingindicator_<column> flag for columns that had missing values at fit time
imp_knn = KNNImputer(missing_values=0, n_neighbors=10, add_indicator=True).set_output(transform='pandas')

## impute on the numeric predictors only: Rings is the target and Sex is non-numeric
train_imp = train_clean.drop(['Rings','Sex'],axis=1)
train_imp = imp_knn.fit_transform(train_imp)

## the test table reuses the imputer fitted on train
test_imp = test_clean.drop(['Sex'],axis=1)
test_imp = imp_knn.transform(test_imp)

## combine all back in to a single train/test tables
train_imp = pd.concat([train_imp, train_clean[['Rings','Sex']]], axis=1)
train_clean = train_imp[train_clean.columns].copy()

## Sex must be re-attached from the TEST table: taking it from train_clean would assign
## train labels to test rows and, through index alignment, pad test with NaN rows up to the train length
test_imp = pd.concat([test_imp, test_clean[['Sex']]], axis=1)
test_clean = test_imp[test_clean.columns].copy()

## imputation must never change the number of rows
assert len(train_clean) == len(train_raw)
assert len(test_clean) == len(test_raw)

In [9]:
## rows whose Height was filled in by the imputer (flagged by the indicator column)
train_imp.loc[train_imp['missingindicator_Height'] == 1]

Unnamed: 0,Length,Diameter,Height,Whole weight,Whole weight.1,Whole weight.2,Shell weight,missingindicator_Height,Rings,Sex
7025,0.315,0.23,0.084,0.134,0.0545,0.0245,0.3505,1.0,8,I
14208,0.43,0.34,0.0975,0.344,0.188,0.0705,0.105,1.0,8,I
32078,0.47,0.355,0.123,0.5785,0.2965,0.155,0.136,1.0,7,M
52324,0.315,0.23,0.084,0.134,0.053,0.0375,0.3505,1.0,7,I
70295,0.43,0.34,0.1105,0.406,0.2035,0.096,0.123,1.0,7,I
90295,0.43,0.325,0.101,0.3335,0.135,0.0585,0.105,1.0,10,M


### Fix Column DataTypes

In [10]:
## cast every object-typed column to the category dtype (train and test handled separately)
train_clean = train_clean.astype(
    {name: 'category' for name in train_clean.columns[train_clean.dtypes==object]}
)
test_clean = test_clean.astype(
    {name: 'category' for name in test_clean.columns[test_clean.dtypes==object]}
)

## show the category levels of each categorical column in train
for cat_col in train_clean.select_dtypes(include='category').columns:
    print(cat_col, ': \t', train_clean[cat_col].cat.categories)

Sex : 	 Index(['F', 'I', 'M'], dtype='object')


### Feature Engineering
- [x] Volume cube = Length * Diameter * Height
- [ ] Water weight = whole weight - whole weight 1 - whole weight 2 - shell weight

In [11]:
## Volume_cube = Length * Diameter * Height, added to both tables
for frame in (train_clean, test_clean):
    frame['Volume_cube'] = frame['Length'].mul(frame['Diameter']).mul(frame['Height'])

### Unskew Variables

In [12]:
## calculate skew of numeric variables (sorted from most positive to most negative)
skew_var = train_clean.skew(numeric_only=True).sort_values(ascending=False)
skew_var_test = test_clean.skew(numeric_only=True).sort_values(ascending=False)

## apply boxcox algorithm to optimise normalisation/removal of skew
## except Rings - force to use log transformation - easier to transform back later
train_clean_boxcox  = pd.DataFrame()
test_clean_boxcox  = pd.DataFrame()
boxcox_lmbdas = {}  # variable name -> lambda applied (0 recorded for the log-transformed Rings)

for var in skew_var.keys():
    if var != 'Rings':
        ## inputs are shifted by +1 before the transform; lambda is estimated on train only
        ## and then reused unchanged on test so both tables get the same transformation
        train_clean_boxcox[var],lmbda = boxcox(train_clean[var]+1)
        boxcox_lmbdas[var] = lmbda
        test_clean_boxcox[var] = boxcox(test_clean[var]+1, lmbda=lmbda)
    else:
        ## Rings exists in train only; plain log with no +1 shift
        train_clean_boxcox[var] = np.log(train_clean[var])
        boxcox_lmbdas[var] = 0    
skew_var_boxcox = train_clean_boxcox.skew(numeric_only=True).sort_values(ascending=False)
skew_var_boxcox_test = test_clean_boxcox.skew(numeric_only=True).sort_values(ascending=False)

# compare skewness before and after boxcox (0 means no skew)
# skew_var and skew_var_boxcox are each sorted by their own values, so the transformed
# skews are re-ordered by variable name to line up with the 'Vars' and 'Raw' columns
pd.DataFrame({
    'Vars': skew_var.index,
    'Raw': skew_var.values,
    'Transformed': skew_var_boxcox.reindex(skew_var.index).values,
})

Unnamed: 0,Vars,Raw,Transformed
0,Rings,1.204273,0.041034
1,Whole weight.1,0.591973,0.030645
2,Volume_cube,0.520651,0.026793
3,Shell weight,0.479092,0.017243
4,Whole weight.2,0.476733,0.01712
5,Whole weight,0.429316,-0.011446
6,Height,0.312737,-0.077509
7,Diameter,-0.695236,-0.078338
8,Length,-0.732015,-0.197299


In [13]:
## Overwrite the numeric columns with their unskewed versions
## (Rings is only present in the train transformation table)
for name in train_clean_boxcox.columns:
    train_clean[name] = train_clean_boxcox[name]
for name in test_clean_boxcox.columns:
    test_clean[name] = test_clean_boxcox[name]

### Scaling Variables

In [17]:
## Numeric predictors to scale (already unskewed); the target Rings is left out
cols_num = train_clean.select_dtypes(include='number').columns.drop('Rings').to_list()

## Fit the scaler on train only, then apply the same fitted scaler to test
scaler = RobustScaler()
train_clean_scaled = pd.DataFrame(scaler.fit_transform(train_clean[cols_num]), columns=cols_num)
test_clean_scaled = pd.DataFrame(scaler.transform(test_clean[cols_num]), columns=cols_num)

## Overwrite the numeric columns with their scaled versions
for scaled_col in cols_num:
    train_clean[scaled_col] = train_clean_scaled[scaled_col]
    test_clean[scaled_col] = test_clean_scaled[scaled_col]

## Save data

In [14]:
## persist the pre-processed train/test tables as pickle files in the working directory
for frame, out_path in ((train_clean, 'train_clean.pkl'), (test_clean, 'test_clean.pkl')):
    frame.to_pickle(out_path)