In [5]:
# Global imports
import numpy as np
import pandas as pd
from scipy import stats
from scipy.stats import norm
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import svm
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
# User defined files
from split_dataset import split_dataset
from model_accuracy import model_accuracy
# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

In [6]:
# Begin the analysis: read the abalone CSV (path is relative to the
# current working directory) into a DataFrame.
abalone = pd.read_csv(filepath_or_buffer='abalone.csv')

In [7]:
# Drop rows with a non-positive Height as well as the tall outliers
# (Height >= 0.4) that may cause skewness; both bounds are exclusive.
abalone = abalone.loc[(abalone.Height > 0) & (abalone.Height < 0.4)]
# Assign the nine column labels.
abalone.columns = [
    'Sex',
    'Length',
    'Diameter',
    'Height',
    'Whole weight',
    'Shucked weight',
    'Viscera weight',
    'Shell weight',
    'Rings',
]
# Preview five randomly drawn rows of the cleaned frame.
abalone.sample(5)

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
2717,I,0.345,0.255,0.095,0.183,0.075,0.0385,0.06,6
3507,F,0.64,0.5,0.165,1.1635,0.554,0.239,0.32,11
225,F,0.5,0.37,0.135,0.45,0.1715,0.1055,0.155,9
465,I,0.175,0.125,0.05,0.0235,0.008,0.0035,0.008,5
2925,I,0.605,0.48,0.155,0.9995,0.425,0.1985,0.3,10


In [8]:
# Check dataset skewness
# nf: labels of the numeric columns; cf: labels of the object-dtype columns.
nf = abalone.select_dtypes(include=[np.number]).columns
# The `np.object` alias was deprecated in NumPy 1.20 and removed in 1.24,
# so the dtype is named by its string 'object' instead.
cf = abalone.select_dtypes(include=['object']).columns
# One skewness value per numeric column, in the same order as `nf`.
skew_list = stats.skew(abalone[nf])
# Pair each feature label with its skewness in a two-column frame.
skew_list_df = pd.DataFrame({'Features': nf, 'Skewness': skew_list})
# Display the features from most positively to most negatively skewed.
skew_list_df.sort_values(by='Skewness', ascending=False)

Unnamed: 0,Features,Skewness
7,Rings,1.112866
4,Shucked weight,0.714381
6,Shell weight,0.620855
5,Viscera weight,0.589246
3,Whole weight,0.528131
2,Height,-0.249226
1,Diameter,-0.610588
0,Length,-0.641314
