In [1]:
#Authot: Sateesh K
#Date  : 10/May/2020
#Purpose: Build an ML model with Random forest of  trees by using Abalone gender predictions.


In [2]:
#House keeping
import pandas as pd
df = pd.read_csv('abalone.csv')
df.describe()

Unnamed: 0,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
count,4177.0,4177.0,4177.0,4177.0,4177.0,4177.0,4177.0,4177.0
mean,0.523992,0.407881,0.139516,0.828742,0.359367,0.180594,0.238831,9.933684
std,0.120093,0.09924,0.041827,0.490389,0.221963,0.109614,0.139203,3.224169
min,0.075,0.055,0.0,0.002,0.001,0.0005,0.0015,1.0
25%,0.45,0.35,0.115,0.4415,0.186,0.0935,0.13,8.0
50%,0.545,0.425,0.14,0.7995,0.336,0.171,0.234,9.0
75%,0.615,0.48,0.165,1.153,0.502,0.253,0.329,11.0
max,0.815,0.65,1.13,2.8255,1.488,0.76,1.005,29.0


In [3]:
#From previous analysis of data,it is clear the data is clean
df.shape

(4177, 9)

In [4]:
#Remove all rows whose Sex value is I, this is because I represents Infant and this can neither be used for training or testing.
df_no_infants = df[df.Sex != 'I']

In [5]:
#There is only one categorical variable, do a one-hot encoding.
#Select the variables to be one-hot encoded
one_hot_features = ['Sex']
# Convert categorical variables into dummy/indicator variables (i.e. one-hot encoding).
one_hot_encoded = pd.get_dummies(df_no_infants[one_hot_features],drop_first=True)
one_hot_encoded.info(verbose=True, memory_usage=True, null_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2835 entries, 0 to 4176
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   Sex_M   2835 non-null   uint8
dtypes: uint8(1)
memory usage: 24.9 KB


In [6]:
# Replacing categorical columns with dummies
df_conv = df_no_infants.drop(one_hot_features,axis=1)
df_conv = pd.concat([df_no_infants, one_hot_encoded] ,axis=1)
df_conv.head()

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings,Sex_M
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15,1
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7,1
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9,0
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10,1
6,F,0.53,0.415,0.15,0.7775,0.237,0.1415,0.33,20,0


In [7]:
#Divide the data into X, input variables and Y the output variable.
X = df_conv.drop('Sex',axis=1)
y= df_conv['Sex']

X = X.drop('Sex_M',axis=1)
X.head()



Unnamed: 0,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
0,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
6,0.53,0.415,0.15,0.7775,0.237,0.1415,0.33,20


In [8]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X)

# Scale and center the data
fdf_normalized = scaler.transform(X)
#print(fdf_normalized)

# Create a pandas DataFrame of independent variables, using fdf_normalized has normalized all the values.
fdf_normalized = pd.DataFrame(data=fdf_normalized, index=X.index, columns=X.columns)

#Split the data into training and testing
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(fdf_normalized, y, test_size=0.23,random_state=1234)
print(type(X_train))
print(type(fdf_normalized))


<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>


In [9]:
#Build the Random Forest of Trees model
from sklearn.ensemble import RandomForestClassifier
rf_clf = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=40, max_features='sqrt', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=5,
                       min_weight_fraction_leaf=0.0, n_estimators=18,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [10]:
#Do a cross validation before fitting the model within training data-set. The test data remains the same, so X_test and y_test 
# are not converted into folds.
#So training data is split into K-folds. So the training data itself is split into folded training data and folded test data
import numpy as np
from sklearn.model_selection import KFold
from sklearn import metrics

for i in range(5):
#K-fold the X values
    kf_x = KFold(n_splits=5,random_state=1234,shuffle=True)
    result = next(kf_x.split(X_train), None)
    X_train_folded = X_train.iloc[result[0]]
#K-fold the y values
    kf_y = KFold(n_splits=5,random_state=1234,shuffle=True)
    result = next(kf_y.split(y_train), None)
    y_train_folded = y_train.iloc[result[0]]
    #print(X_train_folded)
    #print(y_train_folded)
    rf_clf.fit(X_train_folded, y_train_folded)
    y_pred = rf_clf.predict(X_test)
    print("Train Accuracy for test data: ",i, metrics.accuracy_score(y_test, y_pred))
    


Train Accuracy for test data:  0 0.554364471669219
Train Accuracy for test data:  1 0.5176110260336907
Train Accuracy for test data:  2 0.5237366003062787
Train Accuracy for test data:  3 0.5650842266462481
Train Accuracy for test data:  4 0.5237366003062787


In [11]:
y_pred = rf_clf.predict(X_test)
#Just print the first five elements.
y_pred[0:5]


array(['M', 'F', 'F', 'F', 'M'], dtype=object)

In [12]:
# comparing the metrics of predicted lebel and real label of test data
from sklearn import metrics
print("Train Accuracy for test data: ", metrics.accuracy_score(y_test, y_pred))


Train Accuracy for test data:  0.5237366003062787


In [13]:
#Print the confusion matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)  
print(cm)

[[139 158]
 [153 203]]


In [14]:
#Check accuracy with training data, meaning do a prediction with training data as inputs using Random forest
y_pred = rf_clf.predict(X_train)
#Now y_pred and y_train should be exactly identical
from sklearn import metrics
print("Train Accuracy for training data: ", metrics.accuracy_score(y_train, y_pred))

#Print the confusion matrix for training data used as testing data
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_train, y_pred)  
print(cm)

Train Accuracy for training data:  0.8918423464711274
[[ 902  108]
 [ 128 1044]]


In [15]:
#Now on to investigations on why the model performs so well with training data set (98%) and poorly with test data-set (52%)
X_train.describe()

#Visuzalize the X_train data-set
#import pandas_profiling
#report = pandas_profiling.ProfileReport(X_train)
#report.to_file('X_train.html')

Unnamed: 0,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
count,2182.0,2182.0,2182.0,2182.0,2182.0,2182.0,2182.0,2182.0
mean,0.021323,0.02534,0.017977,0.018925,0.014403,0.015478,0.021421,0.008554
std,0.990841,0.990363,1.036542,1.000307,1.000068,0.995329,1.001857,1.003814
min,-4.327093,-4.267164,-3.726796,-2.209813,-2.038835,-2.155125,-2.221905,-2.574659
25%,-0.517224,-0.525233,-0.51951,-0.672476,-0.690769,-0.662916,-0.669257,-0.61944
50%,0.161246,0.172415,0.015037,-0.008495,-0.02145,-0.041981,-0.022967,-0.29357
75%,0.683146,0.679796,0.549585,0.629005,0.613696,0.656265,0.611678,0.358169
max,2.561985,2.582473,26.074231,3.991395,4.944238,5.276466,5.541334,5.897956


In [16]:
X_test.describe()

Unnamed: 0,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
count,653.0,653.0,653.0,653.0,653.0,653.0,653.0,653.0
mean,-0.071251,-0.084674,-0.06007,-0.063236,-0.048129,-0.051719,-0.071577,-0.028583
std,1.028328,1.028544,0.865698,0.997901,0.999798,1.015264,0.991951,0.988149
min,-4.327093,-4.203741,-3.459522,-2.191055,-2.027051,-2.13549,-2.202497,-2.248789
25%,-0.673794,-0.652078,-0.51951,-0.787508,-0.773256,-0.770905,-0.746889,-0.61944
50%,0.056866,0.04557,0.015037,-0.089044,-0.096866,-0.118064,-0.110304,-0.29357
75%,0.630956,0.679796,0.549585,0.523353,0.603091,0.569137,0.495229,0.358169
max,2.144465,2.01167,2.286864,3.394446,3.801211,3.352302,4.609746,3.942737
