In [2]:
#Authot: Sateesh K
#Date  : 10/May/2020
#Purpose: Build an ML model with Random forest of  trees by using Abalone gender predictions.


In [3]:
#House keeping
import pandas as pd
df = pd.read_csv('abalone.csv')
df.describe()

Unnamed: 0,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
count,4177.0,4177.0,4177.0,4177.0,4177.0,4177.0,4177.0,4177.0
mean,0.523992,0.407881,0.139516,0.828742,0.359367,0.180594,0.238831,9.933684
std,0.120093,0.09924,0.041827,0.490389,0.221963,0.109614,0.139203,3.224169
min,0.075,0.055,0.0,0.002,0.001,0.0005,0.0015,1.0
25%,0.45,0.35,0.115,0.4415,0.186,0.0935,0.13,8.0
50%,0.545,0.425,0.14,0.7995,0.336,0.171,0.234,9.0
75%,0.615,0.48,0.165,1.153,0.502,0.253,0.329,11.0
max,0.815,0.65,1.13,2.8255,1.488,0.76,1.005,29.0


In [4]:
#From previous analysis of data,it is clear the data is clean
df.shape

(4177, 9)

In [5]:
#Remove all rows whose Sex value is I, this is because I represents Infant and this can neither be used for training or testing.
df_no_infants = df[df.Sex != 'I']

In [6]:
#There is only one categorical variable, do a one-hot encoding.
#Select the variables to be one-hot encoded
one_hot_features = ['Sex']
# Convert categorical variables into dummy/indicator variables (i.e. one-hot encoding).
one_hot_encoded = pd.get_dummies(df_no_infants[one_hot_features],drop_first=True)
one_hot_encoded.info(verbose=True, memory_usage=True, null_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2835 entries, 0 to 4176
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   Sex_M   2835 non-null   uint8
dtypes: uint8(1)
memory usage: 24.9 KB


In [7]:
# Replacing categorical columns with dummies
df_conv = df_no_infants.drop(one_hot_features,axis=1)
df_conv = pd.concat([df_no_infants, one_hot_encoded] ,axis=1)
df_conv.head()

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings,Sex_M
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15,1
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7,1
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9,0
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10,1
6,F,0.53,0.415,0.15,0.7775,0.237,0.1415,0.33,20,0


In [8]:
#Divide the data into X, input variables and Y the output variable.
X = df_conv.drop('Sex',axis=1)
y= df_conv['Sex']

X = X.drop('Sex_M',axis=1)
X.head()



Unnamed: 0,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
0,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
6,0.53,0.415,0.15,0.7775,0.237,0.1415,0.33,20


In [59]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X)

# Scale and center the data
fdf_normalized = scaler.transform(X)
print(fdf_normalized)

# Create a pandas DataFrame of independent variables, using fdf_normalized has normalized all the values.
fdf_normalized = pd.DataFrame(data=fdf_normalized, index=X.index, columns=X.columns)

#Split the data into training and testing
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(fdf_normalized, y, test_size=0.67,random_state=101)



[[-1.19569382 -1.03261325 -1.58860558 ... -1.19304302 -1.09623511
   1.33577877]
 [-2.29168359 -2.30106448 -1.72224247 ... -1.70844397 -1.71729425
  -1.27117954]
 [-0.41284398 -0.33496508 -0.51951046 ... -0.79544801 -0.63044075
  -0.61943996]
 ...
 [ 0.31781586  0.36268309  1.35140599 ...  0.63785747  0.1303567
  -0.61943996]
 [ 0.57876581  0.48952821 -0.11859979 ...  0.37770271  0.03719783
  -0.29357017]
 [ 1.46599562  1.37744407  1.08413222 ...  1.51158479  1.58208245
   0.3581694 ]]


In [60]:
#Build the Random Forest of Trees model
from sklearn.ensemble import RandomForestClassifier
rf_clf = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=40, max_features='sqrt', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=5,
                       min_weight_fraction_leaf=0.0, n_estimators=18,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)
rf_clf.fit(X_train, y_train)                                          # Fitting the model



RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=40, max_features='sqrt',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=5,
                       min_weight_fraction_leaf=0.0, n_estimators=18,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [61]:
y_pred = rf_clf.predict(X_test)
#Just print the first five elements.
y_pred[0:5]


array(['F', 'F', 'M', 'F', 'F'], dtype=object)

In [62]:
# comparing the metrics of predicted lebel and real label of test data
from sklearn import metrics
print("Train Accuracy for test data: ", metrics.accuracy_score(y_test, y_pred))


Train Accuracy for test data:  0.533157894736842


In [63]:
#Print the confusion matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)  
print(cm)

[[372 517]
 [370 641]]


In [64]:
#Check accuracy with training data, meaning do a prediction with training data as inputs using Random forest
y_pred = rf_clf.predict(X_train)
#Now y_pred and y_train should be exactly identical
from sklearn import metrics
print("Train Accuracy for training data: ", metrics.accuracy_score(y_train, y_pred))

#Print the confusion matrix for training data used as testing data
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_train, y_pred)  
print(cm)

Train Accuracy for training data:  0.9775401069518717
[[401  17]
 [  4 513]]


In [83]:
#Now on to investigations on why the model performs so well with training data set (98%) and poorly with test data-set (52%)
X_train.describe()

#Visuzalize the X_train data-set
import pandas_profiling
report = pandas_profiling.ProfileReport(X_train)
report.to_file('X_train.html')

TypeError: concat() got an unexpected keyword argument 'join_axes'

In [82]:
X_test.describe()

Unnamed: 0,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
count,1900.0,1900.0,1900.0,1900.0,1900.0,1900.0,1900.0,1900.0
mean,0.004264,0.001408,0.004135,0.001904,0.00437,-0.002593,0.005591,-0.006976
std,1.000224,1.003882,1.049906,0.998866,0.998502,0.995732,1.009262,0.983779
min,-4.327093,-4.267164,-3.726796,-2.209813,-2.038835,-2.155125,-2.221905,-2.574659
25%,-0.569414,-0.588655,-0.51951,-0.668614,-0.720229,-0.687459,-0.669257,-0.61944
50%,0.161246,0.172415,0.015037,-0.033321,-0.023807,-0.039527,-0.048198,-0.29357
75%,0.683146,0.679796,0.549585,0.600868,0.606626,0.624359,0.600033,0.358169
max,2.561985,2.582473,26.074231,3.889881,4.944238,5.276466,5.541334,5.897956
