In [None]:
#Feature Importance .
"""
Exploratory data analysis is an approach to understanding data better and finding any patterns in it. Data always relates to a certain context; without that context it has no importance or significance. The next step is to understand the weight (importance) of the dataset's variables, which we can do by calculating feature importance. Finding the significant variables in a dataset is a pre-processing step, done before fitting the data to whichever model we think is appropriate for the problem at hand.
If we use only the important features of the dataset, we can:
• reduce the training time
• reduce the cost of computation
• lower the risk of overfitting the model
My experiment with the Abalone data, classifying their ages as > 11 years or <= 11 years, has shown me that Length and Diameter have no significance in classifying them into different age groups. The interesting part is that there is no loss of accuracy in this case. Sometimes there might be a loss of accuracy, but we always have to consider the tradeoff when deciding whether to include or exclude a feature.
"""



In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score



In [None]:
# Read the raw abalone file; header=None because the names are assigned below.
abalone = pd.read_csv('../input/abalone.data.csv', header=None)
abalone.columns = [
    'Sex',
    'Length',
    'Diameter',
    'Height',
    'Whole_weight',
    'Shucked_weight',
    'Viscera_weight',
    'Shell_weight',
    'Rings',
]
# Quick look at the first five rows to confirm the columns line up.
print(abalone.head(n=5))

In [None]:
# EDA: display the column labels currently attached to the abalone DataFrame.
abalone.columns

In [None]:
# Print the summary that DataFrame.info() produces for the abalone frame.
abalone.info()

In [None]:
def class_group(x, threshold=11):
    """Map a ring count to a binary class label.

    Parameters
    ----------
    x : number
        Ring count of a single abalone.
    threshold : number, optional
        Largest ring count that still belongs to class 0. Defaults to 11.

    Returns
    -------
    int
        0 when ``x <= threshold``, otherwise 1. A NaN input compares false
        and therefore yields 1.
    """
    return 0 if x <= threshold else 1

    

In [None]:
# Derive the binary target column by passing every Rings value through class_group.
abalone['class'] = abalone['Rings'].apply(class_group)

In [None]:
# Show the first two rows, now including the derived 'class' column.
abalone.head(2)

In [None]:
# Display the descriptive statistics table returned by DataFrame.describe().
abalone.describe()

In [None]:
# Count how many rows carry each distinct Rings value.
abalone['Rings'].value_counts()


In [None]:
# Overlay the Rings distribution of each Sex category on a single figure.
plt.figure(figsize=(12,10))
for sex_code, bar_color, legend_text in (
    ('M', 'blue', 'Sex=Male'),
    ('F', 'red', 'Sex=Female'),
    ('I', 'green', 'Sex=Infant'),
):
    rings_for_sex = abalone[abalone['Sex']==sex_code]['Rings']
    # Semi-transparent bars so the overlapping histograms stay readable.
    rings_for_sex.hist(alpha=0.5, color=bar_color, bins=30, label=legend_text)
plt.legend()
plt.xlabel('Rings')

In [None]:
# Pairwise correlation coefficients using the pandas default method
# (Pearson, the standard correlation coefficient).
# 'Sex' holds string codes ('M'/'F'/'I'); pandas >= 2.0 raises on non-numeric
# columns in DataFrame.corr() instead of silently dropping them, so restrict
# the frame to numeric dtypes before computing the matrix.
df_corr = abalone.select_dtypes(include='number').corr()
    


In [None]:
# Apply the seaborn theme BEFORE creating the figure: style/font settings are
# picked up as plot elements are created, so setting the theme after drawing
# the heatmap left this figure only partly styled on a first run.
sns.set(font_scale=2,style='white')

plt.figure(figsize=(15,10))
sns.heatmap(df_corr, cmap="YlGnBu")  # correlation matrix as a colour grid
plt.title('Heatmap correlation')
plt.show()

In [None]:
# Baseline model: random forest trained on all seven physical measurements.
X=abalone[['Length','Diameter','Height','Whole_weight','Shucked_weight','Viscera_weight','Shell_weight']]
y=abalone['class']
# 70/30 train/test split with a fixed seed so the same rows are held out every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=101)
# Seed the forest as well as the split; an unseeded forest gives a slightly
# different accuracy on every run, which makes the comparison with the
# reduced-feature model unrepeatable.
rfc = RandomForestClassifier(n_estimators=100, random_state=101)
rfc.fit(X_train,y_train)
y_test_pred_rfc = rfc.predict(X_test)
# Last expression of the cell: held-out accuracy of the baseline model.
accuracy_score(y_test, y_test_pred_rfc)

In [None]:
# Emit both evaluation summaries for the baseline model, one item per line.
print(
    'Classification report:',
    classification_report(y_test,y_test_pred_rfc),
    'Confusion matrix:',
    confusion_matrix(y_test,y_test_pred_rfc),
    sep='\n',
)

In [None]:
# Apply the seaborn theme before the figure is created so it applies to the
# whole plot regardless of which cells ran earlier.
sns.set(font_scale=2,style='white')
plt.figure(figsize=(10,5))
# confusion_matrix already returns a NumPy array, so no extra np.array wrap is needed.
cm = confusion_matrix(y_test,y_test_pred_rfc)
sns.heatmap(cm, cmap="YlGnBu")  # baseline confusion matrix as a colour grid
plt.title('Confusion Matrix')
plt.show()

**Let's build the model with the new important features/variables.
Based on the feature importance calculation below:
Length has 0 weightage, and Diameter, too, is similar to Length. So let's build a new model considering feature importance and see if the model performance changes.**

In [None]:
# Adapted from the scikit-learn forest-importances example:
# https://scikit-learn.org/stable/auto_examples/ensemble/plot_forest_importances.html#sphx-glr-auto-examples-ensemble-plot-forest-importances-py

# NOTE: this cell expects `rfc` to be the forest fitted on `X` (all seven
# columns). A later cell rebinds `rfc` to the reduced-feature model, so run
# this cell in notebook order.
importances = rfc.feature_importances_
print(importances)
# Spread of each feature's importance across the individual trees (error bars).
std = np.std([tree.feature_importances_ for tree in rfc.estimators_],
             axis=0)
# Column positions of X ordered from most to least important.
indices = np.argsort(importances)[::-1]
# Matching column names, so the ranking and the plot are readable without
# mapping positional indices back to X by hand.
feature_names = X.columns[indices]

# Print the feature ranking
print("Feature ranking:")

for rank, (idx, name) in enumerate(zip(indices, feature_names), start=1):
    print("{}.feature {} {} ({})".format(rank, idx, name, importances[idx]))

# Plot the feature importances of the forest
n_features = X.shape[1]
plt.figure()
plt.title("Feature/Variable Selection")
plt.bar(range(n_features), importances[indices],
       color="r", yerr=std[indices], align="center")
# Label bars with feature names; rotate so long names do not overlap.
plt.xticks(range(n_features), feature_names, rotation=45, ha="right")
plt.xlim([-1, n_features])
plt.show()



In [None]:
# Reduced model: same pipeline as the baseline, without Length and Diameter.
Xfs=abalone[['Height','Whole_weight','Shucked_weight','Viscera_weight','Shell_weight']]
yfs=abalone['class']
# Seed the forest so the accuracy is repeatable and can be compared fairly
# against the baseline; the split below already uses a fixed seed.
# NOTE: this rebinds `rfc`, replacing the seven-feature model fitted earlier.
rfc = RandomForestClassifier(n_estimators=100, random_state=101)
Xfs_train, Xfs_test, yfs_train, yfs_test = train_test_split(Xfs, yfs, test_size=0.30, random_state=101)
rfc.fit(Xfs_train,yfs_train)
yfs_test_pred_rfc = rfc.predict(Xfs_test)
# Last expression of the cell: held-out accuracy of the reduced model.
accuracy_score(yfs_test, yfs_test_pred_rfc)

In [None]:
# Emit both evaluation summaries for the reduced-feature model, one item per line.
print(
    'FeatureImporatance Random Forest Classification report:',
    classification_report(yfs_test,yfs_test_pred_rfc),
    'FeatureImporatance Random Forest Confusion matrix:',
    confusion_matrix(yfs_test,yfs_test_pred_rfc),
    sep='\n',
)

In [None]:
# Apply the seaborn theme before the figure is created so it applies to the
# whole plot regardless of which cells ran earlier.
sns.set(font_scale=2,style='white')
plt.figure(figsize=(10,5))
# confusion_matrix already returns a NumPy array, so no extra np.array wrap is needed.
cm_fs = confusion_matrix(yfs_test,yfs_test_pred_rfc)
# Plot the reduced-feature matrix (cm_fs). The original passed `cm`, which is
# the baseline model's matrix, so this figure repeated the earlier plot.
sns.heatmap(cm_fs, cmap="YlGnBu")
plt.title('Confusion Matrix')
plt.show()