# **Data Analysis and Visualisation**

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
#Basic imports
import numpy as np
import pandas as pd

#sklearn imports
from sklearn.decomposition import PCA #Principal Component Analysis
from sklearn.manifold import TSNE #T-Distributed Stochastic Neighbor Embedding
from sklearn.cluster import KMeans #K-Means Clustering
from sklearn.preprocessing import StandardScaler #used for 'Feature Scaling'

#plotly imports
import plotly as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import matplotlib.pyplot as plt

import seaborn as sns
init_notebook_mode(connected=True)
from IPython.display import HTML, Image

# Load mushroom data

In [None]:
# Read the mushroom CSV from the Kaggle input directory and preview the first rows.
# df_raw keeps the original single-letter category codes; the encoded copy is built later.
df_raw = pd.read_csv('../input/mushroom-classification/mushrooms.csv')
df_raw.head()

In [None]:
import pandas_profiling as pdpf

# Build a profiling report object for the raw frame; it is displayed in a later cell.
profile = pdpf.ProfileReport(df_raw);

<div class="alert alert-block alert-info">
<b>Tip:</b> Use blue boxes (alert-info) for tips and notes. 
</div>

In [None]:
# Show the profiling report built above via its widget interface.
profile.to_widgets()

# Numerical encoding of data

In [None]:
from sklearn.preprocessing import LabelEncoder
# NOTE(review): `le` is never used below; apply() creates and fits its own encoder.
le = LabelEncoder()
# Integer-encode every column of df_raw (the 'class' target included), column by column.
# The fitted encoder is not kept, so the integer codes cannot be inverse-transformed later.
df = df_raw.apply(LabelEncoder().fit_transform)   
df.head()

In [None]:
# Summary statistics of the integer-encoded columns.
df.describe()

# Data Visualization

In [None]:
# Class balance of the encoded target, shown as absolute counts (bar) and shares (pie).
class_counts = df['class'].value_counts()
f, ax = plt.subplots(1, 2, figsize=(15, 7))
bar_ax, pie_ax = ax
class_counts.plot.bar(ax=bar_ax)
class_counts.plot.pie(ax=pie_ax, autopct="%.2f%%");

In [None]:
# One histogram per encoded column; the trailing ';' hides the returned axes repr.
df.hist(figsize=(15,15));

In [None]:
sns.set(style="white")
# Restrict the pair grid to four encoded columns.
dfx = df.loc[:,['gill-color','ring-type','gill-size', 'habitat']]
g = sns.PairGrid(dfx, diag_sharey=False)
# Lower triangle: 2-D KDE plots; upper triangle: scatter plots; diagonal: 1-D KDE curves.
g.map_lower(sns.kdeplot, cmap="Blues_d")
g.map_upper(plt.scatter)
g.map_diag(sns.kdeplot, lw=3);

In [None]:
# Pairwise correlation matrix of the integer-encoded columns (target included).
corr=df.corr()

# Smaller font and a wide figure so the per-cell annotations stay legible.
sns.set(font_scale=0.75)
plt.figure(figsize=(35, 15))

# vmax=.8 caps the colour scale; annot=True writes each coefficient into its cell.
sns.heatmap(corr, vmax=.8, linewidths=0.01, square=True,annot=True,cmap='YlGnBu',linecolor="black")
plt.title('Correlation between features');

# Get x, y data

In [None]:
# Target: the encoded 'class' column. Features: every other encoded column.
y = df['class']
x = df.drop(columns=['class'])

# Remove constant features

In [None]:
# using sklearn variancethreshold to find constant features

from sklearn.feature_selection import VarianceThreshold
# Fit a zero-variance filter; get_support() is True for the columns it would keep.
sel = VarianceThreshold(threshold=0)
sel.fit(x)
# Invert the keep-mask to list the constant (zero-variance) columns, in column order.
const_columns = x.columns[~sel.get_support()].tolist()
print(const_columns)

In [None]:
# Remove the constant columns found above. errors='ignore' keeps this cell idempotent:
# re-running it after the columns are already gone no longer raises KeyError.
x = x.drop(columns=const_columns, errors='ignore')

# Select best features

In [None]:
# Import SelectKBest, chi2(score function for classification), f_regression (score function for regression)
from sklearn.feature_selection import SelectKBest, chi2, f_regression
# Fit a chi-squared selector on the features and target.
# Only the fitted scores_ array is used below; the selector's transform is never called.
X_clf_new = SelectKBest(score_func=chi2, k=2).fit(x,y)

# Column positions ordered from the highest chi2 score to the lowest.
indices = np.argsort(X_clf_new.scores_)[::-1]

# Column names in that same descending-score order.
features = [x.columns[position] for position in indices]

# Bar chart of the scores, best feature first.
plt.figure()
plt.bar(features, X_clf_new.scores_[indices], color='r', align='center')
plt.xticks(rotation=90)
plt.show()

# Split data into training and test

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows as the test set; random_state=0 fixes the split across runs.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)

# Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import plot_confusion_matrix

# Decision tree on all remaining features; random_state fixes the seed used by the tree.
DTC = DecisionTreeClassifier(random_state = 10)

# Fit on the training split.
DTC.fit(X_train, y_train);

# Confusion matrix on the held-out test split.
# NOTE(review): plot_confusion_matrix was removed in newer scikit-learn releases
# (ConfusionMatrixDisplay.from_estimator replaces it) — confirm the version in use.
plot_confusion_matrix(DTC, X_test, y_test, cmap=plt.cm.Blues);

In [None]:
# Score of the decision tree on the held-out test split.
print(DTC.score(X_test , y_test));

In [None]:
from sklearn.tree import export_graphviz
import pydot
# Feature names in the column order of x; reused by later cells.
feature_list = x.columns.values

# Write the fitted tree to a Graphviz .dot file, convert it to PNG with pydot, and display it.
# class_names are listed in the order edible, poisonous.
export_graphviz(DTC, out_file = 'mushrooms_DTC.dot', feature_names = feature_list, rounded = True, precision = 1, filled = True, class_names=['edible','poisonous'])
(graph, ) = pydot.graph_from_dot_file('mushrooms_DTC.dot')
graph.write_png('mushrooms_DTC.png');
Image('mushrooms_DTC.png')

In [None]:
# Decision-tree feature importances, most important first, shown as a single wide row.
feature_import = (
    pd.DataFrame(data=DTC.feature_importances_, index=feature_list, columns=['values'])
    .sort_values('values', ascending=False)
)
feature_import.T

# Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import plot_confusion_matrix

# Random forest of 10 bootstrapped trees; random_state fixes the seed.
RFC = RandomForestClassifier(n_estimators=10, bootstrap=True, random_state = 0)

# Fit on the training split.
RFC.fit(X_train, y_train);

# Confusion matrix on the held-out test split.
# NOTE(review): plot_confusion_matrix was removed in newer scikit-learn releases
# (ConfusionMatrixDisplay.from_estimator replaces it) — confirm the version in use.
plot_confusion_matrix(RFC, X_test, y_test, cmap=plt.cm.Blues);

In [None]:
# Score of the random forest on the held-out test split.
print(RFC.score(X_test , y_test))

In [None]:
# Export only the first tree of the forest (RFC.estimators_[0]) to a .dot file,
# convert it to PNG with pydot, and display it. Relies on export_graphviz, pydot and
# feature_list from the decision-tree export cell.
export_graphviz(RFC.estimators_[0], out_file = 'mushrooms_RFC.dot', feature_names = feature_list, rounded = True, precision = 1, filled = True, class_names=['edible','poisonous'])
(graph, ) = pydot.graph_from_dot_file('mushrooms_RFC.dot')
graph.write_png('mushrooms_RFC.png');
Image('mushrooms_RFC.png')

In [None]:
# Random-forest feature importances, most important first, shown as a single wide row.
# This cell previously read DTC.feature_importances_ (copied from the decision-tree
# section), so it repeated the decision-tree table instead of reporting the forest's.
feature_import = (
    pd.DataFrame(data=RFC.feature_importances_, index=feature_list, columns=['values'])
    .sort_values('values', ascending=False)
)
feature_import.transpose()

# Decision Tree with 7 best features

In [None]:
# The seven feature names with the highest chi2 scores (features is sorted best-first).
features[0:7]

In [None]:
# Restrict both splits to the seven best-scoring features.
top_features = features[0:7]
X_train_7 = X_train[top_features]
X_test_7 = X_test[top_features]

# Same decision-tree configuration as before, trained on the reduced feature set.
DTC7 = DecisionTreeClassifier(random_state = 10).fit(X_train_7, y_train)

# Confusion matrix of the reduced model on the held-out test split.
plot_confusion_matrix(DTC7, X_test_7, y_test, cmap=plt.cm.Blues);

In [None]:
# Importances of the seven-feature tree, most important first, shown as a single wide row.
feature_import = (
    pd.DataFrame(data=DTC7.feature_importances_, index=features[0:7], columns=['values'])
    .sort_values('values', ascending=False)
)
feature_import.T

# Deep neural network

In [None]:
import tensorflow as tf
from tensorflow.keras import models
from tensorflow.keras import layers
from tensorflow.keras import optimizers

def CreateModel(dropout = 0.1):
    """Build and compile a small fully connected binary classifier.

    The input width is read from the notebook-level X_train (its number of columns).
    Architecture: Dense(128, relu) -> Dropout -> Dense(32, relu) -> Dropout ->
    Dense(1, sigmoid), compiled with Adam, binary cross-entropy and accuracy.

    Args:
        dropout: Rate used by both Dropout layers.

    Returns:
        The compiled tf.keras.Sequential model.
    """
    n_features = X_train.values.shape[1]
    network = tf.keras.Sequential([
        layers.Dense(128, activation='relu', input_shape=(n_features,)),
        layers.Dropout(dropout),
        layers.Dense(32, activation='relu'),
        layers.Dropout(dropout),
        layers.Dense(1, activation='sigmoid'),
    ])
    network.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return network

In [None]:
# Train the dense network on the encoded training split for 10 epochs.
# NOTE(review): the test split is passed as validation_data, so the validation curve
# is computed on the same rows that serve as the held-out test set.
model = CreateModel()
history = model.fit(X_train.values, y_train.values, epochs=10, batch_size=16, validation_data=(X_test.values,y_test.values))

In [None]:
# Per-epoch accuracy: dots for the training data, a line for the validation data.
# Fixes the misspelled legend label and labels both axes so the figure stands alone.
plt.plot(history.history['accuracy'],'bo', label='Training acc')
plt.plot(history.history['val_accuracy'],'b', label='Validation acc')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend();

In [None]:
import shap
# Use the first 10 training rows as the background sample for the explainer.
background = X_train[0:10].values
explainer = shap.DeepExplainer(model,  background)
# SHAP values are computed for those same 10 background rows, not for the test set.
shap_values = explainer.shap_values(background)

In [None]:
# Feature attributions for the first background row (the first row of X_train);
# shap_values was computed on the training background sample, not on the test set.
shap_values[0][0]

In [None]:
# NOTE(review): Xx and yx are not referenced anywhere else in this notebook; this looks
# like leftover exploration code and can probably be removed — confirm.
Xx,yx = shap.datasets.adult()

In [None]:
# Display the training feature frame for reference.
X_train

In [None]:
# Baseline (expected) value the explainer uses, converted from a tensor to a NumPy array.
explainer.expected_value.numpy()

In [None]:
# Load the JavaScript needed to render SHAP's interactive plots in the notebook.
shap.initjs()
# Force plot for background row 0: baseline value, that row's SHAP values, and its feature values.
shap.force_plot(explainer.expected_value[0].numpy(), shap_values[0][0,:], X_train.iloc[0,:])


In [None]:
# Force plot for background row 1. The feature values must come from the same row as the
# SHAP values; this cell previously paired row 1's attributions with row 0's features.
shap.force_plot(explainer.expected_value[0].numpy(), shap_values[0][1,:], X_train.iloc[1,:])

In [None]:
# Bar chart of the SHAP values of background row 1, one bar per feature column.
plt.xticks(rotation='vertical')
plt.bar(list(X_train.columns), list(shap_values[0][1,:]))

In [None]:
# Summary plot over the 10 background rows.
# NOTE(review): only the column names are passed as the second argument (no feature
# values) — presumably shap treats them as feature names; verify the rendered plot.
shap.summary_plot(shap_values[0], X_train.columns)

# Tensorflow Feature Columns

In [None]:
# Binary target for the feature-column model: 0 where class == 'e' (edible), 1 otherwise
# (poisonous). Added to df_raw so the raw category codes and the label travel together.
df_raw['target'] = np.where(df_raw['class']=='e', 0, 1)

In [None]:
# Three-way split of the raw frame: 20% test, then 20% of the remainder as validation.
# random_state is fixed, as in the earlier train/test split, so the row assignment (and
# every metric computed from it) is reproducible across kernel restarts.
train, test = train_test_split(df_raw, test_size=0.2, random_state=0)
train, val = train_test_split(train, test_size=0.2, random_state=0)
print(len(train), 'train examples')
print(len(val), 'validation examples')
print(len(test), 'test examples')

In [None]:
# A utility method to create a tf.data dataset from a Pandas Dataframe
def df_to_dataset(dataframe, shuffle=True, batch_size=32):
  """Convert a DataFrame with a 'target' column into a batched tf.data.Dataset.

  The input frame is copied, so the caller's DataFrame is left untouched. Each dataset
  element is a (features, label) pair, where features is a dict keyed by column name.

  Args:
    dataframe: Frame holding the feature columns plus a 'target' label column.
    shuffle: When true, shuffle with a buffer as large as the frame before batching.
    batch_size: Number of rows per batch.

  Returns:
    The batched tf.data.Dataset.
  """
  frame = dataframe.copy()
  target = frame.pop('target')
  dataset = tf.data.Dataset.from_tensor_slices((dict(frame), target))
  if not shuffle:
    return dataset.batch(batch_size)
  return dataset.shuffle(buffer_size=len(frame)).batch(batch_size)

In [None]:
# Demo datasets with a tiny batch so a single batch can be printed in the next cell.
# Only the training dataset is shuffled; validation and test keep their row order.
batch_size = 5 # A small batch sized is used for demonstration purposes
train_ds = df_to_dataset(train, batch_size=batch_size)
val_ds = df_to_dataset(val, shuffle=False, batch_size=batch_size)
test_ds = df_to_dataset(test, shuffle=False, batch_size=batch_size)

In [None]:
# Peek at one training batch: the feature dict's keys and the matching label tensor.
for feature_batch, label_batch in train_ds.take(1):
  print('Every feature:', list(feature_batch.keys()))
  print('A batch of targets:', label_batch )

In [None]:
from tensorflow import feature_column

In [None]:
# One-hot (indicator) feature column for each of the seven best chi2 features.
# The vocabulary of each column is the set of raw category codes seen in df_raw.
features_columns = []

for column_name in features[0:7]:
    print(column_name)
    vocabulary = df_raw[column_name].unique()
    categorical = feature_column.categorical_column_with_vocabulary_list(column_name, vocabulary)
    features_columns.append(feature_column.indicator_column(categorical))

In [None]:
# Keras input layer built from the indicator columns; used as the first layer of the model below.
feature_layer = tf.keras.layers.DenseFeatures(features_columns)

In [None]:
# Rebuild the three datasets with batch size 32 for training; these replace the
# batch-size-5 demo datasets created earlier under the same names.
batch_size = 32
train_ds = df_to_dataset(train, batch_size=batch_size)
val_ds = df_to_dataset(val, shuffle=False, batch_size=batch_size)
test_ds = df_to_dataset(test, shuffle=False, batch_size=batch_size)

In [None]:
# Binary classifier on the one-hot feature columns. The last layer has no activation, so
# it emits a raw logit; that is why the loss is configured with from_logits=True.
model = tf.keras.Sequential([
  feature_layer,
  layers.Dense(128, activation='relu'),
  layers.Dense(128, activation='relu'),
  layers.Dropout(.1),
  layers.Dense(1)
])

# The string metric 'accuracy' thresholds the model output at 0.5, which is only right
# for probabilities. The outputs here are logits, whose decision boundary is 0.0, so the
# metric is configured explicitly. It keeps the name 'accuracy' so logs are unchanged.
model.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              metrics=[tf.keras.metrics.BinaryAccuracy(name='accuracy', threshold=0.0)])

model.fit(train_ds,
          validation_data=val_ds,
          epochs=5);

In [None]:
# Evaluate on the test dataset; evaluate() returns the loss followed by the compiled metric.
loss, accuracy = model.evaluate(test_ds)
print("Accuracy", accuracy)

In [None]:
# The model emits logits, so the class boundary is 0 (sigmoid(0) == 0.5).
# Sequential.predict_classes compared the raw output against 0.5, which is the wrong
# cut-off for logits, and the method no longer exists in recent TensorFlow releases.
y_pred = (model.predict(test_ds) > 0).astype('int32').reshape(-1)
# test_ds is not shuffled, so labels gathered here line up with the predictions above.
y_true = tf.concat([labels for _, labels in test_ds], axis=0)
con_mat = tf.math.confusion_matrix(labels=y_true, predictions=y_pred).numpy()

In [None]:
# Heatmap of the confusion matrix; fmt="d" prints the cell counts as integers.
figure = plt.figure(figsize=(8, 8))
sns.heatmap(con_mat, annot=True, cmap=plt.cm.Blues, fmt="d")
plt.tight_layout()
# Rows are the true labels and columns the predicted labels.
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()

# t-SNE

In [None]:
from sklearn.preprocessing import StandardScaler
# Standardise the seven selected training features before running t-SNE on them.
X_std= StandardScaler().fit_transform(X_train_7)

In [None]:
from sklearn.manifold import TSNE
# 3-component t-SNE embedding of the standardised training features.
# The estimator gets its own name so the trained Keras `model` is not overwritten and
# the earlier evaluation cells can still be re-run afterwards.
tsne_3d = TSNE(n_components = 3, random_state = 0)
tsne_model = tsne_3d.fit_transform(X_std)

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from mpl_toolkits.mplot3d import Axes3D
%matplotlib inline

sns.set(style = "darkgrid")

fig = plt.figure()
# Create the 3-D axes through the figure: on recent matplotlib, Axes3D(fig) no longer
# attaches itself to the figure, which leaves the plot empty.
ax = fig.add_subplot(projection='3d')

# Dedicated names for the embedding coordinates, so the notebook-level feature matrix
# `x` and target `y` are not overwritten.
tsne_x = tsne_model[:,0]
tsne_y = tsne_model[:,1]
tsne_z = tsne_model[:,2]

ax.set_xlabel("Dimension 1")
ax.set_ylabel("Dimension 2")
ax.set_zlabel("Dimension 3")

# Colour each point by its training label.
ax.scatter(tsne_x, tsne_y, tsne_z, c=y_train, cmap='magma')

ax.view_init(60, 60)

In [None]:
# 2-component t-SNE embedding of the same standardised training features.
# The estimator gets its own name so the notebook-level `model` is not overwritten.
tsne_2d = TSNE(n_components = 2, random_state = 0, learning_rate=100)
tsne_model_test = tsne_2d.fit_transform(X_std)

In [None]:
# Dedicated names for the embedding coordinates, so the notebook-level feature matrix
# `x` and target `y` are not overwritten.
tsne_x = tsne_model_test[:,0]
tsne_y = tsne_model_test[:,1]

plt.xlabel("Dimension 1")
plt.ylabel("Dimension 2")

# Colour each point by its training label.
plt.scatter(tsne_x, tsne_y, c=y_train, cmap='magma');

Attribute Information: (classes: edible=e, poisonous=p)

cap-shape: bell=b,conical=c,convex=x,flat=f, knobbed=k,sunken=s

cap-surface: fibrous=f,grooves=g,scaly=y,smooth=s

cap-color: brown=n,buff=b,cinnamon=c,gray=g,green=r,pink=p,purple=u,red=e,white=w,yellow=y

bruises: bruises=t,no=f

odor: almond=a,anise=l,creosote=c,fishy=y,foul=f,musty=m,none=n,pungent=p,spicy=s

gill-attachment: attached=a,descending=d,free=f,notched=n

gill-spacing: close=c,crowded=w,distant=d

gill-size: broad=b,narrow=n

gill-color: black=k,brown=n,buff=b,chocolate=h,gray=g, green=r,orange=o,pink=p,purple=u,red=e,white=w,yellow=y

stalk-shape: enlarging=e,tapering=t

stalk-root: bulbous=b,club=c,cup=u,equal=e,rhizomorphs=z,rooted=r,missing=?

stalk-surface-above-ring: fibrous=f,scaly=y,silky=k,smooth=s

stalk-surface-below-ring: fibrous=f,scaly=y,silky=k,smooth=s

stalk-color-above-ring: brown=n,buff=b,cinnamon=c,gray=g,orange=o,pink=p,red=e,white=w,yellow=y

stalk-color-below-ring: brown=n,buff=b,cinnamon=c,gray=g,orange=o,pink=p,red=e,white=w,yellow=y

veil-type: partial=p,universal=u

veil-color: brown=n,orange=o,white=w,yellow=y

ring-number: none=n,one=o,two=t

ring-type: cobwebby=c,evanescent=e,flaring=f,large=l,none=n,pendant=p,sheathing=s,zone=z

spore-print-color: black=k,brown=n,buff=b,chocolate=h,green=r,orange=o,purple=u,white=w,yellow=y

population: abundant=a,clustered=c,numerous=n,scattered=s,several=v,solitary=y

habitat: grasses=g,leaves=l,meadows=m,paths=p,urban=u,waste=w,woods=d