# In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
# Load the mushroom classification CSV from the Kaggle input directory.
data = pd.read_csv('/kaggle/input/mushroom-classification/mushrooms.csv')
# Print the column dtypes explicitly: a bare `data.dtypes` expression is only
# rendered when it is the last statement of a notebook cell, and it produces
# no output at all when this file runs as a script.
print(data.dtypes)

import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score

# Histogram of the raw 'class' column: how many rows fall into each class.
class_labels = data['class']
plt.hist(class_labels)
plt.title('class_distribution', color='white')
# White tick colours on both axes (presumably chosen for a dark notebook
# theme -- they will be hard to see on a white background).
for axis_name in ('x', 'y'):
    plt.tick_params(axis=axis_name, colors='white')
plt.grid()
plt.show()

# Number of distinct values per column, largest first.  `DataFrame.nunique`
# is the built-in pandas replacement for applying `len(np.unique(col))` to
# every column; `dropna=False` keeps a missing value counted as a value of
# its own instead of being dropped.
# The result is printed because a bare expression shows nothing when this
# file runs as a script.
print(data.nunique(dropna=False).sort_values(ascending=False))



# Rename the label column in place so the rest of the script can refer to it
# as 'target'.
data.rename(columns={'class': 'target'}, inplace=True)

# Select the features by name rather than by position, so the feature matrix
# does not silently depend on the label being the first column of the CSV.
X, y = data.drop(columns='target'), data['target']

le = LabelEncoder()

# Integer-encode every feature column.  `apply` calls `le.fit_transform` once
# per column, refitting `le` each time, so after this line `le` only holds the
# classes of the last column processed.
# NOTE(review): scikit-learn documents LabelEncoder as an encoder for target
# values; OrdinalEncoder is the feature-side equivalent if the per-column
# mappings ever need to be kept or reused on new data.
X = X.apply(le.fit_transform)
# The target gets its own, separate encoder.
y = LabelEncoder().fit_transform(y)

# Hold out 33% of the rows for testing; `stratify=y` keeps the class
# proportions the same in both splits and `random_state` fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=1, stratify=y
)


# Single-step pipeline wrapping a random forest.  The forest is seeded with
# the same seed used for the train/test split so that repeated runs build the
# same model and report the same scores.
pipeline = Pipeline([
    ('clf', RandomForestClassifier(random_state=1)),
])

pipeline.fit(X_train, y_train)
# Report the hold-out accuracy instead of computing it and discarding it.
print(f'test accuracy: {pipeline.score(X_test, y_test):.4f}')

# 10-fold cross-validated accuracy on the training split; each fold refits a
# fresh clone of the pipeline.
scores = cross_val_score(pipeline, X_train, y_train, cv=10, scoring='accuracy')
print(f'cv accuracy: {scores.mean():.4f} +/- {scores.std():.4f}')

# True only when every single test row is classified correctly.
print('all test predictions correct:',
      (pipeline.predict(X_test) == y_test).all())

from sklearn.model_selection import learning_curve

# Learning curve: fit the pipeline on ten growing fractions (10% .. 100%) of
# the training split and score every fit with 10-fold cross-validation.
train_sizes, train_scores, test_scores = learning_curve(
    estimator=pipeline,
    X=X_train,
    y=y_train,
    train_sizes=np.linspace(0.1, 1.0, 10),
    cv=10,
    n_jobs=1,
)

# Collapse the per-fold scores (axis 1) into a mean and a standard deviation
# for each training-set size.
train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)

# Training accuracy: solid blue line with a +/- one standard deviation band.
plt.plot(
    train_sizes, train_mean,
    color='blue', marker='o', markersize=5,
    label='training_accuracy',
)
plt.fill_between(
    train_sizes,
    train_mean + train_std,
    train_mean - train_std,
    alpha=0.15, color='blue',
)

# Validation accuracy: dashed green line with its own +/- one std band.
plt.plot(
    train_sizes, test_mean,
    color='green', linestyle='--', marker='s', markersize=5,
    label='validation accuracy',
)
plt.fill_between(
    train_sizes,
    test_mean + test_std,
    test_mean - test_std,
    alpha=0.15, color='green',
)

plt.grid()
# Axis label spelling corrected ("trainning" -> "training").
plt.xlabel('Number of training samples')
plt.ylabel('Accuracy')
plt.legend(loc='lower right')
# Upper limit above 1.0 leaves headroom so a curve at 100% accuracy is not
# drawn on the frame of the plot.
plt.ylim([0.8, 1.1])
plt.show()

from sklearn.metrics import confusion_matrix,classification_report
# Predict once and reuse the result for both reports instead of running the
# fitted pipeline over the test set twice.
y_pred = pipeline.predict(X_test)

# Rows are the true classes, columns the predicted classes.
print(confusion_matrix(y_test, y_pred))

# Per-class precision, recall, F1 and support.
print(classification_report(y_test, y_pred))