In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score,f1_score
from sklearn.linear_model import LogisticRegression as LR
from sklearn.ensemble import RandomForestClassifier as RF
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.ensemble import AdaBoostClassifier as ABC
%matplotlib inline

In [None]:
# Read the mushrooms CSV from the input directory and preview
# its first ten rows as the cell output.
data = pd.read_csv(filepath_or_buffer="../input/mushrooms.csv")
data.head(n=10)

In [None]:
# Replace every column (features and the 'class' target alike) with integer
# codes. The single encoder is re-fitted on each column, so once the loop
# finishes it only remembers the categories of the last column processed.
labelEncoder = preprocessing.LabelEncoder()
for col in data.columns:
    data[col] = labelEncoder.fit_transform(data[col])

# Splitting test train set, with 20% of the data as the validation set.
# random_state pins the shuffle so a fresh kernel reproduces the same split;
# stratify keeps the 'class' proportions the same in both parts.
train, test = train_test_split(
    data, test_size=0.2, random_state=0, stratify=data['class']
)

In [None]:
# Train set: 'class' is the target, every other column is a feature.
# drop(columns=...) removes exactly that one column; the previous substring
# test would also have discarded any feature whose name contains "class".
train_y = train['class']
train_x = train.drop(columns='class')
# Test/Validation set
test_y = test['class']
test_x = test.drop(columns='class')

# The randomized ensembles are seeded so repeated runs print the same scores.
models = [SVC(kernel='rbf', random_state=0), LR(), RF(random_state=0), LDA(), ABC(random_state=0)]
model_names = ['SVC_rbf', 'Logistic Regression', 'RandomForestClassifier', 'LinearDiscriminantAnalysis', 'AdaBoostClassifier']
for model_name, model in zip(model_names, models):
    model.fit(train_x, train_y)
    # Predict once and reuse the result for both metrics.
    predictions = model.predict(test_x)
    print('The accuracy of ' + model_name + ' is ' + str(accuracy_score(test_y, predictions)))
    print('The F1_score of ' + model_name + ' is ' + str(f1_score(test_y, predictions)))

**Test 1: Changing the ratio between the train and test sets**


**Hello, everyone: please change (data, test_size = 0.99) to (data, test_size = 0.95), (data, test_size = 0.9), or (data, test_size = 0.5) and compare the results.**

In [None]:
# Change test_size to 0.99, 0.95, 0.9 (or 0.5) to see how the scores react.
# random_state makes each setting reproducible, and stratify keeps the
# 'class' proportions the same in the (very small) training part.
train, test = train_test_split(
    data, test_size=0.99, random_state=0, stratify=data['class']
)

In [None]:
# Train set: 'class' is the target, every other column is a feature.
# drop(columns=...) removes exactly that one column; the previous substring
# test would also have discarded any feature whose name contains "class".
train_y = train['class']
train_x = train.drop(columns='class')
# Test/Validation set
test_y = test['class']
test_x = test.drop(columns='class')

# The randomized ensembles are seeded so repeated runs print the same scores.
models = [SVC(kernel='rbf', random_state=0), LR(), RF(random_state=0), LDA(), ABC(random_state=0)]
model_names = ['SVC_rbf', 'Logistic Regression', 'RandomForestClassifier', 'LinearDiscriminantAnalysis', 'AdaBoostClassifier']
for model_name, model in zip(model_names, models):
    model.fit(train_x, train_y)
    # Predict once and reuse the result for both metrics.
    predictions = model.predict(test_x)
    print('The accuracy of ' + model_name + ' is ' + str(accuracy_score(test_y, predictions)))
    print('The F1_score of ' + model_name + ' is ' + str(f1_score(test_y, predictions)))

**QUESTION: Which algorithm (SVC_rbf, Logistic Regression, RandomForestClassifier, LinearDiscriminantAnalysis, AdaBoostClassifier) is the most robust (i.e. reaches the highest accuracy and F1 score with the smallest training set), and which is the least robust?**


**Test 2: Changing the hyperparameter of each method**

In [None]:
# Back to an 80/20 split for the hyperparameter experiments.
# random_state pins the shuffle so score changes come from the
# hyperparameters being tried, not from a different random split.
train, test = train_test_split(
    data, test_size=0.2, random_state=0, stratify=data['class']
)

Hello, everyone: try changing LR(C=1.0) to LR(C=0.5) or LR(C=0.01); RF(n_estimators=100) to RF(n_estimators=1) or RF(n_estimators=10); LDA(solver='lsqr') to LDA(solver='svd') or LDA(solver='eigen'); and ABC(n_estimators=1000) to ABC(n_estimators=100) or ABC(n_estimators=10).


In [None]:
# Train set: 'class' is the target, every other column is a feature.
# drop(columns=...) removes exactly that one column; the previous substring
# test would also have discarded any feature whose name contains "class".
train_y = train['class']
train_x = train.drop(columns='class')
# Test/Validation set
test_y = test['class']
test_x = test.drop(columns='class')

# Hyperparameters to experiment with. The randomized ensembles are seeded so
# that a change in score reflects the hyperparameter, not random noise.
SVCm = SVC()
LRm = LR(C=1.0)  # try values between 0 and 1 (C must be strictly positive)
RFm = RF(n_estimators=100, random_state=0)  # try values between 1 and 100
LDAm = LDA(solver='lsqr')  # try 'svd', 'lsqr' or 'eigen'
ABCm = ABC(n_estimators=1000, random_state=0)  # try values between 1 and 1000
models = [SVCm, LRm, RFm, LDAm, ABCm]
model_names = ['SVC_rbf', 'Logistic Regression', 'RandomForestClassifier', 'LinearDiscriminantAnalysis', 'AdaBoostClassifier']

# Collect one row of metrics per model; the lists stay aligned by position.
resultdict = dict(modelname=[], accuracy=[], f1=[])
for model_name, model in zip(model_names, models):
    model.fit(train_x, train_y)
    # Predict once and reuse the result for both metrics.
    predictions = model.predict(test_x)
    resultdict['modelname'].append(model_name)
    resultdict['accuracy'].append(accuracy_score(test_y, predictions))
    resultdict['f1'].append(f1_score(test_y, predictions))

In [None]:
# Turn the collected metric lists into a table: one row per model,
# one column per key of resultdict.
resultdict1 = pd.DataFrame(resultdict)

In [None]:
# One bar chart per metric column, with the model names on the x axis.
# With subplots=True pandas returns one Axes per plotted column.
plot = resultdict1.plot.bar(x='modelname', rot=15, subplots=True)
for ax in plot:
    ax.set_ylabel('score')
plt.tight_layout()
# Render the figure explicitly instead of echoing the raw Axes array repr.
plt.show()

Question 2: When you change those parameters, do you see any difference in the results? If not, what do you think is happening?