# demo 1
Use the seed data set (`Seed_Data.csv`) to solve a classification problem with base and ensemble models. Treat **target** as the output variable:

1. write a cell to define base and ensemble classifiers 

2. make a dictionary of available classifiers 

3.  propose a validator to evaluate the result of each classifier 

4. visualize the deep insights using boxplot. 

5. validate the results and deploy the best model  


In [None]:
%matplotlib inline

import itertools
import numpy as np
from statistics import mean, stdev 

import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
 
# base classifiers 
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC 
 
# ensemble classifiers 
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier, StackingClassifier  
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import accuracy_score, recall_score,confusion_matrix

from mlxtend.plotting import plot_learning_curves
from mlxtend.plotting import plot_decision_regions
 

In [None]:
# Hold-out sanity check: fit Gaussian naive Bayes on a single 70/30 split and
# compare sklearn's micro-averaged recall with the same ratio computed by hand
# from the confusion matrix.
# NOTE(review): X and y are created by the data-loading cell further down the
# notebook; that cell has to run before this one on a fresh kernel.
# confusion_matrix comes from the public `sklearn.metrics` import in the
# imports cell, not from the private `sklearn.metrics._plot` module.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
model = GaussianNB()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)
re = recall_score(y_test, y_pred, average='micro')
print(re)
# Diagonal = correctly classified samples; the strictly lower / upper triangles
# are the two halves of the misclassified samples. FP was never assigned in the
# notebook (NameError on a fresh kernel); it is presumably meant to be the
# upper triangle, the counterpart of FN. With that definition TP + FN + FP is
# the total sample count, so the ratio equals the micro-averaged recall above.
# np.trace / np.tril / np.triu work for any number of classes.
TP = np.trace(cm)
FN = np.tril(cm, k=-1).sum()
FP = np.triu(cm, k=1).sum()
TP / (TP + FN + FP)

[[22  2  2]
 [ 0 14  0]
 [ 1  0 22]]
0.9206349206349206


0.9830508474576272

write a cell to define base and ensemble classifiers

In [None]:
# get a list of models to evaluate
def base_models():
  """Build the base (non-ensemble) classifiers to compare.

  Returns:
    dict mapping a short label to a newly constructed, unfitted estimator,
    in the order the models are meant to be evaluated.
  """
  return {
      'bayes': GaussianNB(),
      'logistic': LogisticRegression(),
      # two decision trees that differ only in the split criterion
      'dt_ent': DecisionTreeClassifier(criterion='entropy'),
      'dt_gini': DecisionTreeClassifier(),  # library default criterion
      # one support-vector classifier per kernel
      'svc_lin': SVC(kernel='linear'),
      'svc_sig': SVC(kernel='sigmoid'),
      'svc_poly': SVC(kernel='poly'),
      'svc_rbf': SVC(),  # library default kernel
  }

In [None]:

def validator(model, X, y):
  """Score `model` with 5-fold cross-validation on weighted recall.

  Args:
    model: estimator passed straight to `cross_val_score`.
    X: feature data.
    y: target labels.

  Returns:
    A one-element list holding the per-fold scores returned by
    `cross_val_score` (callers rely on this wrapping list).
  """
  fold_scores = cross_val_score(model, X, y, scoring="recall_weighted", cv=5)
  return [fold_scores]

In [None]:
from pandas import DataFrame, read_csv
# NOTE(review): '/content/...' is an absolute, environment-specific path;
# the CSV must exist at this location for the cell to run.
data=read_csv('/content/Seed_Data.csv')
# Split into the label column (`target`) and the remaining feature columns.
y=data['target']; X=data.drop('target',axis=1)
# Preview goes last: only the final expression of a cell is rendered, so the
# original mid-cell `data.head()` was computed and thrown away.
data.head()

In [None]:
# Score every base classifier with the k-fold validator and report mean (std).
import warnings
from numpy import mean, std

warnings.filterwarnings("ignore")

models = base_models()
# Collect the scores and the matching labels in parallel lists.
results, names = [], []
for name, model in models.items():
    scores = validator(model, X, y)
    names.append(name)
    results.append(scores)
    print('>%s %.3f (%.3f)' % (name, mean(scores), std(scores)))

>bayes 0.886 (0.103)
>logistic 0.933 (0.046)
>dt_ent 0.890 (0.070)
>dt_gini 0.881 (0.040)
>svc_lin 0.900 (0.108)
>svc_sig 0.000 (0.000)
>svc_poly 0.895 (0.081)
>svc_rbf 0.886 (0.111)


# conclusion
k-fold cross-validation gives only a handful of recall estimates (one per fold), so instead we use a Monte Carlo sampling technique: repeated random train/test splits.

In [None]:
# create the function to evaluate the models in terms of weighted recall on random train/test splits
import numpy as np

def evaluate_modelr(model, X, y, n_iter=100, test_size=0.3):
  """Estimate weighted recall by Monte Carlo (repeated random hold-out) validation.

  For each of `n_iter` rounds the data is split with `train_test_split`
  (seeded with the round index so runs are repeatable), `model` is refitted
  on the training part and its weighted recall on the test part is recorded.

  Args:
    model: estimator exposing `fit(X, y)` and `predict(X)`; it is refitted
      in place on every round.
    X: feature data.
    y: target labels.
    n_iter: number of random splits to average over (default 100).
    test_size: share of the data held out in each split (default 0.3).

  Returns:
    Mean of the per-split weighted recall scores.

  Raises:
    ValueError: if `n_iter` is smaller than 1.
  """
  if n_iter < 1:
    raise ValueError("n_iter must be at least 1")
  re = []
  for j in range(n_iter):
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=j)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    re.append(recall_score(y_test, y_pred, average='weighted'))
  # Average only after ALL rounds: the original returned from inside the loop,
  # so it reported the score of the first split alone.
  return np.mean(re)

In [None]:
# Score every base classifier with the Monte Carlo evaluator and print one
# recall figure per model.
import warnings
from numpy import mean, std

warnings.filterwarnings("ignore")

models = base_models()
print('recall score for different base models is:')
# Collect the scores and the matching labels in parallel lists.
results, names = [], []
for name, model in models.items():
    scores = evaluate_modelr(model, X, y)
    names.append(name)
    results.append(scores)
    print('>%s , (%.3f)' % (name, scores))

recall score for different base models is:
>bayes , (0.889)
>logistic , (0.889)
>dt_ent , (0.937)
>dt_gini , (0.921)
>svc_lin , (0.905)
>svc_sig , (0.270)
>svc_poly , (0.873)
>svc_rbf , (0.873)


In [None]:
# Display the collected model labels and their scores as the cell output.
names, results

In [None]:
# plot model performance for comparison
# Each entry of `results` is either a single number (evaluate_modelr) or a
# one-element list wrapping the per-fold array (validator). Passed as-is, both
# shapes make boxplot fail (one flat data set vs. 8 labels, or a 3-D input),
# so flatten every entry into its own 1-D sample first.
samples = [np.ravel(r) for r in results]
plt.boxplot(samples, labels=names, showmeans=True)
plt.ylabel('weighted recall')
plt.title('Model comparison')
plt.show()

In [None]:
# Re-evaluate every base model and print its scores as a small table.
from numpy import mean, std
import warnings; warnings.filterwarnings('ignore')

models=base_models()
results, names = list(), list()
for name, model in models.items():
    scores = evaluate_modelr(model, X, y)
    results.append(scores)
    names.append(name)
    # evaluate_modelr returns a single number, which DataFrame() rejects;
    # atleast_2d turns it into a 1x1 table and leaves a one-row list of fold
    # scores (the validator shape) as a 1xN table.
    print(DataFrame(np.atleast_2d(scores)))

In [None]:
# `scores` is whatever the last loop iteration left behind. Normalise it to
# 2-D so that both a single number and a one-row list of fold scores build a
# valid one-row table (a bare scalar makes DataFrame() raise).
df = DataFrame(np.atleast_2d(scores))
df

Unnamed: 0,0,1,2,3,4
0,0.97619,0.928571,0.928571,0.928571,0.666667


In [None]:
# get a stacking ensemble of models
def get_stacking():
    """Build an unfitted stacking ensemble.

    Level 0 holds three base learners (logistic regression, decision tree,
    linear-kernel SVC); level 1 is a logistic-regression meta learner.
    The stack is configured with cv=5.

    Returns:
        The configured StackingClassifier.
    """
    # level 0: (label, estimator) pairs
    level0 = [
        ('lr', LogisticRegression()),      # model A
        ('dt', DecisionTreeClassifier()),  # model B
        # NOTE(review): labelled 'nb' but the estimator is a linear SVC;
        # the label is kept because it is the estimator's lookup key.
        ('nb', SVC(kernel='linear')),      # model C
    ]
    # level 1: meta learner that combines the level-0 predictions
    level1 = LogisticRegression()
    return StackingClassifier(estimators=level0, final_estimator=level1, cv=5)

In [None]:
# get a list of models to evaluate

def en_models():
  """Build the ensemble classifiers to compare.

  Returns:
    dict mapping a short label to an unfitted ensemble:
      'bagc'     - bagging of 50 logistic regressions, each fitted on 80% of
                   the samples and 80% of the features,
      'boosting' - AdaBoost with 10 logistic-regression rounds,
      'stack'    - the stacking ensemble returned by get_stacking().
  """
  lr = LogisticRegression()
  # The wrapped estimator is passed positionally: the keyword was renamed from
  # `base_estimator` to `estimator` in newer scikit-learn releases, while the
  # first positional slot is the wrapped estimator under both names.
  return {
      'bagc': BaggingClassifier(lr, n_estimators=50, max_samples=0.8, max_features=0.8),
      'boosting': AdaBoostClassifier(lr, n_estimators=10),  # 10 steps
      'stack': get_stacking(),
  }

In [None]:
# Score every ensemble classifier with the k-fold validator and report mean (std).
import warnings
from numpy import mean, std

warnings.filterwarnings("ignore")

models = en_models()
# Collect the scores and the matching labels in parallel lists.
results, names = [], []
for name, model in models.items():
    scores = validator(model, X, y)
    names.append(name)
    results.append(scores)
    print('>%s %.3f (%.3f)' % (name, mean(scores), std(scores)))

>bagc 0.905 (0.088)
>boosting 0.890 (0.101)
>stack 0.881 (0.089)


# Demo 2

Apply base regression models to predict the close price from the other variables. For this purpose, redo the data flow above for the regression task.

In [None]:
from pandas_datareader import data
import matplotlib.pyplot as plt

In [None]:
!pip install yfinance

In [None]:
# Download the AAPL price history for a fixed date window via yfinance.
import yfinance as yahooFinance
import datetime
# start of the download window; change as needed
startDate = datetime.datetime(2019, 5, 31)
 
# end of the download window; change as needed
endDate = datetime.datetime(2021, 1, 30)
apl = yahooFinance.Ticker("AAPL")
 
# request the history between the two dates chosen above
# NOTE: this rebinds `data`, which earlier cells used for the seed data frame
# and for the pandas_datareader import.
data=apl.history(start=startDate,end=endDate)
# not the last expression of the cell, so this preview is not rendered
data.head()
# last expression: shown as the cell output
data.shape

(421, 7)