In [13]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pylab import savefig
from scipy.io import arff
from google.colab import drive
drive.mount('/content/gdrive')
import ntpath
import glob
import os
import math
# !pip install liac-arff
#import arff

from sklearn.model_selection import train_test_split
from random import randrange
from itertools import combinations as comb
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
import string
import re
import nltk
nltk.download('stopwords')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [14]:
# Both patterns are constant, so they are compiled once here instead of on
# every call (preprocess runs once per document when it is used as a
# CountVectorizer analyzer).
#
# NOTE: several of the ranges below are far wider than the emoji blocks
# (e.g. U+0430-U+0648 and U+263A-U+FE0F), so text in many non-Latin scripts
# is removed together with the emoji.
_EMOJI_PATTERN = re.compile("["
                            u"\U0001F600-\U0001F64F"
                            u"\U0001F300-\U0001F5FF"
                            u"\U0001F680-\U0001F6FF"
                            u"\U0001F1E0-\U0001F1FF"
                            u"\U00000430-\U00000648"
                            u"\U0000263a-\U0000fe0f"
                            u"\U0000201c-\U0000201d"
                            u"\U00002000-\U000020e3"
                            u"\U0000064a-\U0000064d"
                            u"\U0000ff00-\U0000ff09"
                            u"\U000fe520-\U000fe529"
                            u"\U0000221b-\U0000221e"
                            u"\U000feb90-\U000feb99"
                            u"\U00001d20-\U00001d4c"
                            u"\U00000080-\U000000FF"
                            u"\U00000100-\U00000139"
                            u"\U0000FF80-\U0001007F""]+", flags=re.UNICODE)

# Raw string: the original non-raw literal relied on the invalid escapes "\("
# and "\)", which Python warns about. The regex itself is unchanged.
# NOTE(review): "http?" makes the "p" optional, so any token containing "htt"
# followed by at least one more character is cut from there - confirm this is intended.
_HTTP_PATTERN = re.compile(
    r'http?(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')

_CUSTOM_STOPWORDS = None  # filled lazily by _get_custom_stopwords()


def _get_custom_stopwords():
    """Return the NLTK English stop words plus 'rt' as a frozenset, loaded once."""
    global _CUSTOM_STOPWORDS
    if _CUSTOM_STOPWORDS is None:
        _CUSTOM_STOPWORDS = frozenset(nltk.corpus.stopwords.words('english')) | {'rt'}
    return _CUSTOM_STOPWORDS


def preprocess(listoftext):
    """Tokenise one raw text document for use as a CountVectorizer analyzer.

    Steps, in order:
      1. drop every character listed in ``string.punctuation``;
      2. remove all character runs matched by the emoji pattern;
      3. remove URL-like tokens (punctuation is already gone at this point,
         so a link has collapsed into one run such as ``httptcoabc``);
      4. split on whitespace and drop tokens whose lower-cased form is an
         NLTK English stop word or ``'rt'``.

    Parameters
    ----------
    listoftext : str
        A single document. Despite the name, it is iterated character by
        character, so it is expected to be one string.

    Returns
    -------
    list of str
        The remaining tokens, in their original case and order.
    """
    punctuationremoved = ''.join(
        char for char in listoftext if char not in string.punctuation)

    advancedremoved = _EMOJI_PATTERN.sub(r'', punctuationremoved)
    advancedremoved = _HTTP_PATTERN.sub(r'', advancedremoved)

    # Set lookup instead of scanning a freshly loaded list for every token.
    customstopwords = _get_custom_stopwords()
    return [word for word in advancedremoved.split()
            if word.lower() not in customstopwords]

# <h1> Datasets</h1>




In [17]:
# Load every ARFF file found in the folder into `datasets`, keyed by file name
# (without extension). Each value is a (feature DataFrame, label Series) pair,
# where the last ARFF attribute is taken as the class label.
ds_folderpath = "gdrive/MyDrive/ImprovedSpace/arfffiles/"
ds_files = glob.glob(ds_folderpath + "/*.arff")
datasets = dict()
for file_path in ds_files:
  filename = os.path.splitext(ntpath.basename(file_path))[0]
  if filename in ("cnae", "cnae_utf"):
    # The cnae dataset is not UTF-encoded, so a UTF-encoded copy with the same
    # content was saved as cnae_utf.arff. Both files are skipped here.
    continue

  filedata, meta = arff.loadarff(file_path)
  attrNames = meta.names()
  attrnames, classname = attrNames[:-1], attrNames[-1]

  data = pd.DataFrame(filedata)
  samples, labels = data[attrnames], data[classname]
  datasets[filename] = (samples, labels)


In [18]:
# Add the two tweet corpora to `datasets`. Unlike the ARFF entries these are
# stored as a single DataFrame with 'text' and 'label' columns.
# NOTE(review): `names=` is passed, so the first CSV row is read as data -
# confirm the files have no header row.
ds_folderpath = "gdrive/MyDrive/ImprovedSpace/"
ds_files = glob.glob(ds_folderpath + "/")

for tweet_key, tweet_csv in (("iphonetweets", 'iphonetweets.csv'),
                             ("hobbittweets", 'Hobbittweets.csv')):
  datasets[tweet_key] = pd.read_csv(ds_folderpath + tweet_csv, sep=',',
                                    names=['text', 'label'],
                                    usecols=['text', 'label'])

Functions 
----------------

In [5]:


def x2fx(x, model='quadratic'):
    """Expand a predictor matrix into the design matrix of a polynomial model.

    Column order of the result is: constant term, the original columns,
    pairwise products ``x[:, i] * x[:, j]`` for ``i < j``, then the squared
    columns - each group present only if the chosen model includes it.

    Parameters
    ----------
    x : array-like of shape (n_samples, n_features) or (n_samples,)
        Predictor values. A 1-D input is treated as a single column.
    model : {'linear', 'interaction', 'quadratic', 'purequadratic'}
        'linear'        -> constant + linear terms
        'interaction'   -> constant + linear + pairwise products
        'quadratic'     -> constant + linear + pairwise products + squares
        'purequadratic' -> constant + linear + squares

    Returns
    -------
    numpy.ndarray of shape (n_samples, n_terms)

    Raises
    ------
    ValueError
        If `model` is not one of the four supported names (previously the
        function silently returned None).
    """
    if model not in ('linear', 'interaction', 'quadratic', 'purequadratic'):
        raise ValueError("unknown model %r; expected 'linear', 'interaction', "
                         "'quadratic' or 'purequadratic'" % (model,))

    x = np.asarray(x)
    if x.ndim == 1:
        x = x[:, np.newaxis]
    n_samples, n_features = x.shape

    linear = np.c_[np.ones(n_samples), x]
    if model == 'linear':
        return linear
    if model == 'purequadratic':
        return np.c_[linear, x**2]

    pairs = list(comb(range(n_features), 2))
    if pairs:
        interaction = np.array([x[:, i] * x[:, j] for i, j in pairs]).T
    else:
        # A single column has no pairs. np.array([]).T would have shape (0,)
        # and break the concatenation below, so use an explicit zero-width block.
        interaction = np.empty((n_samples, 0))

    if model == 'interaction':
        return np.c_[linear, interaction]
    return np.c_[linear, interaction, x**2]

def generate_imp_space(X_train, Y_train, X_test, imp_feature_size, foz):
  """Append randomly generated one-vs-rest regression features to the data.

  The procedure is repeated ``imp_feature_size * foz`` times:
    * the feature columns are shuffled and cut into consecutive groups of
      `foz` columns (leftover columns that do not fill a group are ignored);
    * for each group one class is drawn at random, the training rows are
      labelled +1 (that class) / -1 (all other classes), and a least-squares
      fit of those labels on the quadratic expansion (`x2fx`) of the group
      is computed;
    * the fitted values for the training rows and for the test rows become
      one new column of the respective output matrix.

  Parameters
  ----------
  X_train, X_test : DataFrame or array-like of shape (n_samples, d)
      Original features. Rows of `X_train` correspond to `Y_train` by
      position (index labels are not used).
  Y_train : Series or array-like of shape (n_train,)
      Class labels of any hashable type; the set of classes is taken from
      the values actually present.
  imp_feature_size : int
      Multiplier for the number of shuffling passes.
  foz : int
      Number of original columns per group.

  Returns
  -------
  (imp_train_data, imp_test_data) : tuple of numpy.ndarray
      The original columns followed by
      ``imp_feature_size * foz * (d // foz)`` generated columns.

  Notes
  -----
  Randomness comes from the global ``np.random`` state; seed it beforehand
  for reproducible output.
  """
  train_values = np.asarray(X_train)
  test_values = np.asarray(X_test)
  y_values = np.asarray(Y_train).ravel()
  # Use the labels that actually occur instead of assuming they are the
  # strings "0".."k-1" and instead of reading a global class count.
  classes = pd.unique(y_values)
  d = train_values.shape[1]

  # Collect the blocks and concatenate once at the end rather than
  # re-copying the growing matrices on every iteration.
  train_blocks = [train_values]
  test_blocks = [test_values]
  for _pass in range(0, imp_feature_size * foz):
    Xindis = np.random.permutation(d)
    for j in range(0, d - (foz - 1), foz):  # runs d // foz times
      cols = Xindis[j:j + foz]
      s1 = classes[np.random.permutation(len(classes))[0]]

      # One-vs-rest target, built positionally so that differing index
      # labels on X_train and Y_train cannot misalign the rows.
      Wlabel = np.where(y_values == s1, 1, -1).reshape(-1, 1)

      WW = x2fx(train_values[:, cols])
      # Least-squares weights via the normal equations (pseudo-inverse form).
      W = np.matmul(np.matmul(np.linalg.pinv(np.matmul(WW.T, WW)), WW.T), Wlabel)

      train_blocks.append(np.matmul(WW, W))
      test_blocks.append(np.matmul(x2fx(test_values[:, cols]), W))

  imp_train_data = np.concatenate(train_blocks, axis=1)
  imp_test_data = np.concatenate(test_blocks, axis=1)
  return imp_train_data, imp_test_data

# Create a random subsample from the dataset with replacement

def subsample(X, X_imp, Y, ratio):
    """Draw a bootstrap sample (with replacement) from three parallel sequences.

    ``round(len(X) * ratio)`` positions are drawn uniformly with `randrange`
    and the same positions are taken from `X`, `X_imp` and `Y`.

    Returns
    -------
    tuple of numpy.ndarray
        ``(X rows, X_imp rows, Y labels)`` for the drawn positions.
    """
    n_sample = round(len(X) * ratio)
    picked = [randrange(len(X)) for _ in range(n_sample)]
    return (np.array([X[pos] for pos in picked]),
            np.array([X_imp[pos] for pos in picked]),
            np.array([Y[pos] for pos in picked]))


def MajorityVoting(votes):
  """Return the most frequent vote in each column of a 2-D vote matrix.

  `votes` has one row per voter and one column per sample. Ties go to the
  smallest value, because `np.unique` returns sorted values and `argmax`
  picks the first maximum.
  """
  def winner(column):
    candidates, tally = np.unique(column, return_counts=True)
    return candidates[tally.argmax()]

  return np.array([winner(votes[:, col]) for col in range(votes.shape[1])])

Random Forest
----------------------------

In [None]:
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.model_selection import RepeatedKFold
from prettytable import PrettyTable

# Random Forest experiment: for every dataset, run 5x repeated 2-fold CV and
# compare an ensemble of single-tree forests trained on the improved space
# against the same ensemble trained on the original features.
# Results go to `pt_RF` (one row per fold) and `foldPreds_RF` (per-dataset lists).
# NOTE: the classifiers are unseeded, so accuracies differ between runs.
foz=4
imp_feature_size=1
n_estimators = 10

pt_RF = PrettyTable()
pt_RF.field_names = ["Dataset", "Imp_RF", "RF"]

foldPreds_RF = {}
for ds in datasets:
  rkf = RepeatedKFold(n_splits=2, n_repeats=5, random_state=2652124)
  is_text = ds in ("iphonetweets", "hobbittweets")
  if is_text:
    df = datasets[ds]
    X = df['text']
    Y = df['label']
  else:
    X,Y = datasets[ds]
    Y = Y.str.decode("utf-8")  # ARFF nominal labels are loaded as bytes

  print(ds)
  accuracies = []
  accuracies_imp = []
  for train_index, test_index in rkf.split(X):
    X_train = X.iloc[train_index]
    X_test = X.iloc[test_index]
    Y_train = Y.iloc[train_index]
    Y_test = Y.iloc[test_index]

    if is_text:
      vectorizer = CountVectorizer(analyzer=preprocess)
      vectorizer.fit(X_train)

      # Keep the fold's index labels on the vectorised frames. A fresh
      # RangeIndex would no longer line up with Y_train/Y_test, and
      # index-based row selection downstream would pick the wrong rows.
      X_train = pd.DataFrame(vectorizer.transform(X_train).toarray(), index=Y_train.index)
      X_test = pd.DataFrame(vectorizer.transform(X_test).toarray(), index=Y_test.index)

    d = len(X_train.columns)
    # Kept as a module-level name: helper code in this notebook may read
    # `num_class` from the global namespace.
    num_class = len(Y_train.value_counts())

    # Meta-learner parameter: number of features tried per split.
    # Clamped to [1, d] because 2*round(log2(d)) is 0 for d == 1 and can
    # exceed d for small d, both of which are invalid `max_features` values.
    sel_d = min(d, max(1, 2 * round(math.log2(d))))

    imp_tree_predicts = []
    tree_predicts = []

    for est_idx in range(0, n_estimators):
      # A new random improved space is generated for every ensemble member.
      imp_tr, imp_ts = generate_imp_space(X_train, Y_train, X_test, imp_feature_size, foz)
      imp_d = imp_tr.shape[1]
      imp_sel_d = min(imp_d, max(1, 2 * round(math.log2(imp_d))))

      imp_tree = RandomForestClassifier(max_features=imp_sel_d, n_estimators=1)
      imp_tree.fit(imp_tr, Y_train)
      imp_tree_predicts.append(imp_tree.predict(imp_ts))

      tree = RandomForestClassifier(max_features=sel_d, n_estimators=1)
      tree.fit(X_train, Y_train)
      tree_predicts.append(tree.predict(X_test))

    results_imp = MajorityVoting(np.array(imp_tree_predicts))
    results = MajorityVoting(np.array(tree_predicts))

    # Compute each accuracy once and reuse it for the list and the table row.
    acc = accuracy_score(Y_test.values, results)
    acc_imp = accuracy_score(Y_test.values, results_imp)
    accuracies.append(acc)
    accuracies_imp.append(acc_imp)

    pt_RF.add_row((ds, "%.4f" % acc_imp,  "%.4f" % acc))
  foldPreds_RF[ds] = accuracies
  foldPreds_RF[ds+"_imp"] = accuracies_imp

In [None]:
# Show the Random Forest results table (one row per CV fold per dataset).
print(pt_RF) 

In [None]:
# Persist the per-fold accuracies. `foldPreds_RF` is a dict, so np.save stores
# it as a pickled 0-d object array; reload with
# np.load(path, allow_pickle=True).item().
np.save(ds_folderpath + "/" + 'RF.npy',foldPreds_RF ) 

Bagging

In [None]:
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import RepeatedKFold
from prettytable import PrettyTable

# Bagging experiment: for every dataset, run 5x repeated 2-fold CV and compare
# an ensemble of single-estimator bagging classifiers trained on the improved
# space against the same ensemble trained on the original features.
# Results go to `pt_Bagging` (one row per fold) and `foldPreds_Bagging`.
# NOTE: the classifiers are unseeded, so accuracies differ between runs.
foz=4
imp_feature_size=1
n_estimators = 10

pt_Bagging = PrettyTable()
pt_Bagging.field_names = ["Dataset", "Imp_Bagging", "Bagging"]

foldPreds_Bagging = {}
for ds in datasets:
  rkf = RepeatedKFold(n_splits=2, n_repeats=5, random_state=2652124)
  is_text = ds in ("iphonetweets", "hobbittweets")
  if is_text:
    df = datasets[ds]
    X = df['text']
    Y = df['label']
  else:
    X,Y = datasets[ds]
    Y = Y.str.decode("utf-8")  # ARFF nominal labels are loaded as bytes

  print(ds)
  accuracies = []
  accuracies_imp = []
  for train_index, test_index in rkf.split(X):
    X_train = X.iloc[train_index]
    X_test = X.iloc[test_index]
    Y_train = Y.iloc[train_index]
    Y_test = Y.iloc[test_index]

    if is_text:
      vectorizer = CountVectorizer(analyzer=preprocess)
      vectorizer.fit(X_train)

      # Keep the fold's index labels on the vectorised frames. A fresh
      # RangeIndex would no longer line up with Y_train/Y_test, and
      # index-based row selection downstream would pick the wrong rows.
      X_train = pd.DataFrame(vectorizer.transform(X_train).toarray(), index=Y_train.index)
      X_test = pd.DataFrame(vectorizer.transform(X_test).toarray(), index=Y_test.index)

    d = len(X_train.columns)
    # Kept as a module-level name: helper code in this notebook may read
    # `num_class` from the global namespace.
    num_class = len(Y_train.value_counts())

    imp_tree_predicts = []
    tree_predicts = []

    for est_idx in range(0, n_estimators):
      # A new random improved space is generated for every ensemble member.
      imp_tr, imp_ts = generate_imp_space(X_train, Y_train, X_test, imp_feature_size, foz)

      imp_tree = BaggingClassifier(n_estimators=1)
      imp_tree.fit(imp_tr, Y_train)
      imp_tree_predicts.append(imp_tree.predict(imp_ts))

      tree = BaggingClassifier(n_estimators=1)
      tree.fit(X_train, Y_train)
      tree_predicts.append(tree.predict(X_test))

    results_imp = MajorityVoting(np.array(imp_tree_predicts))
    results = MajorityVoting(np.array(tree_predicts))

    # Compute each accuracy once and reuse it for the list and the table row.
    acc = accuracy_score(Y_test.values, results)
    acc_imp = accuracy_score(Y_test.values, results_imp)
    accuracies.append(acc)
    accuracies_imp.append(acc_imp)

    pt_Bagging.add_row((ds, "%.4f" % acc_imp,  "%.4f" % acc))

  foldPreds_Bagging[ds] = accuracies
  foldPreds_Bagging[ds+"_imp"] = accuracies_imp

In [None]:
# Show the Bagging results table (one row per CV fold per dataset).
print(pt_Bagging) 

In [12]:
# Persist the per-fold accuracies. `foldPreds_Bagging` is a dict, so np.save
# stores it as a pickled 0-d object array; reload with
# np.load(path, allow_pickle=True).item().
np.save(ds_folderpath + "/" + 'bagging.npy',foldPreds_Bagging ) 

Sklearn AdaBoost Functions
------------------------------

In [6]:
from abc import ABCMeta, abstractmethod
import numpy as np

from scipy.special import xlogy

from sklearn.ensemble import BaseEnsemble
from sklearn.base import ClassifierMixin, RegressorMixin, is_classifier, is_regressor

from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.utils import check_array, check_random_state, _safe_indexing
from sklearn.utils.extmath import softmax, stable_cumsum
from sklearn.metrics import accuracy_score, r2_score
from sklearn.utils.validation import check_is_fitted, _check_sample_weight,has_fit_parameter,_num_samples, _deprecate_positional_args

# __all__ = [
#     'AdaBoostClassifier',
#     'AdaBoostRegressor',
# ]

class BaseWeightBoosting_IMP(BaseEnsemble, metaclass=ABCMeta):
    """Base class for AdaBoost-style ensembles trained on "improved" spaces.

    Adapted from scikit-learn's weight-boosting base class. The difference is
    in `fit`: before every boosting iteration a new random improved feature
    space is generated with `generate_imp_space` for both the training data
    and a test set supplied up front, and the weak learner of that iteration
    is trained on (and evaluated against) those matrices.

    Because the improved space is regenerated randomly for every estimator
    and its projection weights are not stored, subclasses cache their test-set
    outputs during `fit` instead of transforming new data at predict time.
    """

    # Arguments forwarded to generate_imp_space() at every boosting iteration.
    # They default to the values that were previously hard-coded in fit() and
    # can be overridden on a subclass or on an instance.
    imp_feature_size = 1
    foz = 4

    @abstractmethod
    def __init__(self,
                 base_estimator=None, *,
                 n_estimators=50,
                 estimator_params=tuple(),
                 learning_rate=1.,
                 random_state=None):

        super().__init__(
            base_estimator=base_estimator,
            n_estimators=n_estimators,
            estimator_params=estimator_params)

        self.learning_rate = learning_rate
        self.random_state = random_state

    def _check_X(self, X):
        """Validate `X` as a 2-D (optionally sparse CSR/CSC) array of any dtype."""
        return check_array(X, accept_sparse=['csr', 'csc'], ensure_2d=True,
                           allow_nd=True, dtype=None)

    def fit(self, X, X_test, y, sample_weight=None):
        """Fit the boosted ensemble, generating a new improved space per estimator.

        Parameters
        ----------
        X : training features, passed unchanged to `generate_imp_space`.
        X_test : test features, transformed alongside `X` in every iteration
            so that subclasses can cache test-set outputs.
        y : training labels, one per row of `X`.
        sample_weight : array-like of shape (n_samples,), optional
            Initial sample weights; uniform when None. Normalised to sum to 1.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If `learning_rate` is not positive or a sample weight is negative.
        """
        # Check parameters
        if self.learning_rate <= 0:
            raise ValueError("learning_rate must be greater than zero")

        sample_weight = _check_sample_weight(sample_weight, X, np.float64)
        sample_weight /= sample_weight.sum()
        if np.any(sample_weight < 0):
            raise ValueError("sample_weight cannot contain negative weights")

        # Check parameters
        self._validate_estimator()

        # Clear any previous fit results
        self.estimators_ = []
        self.estimator_weights_ = np.zeros(self.n_estimators, dtype=np.float64)
        self.estimator_errors_ = np.ones(self.n_estimators, dtype=np.float64)

        # Initialization of the random number instance that will be used to
        # generate a seed at each iteration
        random_state = check_random_state(self.random_state)

        # The boosting steps index labels positionally (y[:, np.newaxis]),
        # which a pandas Series does not support; hand them a plain array.
        # generate_imp_space still receives the caller's original objects.
        y_array = np.asarray(y)

        for iboost in range(self.n_estimators):
            # Build a fresh improved space before each individual estimator.
            imp_tr, imp_ts = generate_imp_space(
                X, y, X_test,
                imp_feature_size=self.imp_feature_size, foz=self.foz)

            # Boosting step
            sample_weight, estimator_weight, estimator_error = self._boost(
                iboost,
                imp_tr, imp_ts, y_array,
                sample_weight,
                random_state)

            # Early termination
            if sample_weight is None:
                break

            self.estimator_weights_[iboost] = estimator_weight
            self.estimator_errors_[iboost] = estimator_error

            # Stop if error is zero
            if estimator_error == 0:
                break

            sample_weight_sum = np.sum(sample_weight)

            # Stop if the sum of sample weights has become non-positive
            if sample_weight_sum <= 0:
                break

            if iboost < self.n_estimators - 1:
                # Normalize
                sample_weight /= sample_weight_sum

        return self

    @abstractmethod
    def _boost(self, iboost, X, X_test, y, sample_weight, random_state):
        """Run one boosting iteration; implemented by subclasses.

        `X` and `X_test` are the improved-space training and test matrices of
        this iteration. Must return
        ``(sample_weight, estimator_weight, estimator_error)``, or
        ``(None, None, None)`` to stop boosting early.
        """
        pass

    def staged_score(self, X, y, sample_weight=None):
        """Yield the score after each boosting stage.

        Accuracy for classifiers, R^2 otherwise, computed from
        `staged_predict(X)`.
        """
        X = self._check_X(X)

        for y_pred in self.staged_predict(X):
            if is_classifier(self):
                yield accuracy_score(y, y_pred, sample_weight=sample_weight)
            else:
                yield r2_score(y, y_pred, sample_weight=sample_weight)

    @property
    def feature_importances_(self):
        """Weighted average of the fitted estimators' feature importances.

        Raises ValueError before fitting and AttributeError if the base
        estimator exposes no `feature_importances_`.
        """
        if self.estimators_ is None or len(self.estimators_) == 0:
            raise ValueError("Estimator not fitted, "
                             "call `fit` before `feature_importances_`.")

        try:
            norm = self.estimator_weights_.sum()
            return (sum(weight * clf.feature_importances_ for weight, clf
                    in zip(self.estimator_weights_, self.estimators_))
                    / norm)

        except AttributeError as e:
            raise AttributeError(
                "Unable to compute feature importances "
                "since base_estimator does not have a "
                "feature_importances_ attribute") from e


    def _samme_proba(self, estimator, n_classes, X):
        """Return the SAMME.R per-class contribution of one estimator on `X`.

        Computes ``(K - 1) * (log p - mean_k(log p))`` from the estimator's
        predicted class probabilities, with ``K = n_classes``.
        """
        proba = estimator.predict_proba(X)

        # Displace zero probabilities so the log is defined.
        # Also fix negative elements which may occur with
        # negative sample weights.
        np.clip(proba, np.finfo(proba.dtype).eps, None, out=proba)
        log_proba = np.log(proba)

        return (n_classes - 1) * (log_proba - (1. / n_classes)
                                  * log_proba.sum(axis=1)[:, np.newaxis])


################################################  ADABOOST  ########################################################################

class AdaBoostClassifier_IMP(ClassifierMixin, BaseWeightBoosting_IMP):
    """AdaBoost classifier whose weak learners are trained on improved spaces.

    Adapted from scikit-learn's AdaBoostClassifier. During `fit` every weak
    learner is trained on its own randomly generated improved space, and its
    output for the test set (`X_test`) is cached in `testpredictions`.
    Call `predict()` with no argument to combine those cached outputs.

    The estimators were fitted on improved-space matrices whose projection
    weights are not kept, so the `X`-taking methods (`predict(X)`,
    `decision_function(X)`, `predict_proba(X)`, the `staged_*` methods) only
    work if `X` already has the column layout the estimators were trained on.

    Parameters
    ----------
    base_estimator : estimator, optional
        Weak learner; a depth-1 decision tree when None.
    n_estimators : int, default 50
    learning_rate : float, default 1.0
    algorithm : {'SAMME', 'SAMME.R'}, default 'SAMME.R'
    testpredictions : list or None, default None
        Storage for the cached test outputs; always reset to an empty list
        at the start of `fit`.
    random_state : int, RandomState instance or None
    """

    @_deprecate_positional_args
    def __init__(self,
                 base_estimator=None, *,
                 n_estimators=50,
                 learning_rate=1.,
                 algorithm='SAMME.R',
                 testpredictions=None,
                 random_state=None):

        super().__init__(
            base_estimator=base_estimator,
            n_estimators=n_estimators,
            learning_rate=learning_rate,
            random_state=random_state)

        self.algorithm = algorithm
        # Stored as given (no shared mutable default); fit() replaces it.
        self.testpredictions = testpredictions

    def fit(self, X, X_test, y, sample_weight=None):
        """Fit the ensemble and cache each estimator's output for `X_test`.

        Raises ValueError if `algorithm` is neither 'SAMME' nor 'SAMME.R'.
        """
        # Check that algorithm is supported
        if self.algorithm not in ('SAMME', 'SAMME.R'):
            raise ValueError("algorithm %s is not supported" % self.algorithm)

        # Start from an empty cache on every fit.
        self.testpredictions = []
        return super().fit(X, X_test, y, sample_weight)

    def _validate_estimator(self):
        """Check the estimator and set the base_estimator_ attribute."""
        super()._validate_estimator(
            default=DecisionTreeClassifier(max_depth=1))

        #  SAMME-R requires predict_proba-enabled base estimators
        if self.algorithm == 'SAMME.R':
            if not hasattr(self.base_estimator_, 'predict_proba'):
                raise TypeError(
                    "AdaBoostClassifier with algorithm='SAMME.R' requires "
                    "that the weak learner supports the calculation of class "
                    "probabilities with a predict_proba method.\n"
                    "Please change the base estimator or set "
                    "algorithm='SAMME' instead.")
        if not has_fit_parameter(self.base_estimator_, "sample_weight"):
            raise ValueError("%s doesn't support sample_weight."
                             % self.base_estimator_.__class__.__name__)

    def _boost(self, iboost, X, X_test, y, sample_weight, random_state):
        """Dispatch one boosting iteration to the SAMME.R or SAMME variant."""
        if self.algorithm == 'SAMME.R':
            return self._boost_real(iboost, X, X_test, y, sample_weight, random_state)

        else:  # elif self.algorithm == "SAMME":
            return self._boost_discrete(iboost, X, X_test, y, sample_weight,
                                        random_state)

    def _boost_real(self, iboost, X, X_test, y, sample_weight, random_state):
        """Implement a single boost using the SAMME.R real algorithm.

        Also appends this estimator's SAMME.R contribution for `X_test` to
        `testpredictions`.
        """
        # Positional indexing below (y[:, np.newaxis]) needs a plain array.
        y = np.asarray(y)

        estimator = self._make_estimator(random_state=random_state)

        estimator.fit(X, y, sample_weight=sample_weight)

        y_predict_proba = estimator.predict_proba(X)

        if iboost == 0:
            self.classes_ = getattr(estimator, 'classes_', None)
            self.n_classes_ = len(self.classes_)

        y_predict = self.classes_.take(np.argmax(y_predict_proba, axis=1),
                                       axis=0)

        n_classes = self.n_classes_
        classes = self.classes_

        # Cache the test-set contribution; the weights are all 1. for SAMME.R.
        self.testpredictions.append(
            self._samme_proba(estimator, n_classes, X_test))

        # Instances incorrectly classified
        incorrect = y_predict != y

        # Error fraction
        estimator_error = np.mean(
            np.average(incorrect, weights=sample_weight, axis=0))

        # Stop if classification is perfect
        if estimator_error <= 0:
            return sample_weight, 1., 0.

        # Construct y coding as described in Zhu et al [2]:
        #
        #    y_k = 1 if c == k else -1 / (K - 1)
        #
        # where K == n_classes_ and c, k in [0, K) are indices along the second
        # axis of the y coding with c being the index corresponding to the true
        # class label.
        y_codes = np.array([-1. / (n_classes - 1), 1.])
        y_coding = y_codes.take(classes == y[:, np.newaxis])

        # Displace zero probabilities so the log is defined.
        # Also fix negative elements which may occur with
        # negative sample weights.
        proba = y_predict_proba  # alias for readability
        np.clip(proba, np.finfo(proba.dtype).eps, None, out=proba)

        # Boost weight using multi-class AdaBoost SAMME.R alg
        estimator_weight = (-1. * self.learning_rate
                            * ((n_classes - 1.) / n_classes)
                            * xlogy(y_coding, y_predict_proba).sum(axis=1))

        # Only boost the weights if it will fit again
        if not iboost == self.n_estimators - 1:
            # Only boost positive weights
            sample_weight *= np.exp(estimator_weight *
                                    ((sample_weight > 0) |
                                     (estimator_weight < 0)))

        return sample_weight, 1., estimator_error

    def _boost_discrete(self, iboost, X, X_test, y, sample_weight, random_state):
        """Implement a single boost using the SAMME discrete algorithm.

        Also appends this estimator's predicted labels for `X_test` to
        `testpredictions`.
        """
        y = np.asarray(y)

        estimator = self._make_estimator(random_state=random_state)

        estimator.fit(X, y, sample_weight=sample_weight)

        y_predict = estimator.predict(X)

        # classes_ must be set before anything reads it (the first iteration
        # previously used it before assignment).
        if iboost == 0:
            self.classes_ = getattr(estimator, 'classes_', None)
            self.n_classes_ = len(self.classes_)

        # Cache predicted labels in the list decision_function_test() reads
        # (previously appended to a non-existent `self.predictions`).
        self.testpredictions.append(estimator.predict(X_test))

        # Instances incorrectly classified
        incorrect = y_predict != y

        # Error fraction
        estimator_error = np.mean(
            np.average(incorrect, weights=sample_weight, axis=0))

        # Stop if classification is perfect
        if estimator_error <= 0:
            return sample_weight, 1., 0.

        n_classes = self.n_classes_

        # Stop if the error is at least as bad as random guessing
        if estimator_error >= 1. - (1. / n_classes):
            self.estimators_.pop(-1)
            # Drop the matching cached output so that testpredictions stays
            # aligned with estimators_ / estimator_weights_.
            self.testpredictions.pop(-1)
            if len(self.estimators_) == 0:
                raise ValueError('BaseClassifier in AdaBoostClassifier '
                                 'ensemble is worse than random, ensemble '
                                 'can not be fit.')
            return None, None, None

        # Boost weight using multi-class AdaBoost SAMME alg
        estimator_weight = self.learning_rate * (
            np.log((1. - estimator_error) / estimator_error) +
            np.log(n_classes - 1.))

        # Only boost the weights if I will fit again
        if not iboost == self.n_estimators - 1:
            # Only boost positive weights
            sample_weight *= np.exp(estimator_weight * incorrect *
                                    (sample_weight > 0))

        return sample_weight, estimator_weight, estimator_error

    def predict(self, X=None):
        """Predict class labels.

        With no argument (the way this notebook uses it), the labels for the
        `X_test` given to `fit` are computed from the cached per-estimator
        outputs. With `X`, the fitted estimators are applied to `X` directly,
        which requires `X` to already be in their improved feature layout.

        The class previously defined `predict` twice, so the `X`-taking
        version was silently shadowed; both entry points are merged here.
        """
        if X is None:
            pred = self.decision_function_test()
        else:
            X = self._check_X(X)
            pred = self.decision_function(X)

        if self.n_classes_ == 2:
            return self.classes_.take(pred > 0, axis=0)

        return self.classes_.take(np.argmax(pred, axis=1), axis=0)

    def staged_predict(self, X):
        """Yield the predicted labels for `X` after each boosting stage."""
        X = self._check_X(X)

        n_classes = self.n_classes_
        classes = self.classes_

        if n_classes == 2:
            for pred in self.staged_decision_function(X):
                yield np.array(classes.take(pred > 0, axis=0))

        else:
            for pred in self.staged_decision_function(X):
                yield np.array(classes.take(
                    np.argmax(pred, axis=1), axis=0))

    def decision_function_test(self):
        """Decision function for the test set cached during `fit`.

        Returns an array of shape (n_test,) for two classes (positive values
        favour `classes_[1]`), otherwise (n_test, n_classes).
        """
        check_is_fitted(self)

        n_classes = self.n_classes_
        classes = self.classes_[:, np.newaxis]

        if self.algorithm == 'SAMME.R':
            # The weights are all 1. for SAMME.R
            pred = sum(predicts for predicts in self.testpredictions)
        else:  # self.algorithm == "SAMME"
            pred = sum((predicts == classes).T * w
                       for predicts, w in zip(self.testpredictions,
                                              self.estimator_weights_))

        # Not in-place, so the cached arrays are never modified.
        pred = pred / self.estimator_weights_.sum()
        if n_classes == 2:
            pred[:, 0] *= -1
            return pred.sum(axis=1)
        return pred


    def decision_function(self, X):
        """Decision function of the fitted estimators applied directly to `X`.

        `X` must already have the feature layout the estimators were fitted on.
        """
        check_is_fitted(self)
        X = self._check_X(X)

        n_classes = self.n_classes_
        classes = self.classes_[:, np.newaxis]

        if self.algorithm == 'SAMME.R':
            # The weights are all 1. for SAMME.R
            # (_samme_proba is a method here, not a module-level function.)
            pred = sum(self._samme_proba(estimator, n_classes, X)
                       for estimator in self.estimators_)
        else:  # self.algorithm == "SAMME"
            pred = sum((estimator.predict(X) == classes).T * w
                       for estimator, w in zip(self.estimators_,
                                               self.estimator_weights_))

        pred /= self.estimator_weights_.sum()
        if n_classes == 2:
            pred[:, 0] *= -1
            return pred.sum(axis=1)
        return pred

    def staged_decision_function(self, X):
        """Yield the decision function for `X` after each boosting stage."""
        check_is_fitted(self)
        X = self._check_X(X)

        n_classes = self.n_classes_
        classes = self.classes_[:, np.newaxis]
        pred = None
        norm = 0.

        for weight, estimator in zip(self.estimator_weights_,
                                     self.estimators_):
            norm += weight

            if self.algorithm == 'SAMME.R':
                # The weights are all 1. for SAMME.R
                current_pred = self._samme_proba(estimator, n_classes, X)
            else:  # elif self.algorithm == "SAMME":
                current_pred = estimator.predict(X)
                current_pred = (current_pred == classes).T * weight

            if pred is None:
                pred = current_pred
            else:
                pred += current_pred

            if n_classes == 2:
                tmp_pred = np.copy(pred)
                tmp_pred[:, 0] *= -1
                yield (tmp_pred / norm).sum(axis=1)
            else:
                yield pred / norm

    @staticmethod
    def _compute_proba_from_decision(decision, n_classes):
        """Turn decision-function values into class probabilities via softmax.

        Note: for more than two classes `decision` is scaled in place.
        """
        if n_classes == 2:
            decision = np.vstack([-decision, decision]).T / 2
        else:
            decision /= (n_classes - 1)
        return softmax(decision, copy=False)

    def predict_proba(self, X):
        """Class probabilities for `X`, derived from `decision_function(X)`."""
        check_is_fitted(self)
        X = self._check_X(X)

        n_classes = self.n_classes_

        if n_classes == 1:
            return np.ones((_num_samples(X), 1))

        decision = self.decision_function(X)
        return self._compute_proba_from_decision(decision, n_classes)

    def staged_predict_proba(self, X):
        """Yield the class probabilities for `X` after each boosting stage."""
        X = self._check_X(X)

        n_classes = self.n_classes_

        for decision in self.staged_decision_function(X):
            yield self._compute_proba_from_decision(decision, n_classes)

    def predict_log_proba(self, X):
        """Natural logarithm of `predict_proba(X)`."""
        X = self._check_X(X)
        return np.log(self.predict_proba(X))



Adaboost
----------------------------

In [None]:
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.model_selection import RepeatedKFold
import warnings

# Silence every Python warning from here on (affects the whole kernel session).
warnings.filterwarnings("ignore")
# NOTE(review): foz and imp_feature_size are not referenced anywhere in this cell;
# presumably they are read as globals by code defined elsewhere — confirm before removing.
foz=4
imp_feature_size=1
# Ensemble size for the experiment; equals the value the classifiers below are built with.
n_estimators = 10
# Table that collects the accuracy of both AdaBoost variants.
from prettytable import PrettyTable
    
pt_adaboost = PrettyTable()

pt_adaboost.field_names = ["Dataset", "Imp_Adaboost", "Adaboost"]

# Maps dataset name (and dataset name + "_imp") to its list of per-fold accuracies.
foldPreds_AB = {}
# Compare stock AdaBoost with AdaBoostClassifier_IMP on every dataset using
# 5 repeats of 2-fold cross-validation (10 train/test splits per dataset).
#
# NOTE(review): K, d and num_class are assigned but never read in this cell.
# They are kept as module-level names because other notebook code may read
# them as globals — confirm before deleting.
K = 0
for ds in datasets:
  K += 1
  # The two tweet datasets hold raw text and need vectorising inside each fold.
  is_text_ds = ds in ("iphonetweets", "hobbittweets")

  # Fixed seed so that re-running the cell reproduces the same splits.
  rkf = RepeatedKFold(n_splits=2, n_repeats=5, random_state=2652124)
  if is_text_ds:
    df = datasets[ds]
    X = df['text']
    Y = df['label']
  else:
    X, Y = datasets[ds]
    # Labels are stored as bytes here (presumably from the ARFF loader); decode to str.
    Y = Y.str.decode("utf-8")

  print(ds)
  accuracies = []
  accuracies_imp = []
  for train_index, test_index in rkf.split(X):

    X_train = X.iloc[train_index]
    X_test = X.iloc[test_index]
    Y_train = Y.iloc[train_index]
    Y_test = Y.iloc[test_index]

    if is_text_ds:
      # Fit the vocabulary on the training fold only, then encode both folds.
      vectorizer = CountVectorizer(analyzer=preprocess)
      vectorizer.fit(X_train)

      X_train = pd.DataFrame(vectorizer.transform(X_train).toarray())
      X_test = pd.DataFrame(vectorizer.transform(X_test).toarray())

    d = len(X_train.columns)
    num_class = len(Y_train.value_counts())

    # Baseline: use the configured ensemble size instead of a repeated literal.
    ab_clf = AdaBoostClassifier(n_estimators=n_estimators)
    ab_clf.fit(X=X_train, y=Y_train)
    y_preds = ab_clf.predict(X_test)

    # Improved variant: receives the test features at fit time and predicts
    # for them through an argument-less predict().
    ab_clf_imp = AdaBoostClassifier_IMP(n_estimators=n_estimators)
    ab_clf_imp.fit(X=X_train, X_test=X_test, y=Y_train)
    y_preds_imp = ab_clf_imp.predict()

    # Score each model once and reuse the value for both the list and the table.
    acc = accuracy_score(Y_test, y_preds)
    acc_imp = accuracy_score(Y_test, y_preds_imp)
    accuracies.append(acc)
    accuracies_imp.append(acc_imp)
    pt_adaboost.add_row((ds, "%.4f" % acc_imp, "%.4f" % acc))

  foldPreds_AB[ds] = accuracies
  foldPreds_AB[ds+"_imp"] = accuracies_imp


In [None]:
# Show the AdaBoost accuracy table (one row per dataset and cross-validation fold).
print(pt_adaboost)

In [9]:
# Persist the per-fold accuracies. np.save stores a dict as a pickled 0-d object
# array, so it must be read back with allow_pickle enabled followed by .item().
# ds_folderpath is defined outside this cell.
np.save(ds_folderpath + "/" + 'adaboost.npy',foldPreds_AB ) 

Sonuçlar
-----------------------------

In [19]:
# Summarise the saved bagging results: per dataset, count the folds where the
# improved variant beats / loses to the baseline and report both mean accuracies.
from prettytable import PrettyTable

# allow_pickle must be a real boolean (the string 'TRUE' only worked because any
# non-empty string is truthy). Unpickling can execute code: load only files this
# notebook wrote itself. The path is relative, so it resolves against the current
# working directory.
foldPreds_Bagging = np.load("gdrive/MyDrive/ImprovedSpace/" + 'bagging.npy', allow_pickle=True).item()

result_Bagging = PrettyTable()

result_Bagging.field_names = ["Dataset","Win_Improved","Loss_Improved" ,"Imp_Bagging", "Bagging"]
for ds in datasets:
  # Convert each fold list once and reuse it for the comparisons and the means.
  base_acc = np.array(foldPreds_Bagging[ds])
  imp_acc = np.array(foldPreds_Bagging[ds+'_imp'])
  win = base_acc < imp_acc
  loss = base_acc > imp_acc
  cnt_win = np.count_nonzero(win)
  cnt_loss = np.count_nonzero(loss)
  ave = np.average(base_acc)
  ave_imp = np.average(imp_acc)
  result_Bagging.add_row((ds, cnt_win, cnt_loss, "%.4f" % ave_imp, "%.4f" % ave))

print(result_Bagging)

+----------------+--------------+---------------+-------------+---------+
|    Dataset     | Win_Improved | Loss_Improved | Imp_Bagging | Bagging |
+----------------+--------------+---------------+-------------+---------+
|   ionosphere   |      8       |       1       |    0.9293   |  0.9094 |
|    diabetes    |      7       |       3       |    0.7456   |  0.7385 |
|     glass      |      3       |       7       |    0.6665   |  0.7091 |
|    abalone     |      8       |       2       |    0.2350   |  0.2293 |
| heart-statlog  |      8       |       2       |    0.8074   |  0.7837 |
|     colic      |      5       |       5       |    0.8359   |  0.8315 |
|    credit-g    |      4       |       6       |    0.7308   |  0.7340 |
| balance-scale  |      10      |       0       |    0.9274   |  0.8089 |
|     autos      |      3       |       5       |    0.6634   |  0.6644 |
|    credit-a    |      4       |       5       |    0.8470   |  0.8490 |
|    breast-w    |      9       |     

In [20]:
# Summarise the saved random-forest results: per dataset, count the folds where
# the improved variant beats / loses to the baseline and report both mean accuracies.
from prettytable import PrettyTable

# allow_pickle must be a real boolean (the string 'TRUE' only worked because any
# non-empty string is truthy). Unpickling can execute code: load only files this
# notebook wrote itself. The path is relative, so it resolves against the current
# working directory.
foldPreds_RF = np.load("gdrive/MyDrive/ImprovedSpace/" + 'RF.npy', allow_pickle=True).item()

result_RF = PrettyTable()

result_RF.field_names = ["Dataset","Win_Improved","Loss_Improved" ,"Imp_RF", "RF"]
for ds in datasets:
  # Convert each fold list once and reuse it for the comparisons and the means.
  base_acc = np.array(foldPreds_RF[ds])
  imp_acc = np.array(foldPreds_RF[ds+'_imp'])
  win = base_acc < imp_acc
  loss = base_acc > imp_acc
  cnt_win = np.count_nonzero(win)
  cnt_loss = np.count_nonzero(loss)
  ave = np.average(base_acc)
  ave_imp = np.average(imp_acc)
  result_RF.add_row((ds, cnt_win, cnt_loss, "%.4f" % ave_imp, "%.4f" % ave))

print(result_RF)

+----------------+--------------+---------------+--------+--------+
|    Dataset     | Win_Improved | Loss_Improved | Imp_RF |   RF   |
+----------------+--------------+---------------+--------+--------+
|   ionosphere   |      7       |       2       | 0.9310 | 0.9168 |
|    diabetes    |      9       |       1       | 0.7440 | 0.7320 |
|     glass      |      6       |       3       | 0.7045 | 0.7014 |
|    abalone     |      5       |       5       | 0.2276 | 0.2270 |
| heart-statlog  |      7       |       3       | 0.8037 | 0.7874 |
|     colic      |      1       |       6       | 0.8082 | 0.8158 |
|    credit-g    |      7       |       3       | 0.7378 | 0.7282 |
| balance-scale  |      10      |       0       | 0.9261 | 0.8038 |
|     autos      |      4       |       6       | 0.6614 | 0.6683 |
|    credit-a    |      4       |       5       | 0.8583 | 0.8606 |
|    breast-w    |      9       |       1       | 0.9642 | 0.9548 |
| breast-cancer  |      4       |       6       

In [21]:
# Summarise the saved AdaBoost results: per dataset, count the folds where the
# improved variant beats / loses to the baseline and report both mean accuracies.
from prettytable import PrettyTable

# allow_pickle must be a real boolean (the string 'TRUE' only worked because any
# non-empty string is truthy). Unpickling can execute code: load only files this
# notebook wrote itself. The path is relative, so it resolves against the current
# working directory. This rebinds foldPreds_AB to the contents of the file.
foldPreds_AB = np.load("gdrive/MyDrive/ImprovedSpace/" + 'adaboost.npy', allow_pickle=True).item()

result_AB = PrettyTable()

result_AB.field_names = ["Dataset","Win_Improved","Loss_Improved" ,"Imp_Adaboost", "Adaboost"]
for ds in datasets:
  # Convert each fold list once and reuse it for the comparisons and the means.
  base_acc = np.array(foldPreds_AB[ds])
  imp_acc = np.array(foldPreds_AB[ds+'_imp'])
  win = base_acc < imp_acc
  loss = base_acc > imp_acc
  cnt_win = np.count_nonzero(win)
  cnt_loss = np.count_nonzero(loss)
  ave = np.average(base_acc)
  ave_imp = np.average(imp_acc)
  result_AB.add_row((ds, cnt_win, cnt_loss, "%.4f" % ave_imp, "%.4f" % ave))

print(result_AB)

+----------------+--------------+---------------+--------------+----------+
|    Dataset     | Win_Improved | Loss_Improved | Imp_Adaboost | Adaboost |
+----------------+--------------+---------------+--------------+----------+
|   ionosphere   |      6       |       1       |    0.9168    |  0.9048  |
|    diabetes    |      7       |       3       |    0.7495    |  0.7375  |
|     glass      |      9       |       1       |    0.5005    |  0.4294  |
|    abalone     |      6       |       4       |    0.2199    |  0.2225  |
| heart-statlog  |      4       |       5       |    0.7859    |  0.7926  |
|     colic      |      2       |       7       |    0.7804    |  0.7880  |
|    credit-g    |      3       |       7       |    0.7070    |  0.7140  |
| balance-scale  |      6       |       4       |    0.8220    |  0.8518  |
|     autos      |      7       |       3       |    0.4505    |  0.4267  |
|    credit-a    |      5       |       5       |    0.8414    |  0.8397  |
|    breast-