In [None]:
import random
import numpy as np
import matplotlib.pyplot as plt

from sklearn.datasets import load_boston , fetch_20newsgroups , load_wine
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import HashingVectorizer ,TfidfVectorizer


# Seed both RNGs used below (`random` and the legacy global NumPy RNG) so the
# genetic search is repeatable from a fresh kernel.
SEED = 2018
random.seed(SEED)
np.random.seed(SEED)

#==============================================================================
# Data 
#==============================================================================

# Only four of the 20 newsgroups categories are loaded. To use the full corpus:
# dataset = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))
cats = ['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware']
dataset = fetch_20newsgroups(subset='all', categories=cats, shuffle=True, random_state=42)

# X1 holds the raw documents, y the integer class labels.
X1, y = dataset.data, dataset.target
# Alternative vectorizer that was tried:
# vectorizer = HashingVectorizer(stop_words='english', non_negative=True)
vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.3, stop_words='english', smooth_idf=True)

# TF-IDF document-term matrix; its columns are the candidate features.
X = vectorizer.fit_transform(X1)
# NOTE: `target_names` are the class (newsgroup) names, not the names of the
# TF-IDF columns in X.
features = dataset.target_names


# Alternative tabular dataset:
# dataset = load_wine()
# X, y = dataset.data, dataset.target
# features = dataset.feature_names

#==============================================================================
# CV accuracy before feature selection
#==============================================================================
# Regression variant (scored with "neg_mean_squared_error"):
# est = LinearRegression()
est = MultinomialNB(alpha=.01)

# Baseline: mean accuracy over 5 folds using every TF-IDF column.
print("MultinomialNB 5-fold cross-validation accuracy:",
      cross_val_score(est, X, y, cv=5, scoring='accuracy').mean())
#==============================================================================
# Class performing feature selection with genetic algorithm
#==============================================================================
class GeneticSelector():
    """Feature selection driven by a simple genetic algorithm.

    A chromosome is a boolean mask over the columns of ``X``; ``True`` keeps
    the column. Fitness is the negated mean 5-fold cross-validated accuracy of
    ``estimator`` on the kept columns, so *lower scores are better*.

    Parameters
    ----------
    estimator : object
        Estimator passed to ``cross_val_score``.
    n_gen : int
        Number of generations to run.
    size : int
        Number of chromosomes in the population.
    n_best : int
        Number of top-ranked chromosomes kept by ``select``.
    n_rand : int
        Number of chromosomes drawn at random (with replacement) by ``select``.
    n_children : int
        Number of children produced per parent pair in ``crossover``.
    mutation_rate : float
        Probability that a given chromosome is mutated.

    Raises
    ------
    ValueError
        If ``int((n_best + n_rand) / 2) * n_children != size``, i.e. the
        population would not keep the same size from one generation to the next.
    """

    def __init__(self, estimator, n_gen, size, n_best, n_rand, 
                 n_children, mutation_rate):
        # Estimator 
        self.estimator = estimator
        # Number of generations
        self.n_gen = n_gen
        # Number of chromosomes in population
        self.size = size
        # Number of best chromosomes to select
        self.n_best = n_best
        # Number of random chromosomes to select
        self.n_rand = n_rand
        # Number of children created during crossover
        self.n_children = n_children
        # Probability of chromosome mutation
        self.mutation_rate = mutation_rate
        
        if int((self.n_best + self.n_rand) / 2) * self.n_children != self.size:
            raise ValueError("The population size is not stable.")  
            
    def initilize(self):
        """Create the initial population.

        Each chromosome starts with every feature enabled and then has each
        feature switched off independently with probability 0.3.

        Returns
        -------
        list of numpy.ndarray
            ``self.size`` boolean masks of length ``self.n_features``.
        """
        population = []
        for _ in range(self.size):
            # Builtin `bool` instead of the `np.bool` alias, which was removed
            # from NumPy.
            chromosome = np.ones(self.n_features, dtype=bool)
            mask = np.random.rand(len(chromosome)) < 0.3
            chromosome[mask] = False
            population.append(chromosome)
        return population

    def fitness(self, population):
        """Score every chromosome and rank the population.

        Returns
        -------
        (list, list)
            Scores and chromosomes, both sorted by ascending score. Scores are
            negated accuracies, so index 0 is the best chromosome.
        """
        X, y = self.dataset
        scores = []
        for chromosome in population:
            if not chromosome.any():
                # No column selected: nothing to fit. Give it the worst
                # possible score (accuracy 0) instead of crashing.
                scores.append(0.0)
                continue
            score = -1.0 * cross_val_score(self.estimator, X[:, chromosome], y,
                                           cv=5, scoring='accuracy').mean()
            scores.append(score)
        scores, population = np.array(scores), np.array(population) 
        inds = np.argsort(scores)
        return list(scores[inds]), list(population[inds, :])

    def select(self, population_sorted):
        """Pick the parents for the next generation.

        Takes the first ``n_best`` chromosomes of ``population_sorted`` plus
        ``n_rand`` chromosomes drawn at random from it, then shuffles them.
        """
        population_next = []
        for i in range(self.n_best):
            population_next.append(population_sorted[i])
        for _ in range(self.n_rand):
            population_next.append(random.choice(population_sorted))
        random.shuffle(population_next)
        return population_next

    def crossover(self, population):
        """Breed children from pairs of parents.

        Parent ``i`` is paired with parent ``len(population) - 1 - i``. Each
        pair yields ``n_children`` children; every child takes each gene from
        the second parent with probability 0.5 and from the first otherwise.

        The parents are left unmodified and every child is an independent
        array.
        """
        population_next = []
        for i in range(int(len(population) / 2)):
            chromosome1, chromosome2 = population[i], population[len(population) - 1 - i]
            for _ in range(self.n_children):
                # Copy: without it every child of this pair would be the very
                # same array as the parent, which would be overwritten in place.
                child = chromosome1.copy()
                mask = np.random.rand(len(child)) > 0.5
                child[mask] = chromosome2[mask]
                population_next.append(child)
        return population_next

    def mutate(self, population):
        """Randomly mutate chromosomes.

        With probability ``mutation_rate`` a chromosome has each of its genes
        switched off independently with probability 0.05. Mutation only ever
        disables features. Mutated chromosomes are returned as copies, so the
        arrays passed in are not modified.
        """
        population_next = []
        for chromosome in population:
            if random.random() < self.mutation_rate:
                chromosome = chromosome.copy()
                mask = np.random.rand(len(chromosome)) < 0.05
                chromosome[mask] = False
            population_next.append(chromosome)
        return population_next

    def generate(self, population):
        """Run one generation and return the next population.

        Also appends the best chromosome, the best score and the mean score of
        the evaluated population to the history attributes.
        """
        # Selection, crossover and mutation
        scores_sorted, population_sorted = self.fitness(population)
        population = self.select(population_sorted)
        population = self.crossover(population)
        population = self.mutate(population)
        # History (store a snapshot so later generations cannot alter it)
        self.chromosomes_best.append(population_sorted[0].copy())
        self.scores_best.append(scores_sorted[0])
        print("score : ", scores_sorted[0])
        self.scores_avg.append(np.mean(scores_sorted))
        
        return population

    def fit(self, X, y):
        """Run the genetic search on ``(X, y)`` for ``n_gen`` generations.

        Resets the history attributes ``chromosomes_best``, ``scores_best``
        and ``scores_avg``. Returns ``self``.
        """
        self.chromosomes_best = []
        self.scores_best, self.scores_avg  = [], []
        
        self.dataset = X, y
        self.n_features = X.shape[1]
        
        population = self.initilize()
        for _ in range(self.n_gen):
            population = self.generate(population)
            
        return self 
    
    @property
    def support_(self):
        """Boolean mask of the best chromosome of the last evaluated generation.

        Only available after ``fit``. This is the last generation's best, which
        is not necessarily the best chromosome seen over the whole run.
        """
        return self.chromosomes_best[-1]

    def plot_scores(self):
        """Plot the best and the average score of every generation."""
        plt.plot(self.scores_best, label='Best')
        plt.plot(self.scores_avg, label='Average')
        plt.legend()
        plt.ylabel('Scores')
        plt.xlabel('Generation')
        plt.show()
# The parameters must satisfy the population-size constraint checked in
# GeneticSelector.__init__:
#     int((n_best + n_rand) / 2) * n_children == size
sel = GeneticSelector(estimator=MultinomialNB(alpha=.3), 
                       n_gen=100, size=100, n_best=10, n_rand=40, n_children=4, mutation_rate=0.01)
sel.fit(X, y)
sel.plot_scores()
# print("chromosomes_best: ",sel.chromosomes_best)

# Re-score the baseline estimator `est` on the selected columns only. `score`
# holds negated accuracies, the same "lower is better" convention the selector
# uses, so it is negated again for the report.
# NOTE: the search above used MultinomialNB(alpha=.3) while `est` uses
# alpha=.01, and the columns were selected with cross-validation on this same
# data, so this number is likely optimistic.
score = -1.0 * cross_val_score(est, X[:, sel.support_], y, scoring='accuracy', cv=5)
print("CV accuracy after feature selection: {:.4f}".format(-np.mean(score)))

MultinomialNB 10-Cross Validation accuracy: -0.9106842643537906
score :  -0.9187042363608621
score :  -0.9192343492156997
score :  -0.9192368519143086
score :  -0.9189709030351952
score :  -0.9197673169609523
score :  -0.9208375544246821
score :  -0.9208375544246821
score :  -0.920035055533863
score :  -0.920035055533863
score :  -0.9203017250755237
score :  -0.9211042239663426
score :  -0.921903501407557
score :  -0.9224396944370475
score :  -0.9227056433161607
score :  -0.9227095777782633
score :  -0.9235077852247645
score :  -0.9237751649039089
score :  -0.9240425436272777
score :  -0.9243070626659786
score :  -0.924577303941342
score :  -0.924577303941342
score :  -0.9243102812444128
score :  -0.9243099233064219
score :  -0.9243099233064219
score :  -0.9243099233064219
score :  -0.924576946003351
score :  -0.924576946003351
score :  -0.9248453985406924
score :  -0.9248453985406924
score :  -0.9248446836204863
score :  -0.9248446836204863
score :  -0.9248446836204863
score :  -0.924

In [None]:
n_gen=100, size=100, n_best=10, n_rand=40, n_children=4, mutation_rate=0.01)
-0.9275202739326742