In [None]:
import numpy
import sklearn.svm
import numpy as np

def reduce_features(solution, features):
    selected_elements_indices = numpy.where(solution == 1)[0]
    reduced_features = features[:, selected_elements_indices]
    return reduced_features


def classification_accuracy(labels, predictions):
    correct = numpy.where(labels == predictions)[0]
    accuracy = correct.shape[0]/labels.shape[0]
    return accuracy


def cal_pop_fitness(pop, features, labels, train_indices, test_indices):
    accuracies = numpy.zeros(pop.shape[0])
    idx = 0

    for curr_solution in pop:
        reduced_features = reduce_features(curr_solution, features)
        train_data = reduced_features[train_indices, :]
        test_data = reduced_features[test_indices, :]
        print ("train_indices::: ", train_indices)
        print ("labels[train_indices] : ", labels[train_indices])
        train_labels = labels[train_indices]
        test_labels = labels[test_indices]

        SV_classifier = sklearn.svm.SVC(gamma='scale')
        SV_classifier.fit(X=train_data, y=train_labels)

        predictions = SV_classifier.predict(test_data)
        accuracies[idx] = classification_accuracy(test_labels, predictions)
        idx = idx + 1
    return accuracies

def select_mating_pool(pop, fitness, num_parents):
    # Selecting the best individuals in the current generation as parents for producing the offspring of the next generation.
    parents = numpy.empty((num_parents, pop.shape[1]))
    for parent_num in range(num_parents):
        max_fitness_idx = numpy.where(fitness == numpy.max(fitness))
        max_fitness_idx = max_fitness_idx[0][0]
        parents[parent_num, :] = pop[max_fitness_idx, :]
        fitness[max_fitness_idx] = -99999999999
    return parents


def crossover(parents, offspring_size):
    offspring = numpy.empty(offspring_size)
    # The point at which crossover takes place between two parents. Usually, it is at the center.
    crossover_point = numpy.uint8(offspring_size[1]/2)

    for k in range(offspring_size[0]):
        # Index of the first parent to mate.
        parent1_idx = k%parents.shape[0]
        # Index of the second parent to mate.
        parent2_idx = (k+1)%parents.shape[0]
        # The new offspring will have its first half of its genes taken from the first parent.
        offspring[k, 0:crossover_point] = parents[parent1_idx, 0:crossover_point]
        # The new offspring will have its second half of its genes taken from the second parent.
        offspring[k, crossover_point:] = parents[parent2_idx, crossover_point:]
    return offspring


def mutation(offspring_crossover, num_mutations=2):
    mutation_idx = numpy.random.randint(low=0, high=offspring_crossover.shape[1], size=num_mutations)
    # Mutation changes a single gene in each offspring randomly.
    for idx in range(offspring_crossover.shape[0]):
        # The random value to be added to the gene.
        offspring_crossover[idx, mutation_idx] = 1 - offspring_crossover[idx, mutation_idx]
    return offspring_crossover

In [None]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer , CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
# from sklearn.datasets import fetch_20newsgroups_vectorized 

twenty_train = fetch_20newsgroups(subset='train', shuffle=True)
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(twenty_train.data).toarray()
print (len(twenty_train.data))
print (X_train_counts.shape)
# twenty_train.data.

In [None]:


vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(newsgroups_train.data)
y = newsgroups_train.target
Xtest = vectorizer.transform(newsgroups_test.data)
ytest = newsgroups_test.target
print(y.shape)

In [None]:
import numpy
# import GA
import pickle
import matplotlib.pyplot

newsgroups = fetch_20newsgroups(subset='all')
categories = list(newsgroups.target_names)
newsgroups_train = fetch_20newsgroups(subset='train',categories=categories)
newsgroups_test = fetch_20newsgroups(subset='test', categories=categories)

# f = open("dataset_features.pkl", "rb")
data_inputs = newsgroups_train.data
# f.close()

# f = open("outputs.pkl", "rb")
data_outputs = categories
# f.close()

count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(twenty_train.data)
data_inputs = X_train_counts
num_samples = data_inputs.shape[0]
num_feature_elements = data_inputs.shape[1]

train_indices = numpy.arange(1, num_samples, 4)
print ("train_indices: " , train_indices)
test_indices = numpy.arange(0, num_samples, 4)
print("Number of training samples: ", train_indices.shape[0])
print("Number of test samples: ", test_indices.shape[0])

"""
Genetic algorithm parameters:
    Population size
    Mating pool size
    Number of mutations
"""
sol_per_pop = 8 # Population size.
num_parents_mating = 4 # Number of parents inside the mating pool.
num_mutations = 3 # Number of elements to mutate.

# Defining the population shape.
pop_shape = (sol_per_pop, num_feature_elements)

# Creating the initial population.
new_population = numpy.random.randint(low=0, high=2, size=pop_shape)
print(new_population.shape)

best_outputs = []
num_generations = 100
for generation in range(num_generations):
    print("Generation : ", generation)
    # Measuring the fitness of each chromosome in the population.
    print ("data_outputs: " , data_outputs)
    fitness = cal_pop_fitness(new_population, data_inputs, data_outputs, train_indices, test_indices)

    best_outputs.append(numpy.max(fitness))
    # The best result in the current iteration.
    print("Best result : ", best_outputs[-1])

    # Selecting the best parents in the population for mating.
    parents = GA.select_mating_pool(new_population, fitness, num_parents_mating)

    # Generating next generation using crossover.
    offspring_crossover = GA.crossover(parents, offspring_size=(pop_shape[0]-parents.shape[0], num_feature_elements))

    # Adding some variations to the offspring using mutation.
    offspring_mutation = GA.mutation(offspring_crossover, num_mutations=num_mutations)

    # Creating the new population based on the parents and offspring.
    new_population[0:parents.shape[0], :] = parents
    new_population[parents.shape[0]:, :] = offspring_mutation

# Getting the best solution after iterating finishing all generations.
# At first, the fitness is calculated for each solution in the final generation.
fitness = GA.cal_pop_fitness(new_population, data_inputs, data_outputs, train_indices, test_indices)
# Then return the index of that solution corresponding to the best fitness.
best_match_idx = numpy.where(fitness == numpy.max(fitness))[0]
best_match_idx = best_match_idx[0]

best_solution = new_population[best_match_idx, :]
best_solution_indices = numpy.where(best_solution == 1)[0]
best_solution_num_elements = best_solution_indices.shape[0]
best_solution_fitness = fitness[best_match_idx]

print("best_match_idx : ", best_match_idx)
print("best_solution : ", best_solution)
print("Selected indices : ", best_solution_indices)
print("Number of selected elements : ", best_solution_num_elements)
print("Best solution fitness : ", best_solution_fitness)

matplotlib.pyplot.plot(best_outputs)
matplotlib.pyplot.xlabel("Iteration")
matplotlib.pyplot.ylabel("Fitness")
matplotlib.pyplot.show()

In [None]:
# print (num_samples)
# print (train_indices)
newsgroups_train.data

In [None]:
print (X_train_counts)
print (X_train_counts.shape[0])
In [274]: dt=np.dtype('int,float')

In [275]: np.array(xlist,dtype=dt)
# data_inputs = np.array(X_train_counts)


In [None]:
print (dataset.data[1])
print (dataset.target[1])

In [None]:
# get dataframe path, population number and generation number from command-line argument

from sklearn.datasets import fetch_20newsgroups_vectorized
# newsgroups = fetch_20newsgroups_vectorized(subset='all')
# newsgroups = fetch_20newsgroups_vectorized(subset='test')
newsgroups = fetch_20newsgroups_vectorized(subset='train')

categories = list(newsgroups.target_names)
dataframePath = newsgroups
n_pop = 20
n_gen = 4

# read dataframe from csv
df = newsgroups

# encode labels column to numbers
le = categories
le.fit(df.iloc[:, -1])
y = le.transform(df.iloc[:, -1])
X = categories

# get accuracy with all features
individual = [1 for i in range(len(X.columns))]
print("Accuracy with all features: \t" +
      str(getFitness(individual, X, y)) + "\n")

# apply genetic algorithm
hof = geneticAlgorithm(X, y, n_pop, n_gen)

# select the best individual
accuracy, individual, header = bestIndividual(hof, X, y)
print('Best Accuracy: \t' + str(accuracy))
print('Number of Features in Subset: \t' + str(individual.count(1)))
print('Individual: \t\t' + str(individual))
print('Feature Subset\t: ' + str(header))

print('\n\ncreating a new classifier with the result')

# read dataframe from csv one more time
# df = pd.read_csv(dataframePath, sep=',')

# with feature subset
X = df[header]

clf = LogisticRegression()

scores = cross_val_score(clf, X, y, cv=5)
print("Accuracy with Feature Subset: \t" + str(avg(scores)) + "\n")

In [None]:
import pandas as pd
import numpy as np
import random
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
from deap import creator, base, tools, algorithms
import sys


def avg(l):
    """
    Returns the average between list elements
    """
    return (sum(l)/float(len(l)))


def getFitness(individual, X, y):
    """
    Feature subset fitness function
    """

    if(individual.count(0) != len(individual)):
        # get index with value 0
        cols = [index for index in range(
            len(individual)) if individual[index] == 0]

        # get features subset
        X_parsed = X.drop(X.columns[cols], axis=1)
        X_subset = pd.get_dummies(X_parsed)

        # apply classification algorithm
        clf = LogisticRegression()

        return (avg(cross_val_score(clf, X_subset, y, cv=5)),)
    else:
        return(0,)


def geneticAlgorithm(X, y, n_population, n_generation):
    """
    Deap global variables
    Initialize variables to use eaSimple
    """
    # create individual
    creator.create("FitnessMax", base.Fitness, weights=(1.0,))
    creator.create("Individual", list, fitness=creator.FitnessMax)

    # create toolbox
    toolbox = base.Toolbox()
    toolbox.register("attr_bool", random.randint, 0, 1)
    toolbox.register("individual", tools.initRepeat,
                     creator.Individual, toolbox.attr_bool, len(X.columns))
    toolbox.register("population", tools.initRepeat, list,
                     toolbox.individual)
    toolbox.register("evaluate", getFitness, X=X, y=y)
    toolbox.register("mate", tools.cxOnePoint)
    toolbox.register("mutate", tools.mutFlipBit, indpb=0.05)
    toolbox.register("select", tools.selTournament, tournsize=3)

    # initialize parameters
    pop = toolbox.population(n=n_population)
    hof = tools.HallOfFame(n_population * n_generation)
    stats = tools.Statistics(lambda ind: ind.fitness.values)
    stats.register("avg", np.mean)
    stats.register("min", np.min)
    stats.register("max", np.max)

    # genetic algorithm
    pop, log = algorithms.eaSimple(pop, toolbox, cxpb=0.5, mutpb=0.2,
                                   ngen=n_generation, stats=stats, halloffame=hof,
                                   verbose=True)

    # return hall of fame
    return hof


def bestIndividual(hof, X, y):
    """
    Get the best individual
    """
    maxAccurcy = 0.0
    for individual in hof:
        if(individual.fitness.values > maxAccurcy):
            maxAccurcy = individual.fitness.values
            _individual = individual

    _individualHeader = [list(X)[i] for i in range(
        len(_individual)) if _individual[i] == 1]
    return _individual.fitness.values, _individual, _individualHeader


def getArguments():
    """
    Get argumments from command-line
    If pass only dataframe path, pop and gen will be default
    """
    dfPath = sys.argv[1]
    if(len(sys.argv) == 4):
        pop = int(sys.argv[2])
        gen = int(sys.argv[3])
    else:
        pop = 10
        gen = 2
    return dfPath, pop, gen




In [None]:
import random
import numpy as np
import matplotlib.pyplot as plt

from sklearn.datasets import load_boston
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer , CountVectorizer , HashingVectorizer
from sklearn.datasets import fetch_20newsgroups_vectorized
from sklearn.naive_bayes import MultinomialNB

import pandas as pd
# newsgroups = fetch_20newsgroups_vectorized(subset='all')
# newsgroups = fetch_20newsgroups_vectorized(subset='test')
newsgroups = fetch_20newsgroups_vectorized(subset='train')

SEED = 2018
random.seed(SEED)
np.random.seed(SEED)

#==============================================================================
# Data 
#==============================================================================
dataset = newsgroups


#==============================================================================
# CV MSE before feature selection
#==============================================================================
# est = LinearRegression()

categories = None
# data_train = fetch_20newsgroups_vectorized(subset='train', remove=('headers', 'footers', 'quotes'))
# data_test = fetch_20newsgroups_vectorized(subset='test', remove=('headers', 'footers', 'quotes'))
# data_all = fetch_20newsgroups_vectorized(subset='all', remove=('headers', 'footers', 'quotes'))

# store training feature matrix in "Xtr"
# Xtr = data_train.data
# print ("Xtr:\n", Xtr)

# store training response vector in "ytr"
# ytr = data_train.target
# print ("ytr:",ytr)

# store testing feature matrix in "Xtt"
# Xtt = data_train.data
# print ("Xtt:\n", Xtt)

# store testing response vector in "ytt"
# ytt = data_train.target
# print ("ytt:",ytt)

# store all feature matrix in "Xtr"
# X = data_all.data
# print ("X:\n", X.shape)

# store training response vector in "ytr"
# y = data_all.target
# print ("ytr:",ytr)

categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']

# data = fetch_20newsgroups(subset='all')
data_train = fetch_20newsgroups(subset='train',  categories=categories, remove=('headers', 'footers', 'quotes'))
data_test = fetch_20newsgroups(subset='test', categories=categories, remove=('headers', 'footers', 'quotes'))

y_train, y_test = data_train.target, data_test.target
vectorizer = HashingVectorizer(stop_words='english', non_negative=True)
X_train = vectorizer.transform(data_train.data)
X_test = vectorizer.transform(data_test.data)

#----------
clf = MultinomialNB(alpha=.01)
clf.fit(X_train, y_train)
pred = clf.predict(X_test)
score = metrics.accuracy_score(y_test, pred)
print("accuracy before feature selection:   %0.3f" % score)
#---------

# vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.3, stop_words='english',smooth_idf =True)
# data_train_vectors = vectorizer.fit_transform(data_train.data)
# # data_test_vectors = vectorizer.transform(data_test.data) 
# X = data_train_vectors
# y = data_train.target


# --------------------------------------

# Xtt = data_test_vectors
# ytt = data_test.target

# clf_mnb = MultinomialNB(alpha=.01)
# clf_mnb.fit(X, y)
# y_pred_mnb = clf_mnb.predict(Xtt)
# print ("Classification Accuracy:",metrics.accuracy_score(ytt, y_pred_mnb))
# --------------------------------------
# clf_mnb = MultinomialNB(alpha=.01)
## print ("MultinomialNB 10-Cross Validation Score before feature selection:",cross_val_score(clf_mnb, X, y, cv=5, scoring='accuracy').mean())

# pred = clf_mnb.predict(Xtr)
# m = metrics.f1_score(ytr, pred, average='macro')
# print("CV metrics f1 before feature selection: {:.2f}".format(m))
#     score = cross_validation.cross_val_score( clf, X.toarray(), y, cv=5)
# score = -1.0 * cross_val_score(clf_mnb, X, y, cv=5, scoring='neg_mean_squared_error')
# print("CV MSE before feature selection: {:.2f}".format(np.mean(score)))

#==============================================================================
# Class performing feature selection with genetic algorithm
#==============================================================================
# (estimator=clf_mnb, n_gen=4, size=50, n_best=10, n_rand=40, n_children=2, mutation_rate=0.05)
class GeneticSelector():
    def __init__(self, estimator, n_gen, size, n_best, n_rand, 
                 n_children, mutation_rate):
        
        print ("__init__: ")
        # Estimator 
        self.estimator = estimator
        # Number of generations
        self.n_gen = n_gen
        # Number of chromosomes in population
        self.size = size
        # Number of best chromosomes to select
        self.n_best = n_best
        # Number of random chromosomes to select
        self.n_rand = n_rand
        # Number of children created during crossover
        self.n_children = n_children
        # Probablity of chromosome mutation
        self.mutation_rate = mutation_rate
        if int((self.n_best + self.n_rand) / 2) * self.n_children != self.size:
            raise ValueError("The population size is not stable.")  
            
# ------------------------------
# i :  0
# chromosome shape:  (101631,)
# mask:  [False  True False ... False False False]
# mask shape:  (101631,)
# chromosome[mask]:  [False False False ... False False False]
# chromosome[mask] shape:  (30494,)71082 10329

# i :  0
# chromosome shape:  (101631,)
# np.random.rand(len(chromosome)):  [0.96728762 0.29321817 0.06291302 ... 0.09776348 0.70959122 0.86756446]
# np.random.rand(len(chromosome)) < 0.3:  [False False False ... False  True False]
# mask:  [False False False ... False False False]
# mask [mask !=False]) :  [ True  True  True ...  True  True  True]
# mask shape:  (101631,)
# chromosome[mask]:  [False False False ... False False False]
# chromosome:  (101631,)
# chromosome[mask] shape:  (5047,)

# i :  0
# chromosome shape:  (101631,)
# chromosome[chromosome !=False] :  (101631,)
# mask [mask !=False]) :  (5047,)
# mask shape:  (101631,)
# (chromosome[mask]!=False).shape:  (5047,)
# chromosome[mask]:  [False False False ... False False False]
# chromosome:  (101631,)
# chromosome[mask] shape:  (5047,)
# chromosome[chromosome !=False] :  (96584,)
    
# ------------------------------
    def initilize(self):
#         print ("initilize: ")
        population = []
        for i in range(self.size):
            chromosome = np.ones(self.n_features, dtype=np.bool)
            # each chromosome has 113000 size /  chromosome is document / pop = 100 chromosome
#             print ("self.n_features: ", self.n_features)
#             print ("chromosome shape: ", chromosome.shape)
#             print ("chromosome[chromosome !=False] : ", chromosome[chromosome !=False].shape)
    
            mask = np.random.rand(len(chromosome)) < 0.3  # Create an array of the given shape and populate it with random samples
            
#             print ("mask [mask !=False]) : ", mask[mask !=False].shape)
#             print ("mask shape: ", mask.shape)
            
            chromosome[mask] = False
#             print ("(chromosome[mask]!=False).shape: ", (chromosome[mask]!=False).shape)
#             print ("chromosome[mask]: ", chromosome[mask])
            
#             print ("chromosome: ", chromosome.shape)
#             print ("chromosome[mask] shape: ", chromosome[mask].shape)
#             print ("chromosome[chromosome !=False] : ", chromosome[chromosome !=False].shape)
            population.append(chromosome)
#             print ("population: ", population)
        return population

    def fitness(self, population):
#         print ("fitness: ")
        X, y = self.dataset
#         print ("X: ",X)
#         print ("X.shape: ",X.shape)
#         print ("y.shape: ",y.shape)
        scores = []
        for chromosome in population:
#             print ("chromosome: ",chromosome)
#             print ("chromosome.shape: ",chromosome.shape)
#             print ("chromosome[chromosome !=False].shape: ",chromosome[chromosome !=False].shape)
#             print ("X[:,chromosome]: ",X[:,chromosome])
#             print ("X[:,].shape: ",X[:,].shape)
#             print ("X[,].shape: ",X[0,].shape)
#             score = X[:,chromosome].mean()
            score = -1.0 * np.mean(cross_val_score(self.estimator, X[:,chromosome], y, cv=5, scoring="neg_mean_squared_error"))
            scores.append(score)
        
#         print ("scores.shape: ",len(scores))
#         print ("scores: ",scores)
        scores, population = np.array(scores), np.array(population) 
        inds = np.argsort(scores)
#         print ("inds: ",inds)
#         print("list(scores[inds]): ",list(scores[inds]))
#         print("population[inds,:]: ",population[inds,:])
#         print("list(population[inds,:]: ",list(population[inds,:]))
        return list(scores[inds]), list(population[inds,:])

    def select(self, population_sorted):
#         print ("select: ")
        population_next = []
        for i in range(self.n_best):
            population_next.append(population_sorted[i])
        for i in range(self.n_rand):
            population_next.append(random.choice(population_sorted))
        random.shuffle(population_next)
        return population_next

    def crossover(self, population):
#         print ("crossover: ")
        population_next = []
        for i in range(int(len(population)/2)):
            for j in range(self.n_children):
                chromosome1, chromosome2 = population[i], population[len(population)-1-i]
                child = chromosome1
                mask = np.random.rand(len(child)) > 0.5
                child[mask] = chromosome2[mask]
                population_next.append(child)
#         print (len(population_next))
        return population_next
	
    def mutate(self, population):
#         print ("mutate: ")
        population_next = []
        for i in range(len(population)):
            chromosome = population[i]
            if random.random() < self.mutation_rate:
                mask = np.random.rand(len(chromosome)) < 0.05
                chromosome[mask] = False
            population_next.append(chromosome)
        return population_next

    def generate(self, population):
#         print ("generate: ")
        # Selection, crossover and mutation
        scores_sorted, population_sorted = self.fitness(population)
        population = self.select(population_sorted)
        population = self.crossover(population)
        population = self.mutate(population)
        # History
        self.chromosomes_best.append(population_sorted[0])
        self.scores_best.append(scores_sorted[0])
        self.scores_avg.append(np.mean(scores_sorted))
        
        return population

    def fit(self, X, y):
#         print ("fit: ")
 
        self.chromosomes_best = []
        self.scores_best, self.scores_avg  = [], []
        
        self.dataset = X, y
        self.n_features = X.shape[1]
        # dfine number of features
        self.n_features
        population = self.initilize()  # 100 chromosome 
        for i in range(self.n_gen): # each pop generation has 4 
            population = self.generate(population) 
#             XX.append(population)
        return self 
    
    @property
    def support_(self):
#         print ("self.chromosomes_best[-1]: ",self.chromosomes_best[-1])
        return self.chromosomes_best[-1]

    def plot_scores(self):
        plt.plot(self.scores_best, label='Best')
        plt.plot(self.scores_avg, label='Average')
        plt.legend()
        plt.ylabel('Scores')
        plt.xlabel('Generation')
        plt.show()
        


# do2 = []
# XX = []
# for i in range(X.shape[0]//500):
#     doc = X[i,]
#     print ("do1",doc.shape)

clf_mnb = MultinomialNB(alpha=.01) 

# XX = show_top10(clf_mnb, vectorizer, y)

sel = GeneticSelector(estimator=clf_mnb,n_gen=7, size=200, n_best=40, n_rand=40, n_children=5, mutation_rate=0.05)

sel.fit(X_train, y_train)
#     print (XX[i])
#     XX.append(population)
#     for i in range(do.shape[1]):
# #         if do[:,i] > 0:
#         do2.append(do[:,i])
#     #         print ("feature ",X[0,i])
#     print ("do2",doc.shape)
#     do2 = []



# XX.plot_scores()

# print (len(XX))

# --------------------------------------
# clf_mnb = MultinomialNB(alpha=.1)

# # Fit the model with data (aka "model training")
# clf_mnb.fit(X[XX], y)

# # Predict the response for a new observation
# y_pred = clf_mnb.predict(Xt)
# print ("Predicted Class Labels:",y_pred)

# score = metrics.accuracy_score(yt, y_pred)
# print("CV MSE before feature selection: {:.2f}".format(score))
# --------------------------------------

#================
clf = MultinomialNB(alpha=.01)
clf.fit(X_train, y_train)
pred = clf.predict(X_test)
score = metrics.accuracy_score(y_test, pred)
print("accuracy after feature selection:   %0.3f" % score)
#=================
# 1048576

# score = -1.0 * cross_val_score(clf_mnb, X[:,sel.support_], y, cv=5, scoring="neg_mean_squared_error")
# print("CV MSE after feature selection: {:.2f}".format(np.mean(score))) 
# print ("MultinomialNB 10-Cross Validation Score before feature selection:",cross_val_score(clf_mnb, XX, y, cv=5, scoring='accuracy').mean())

In [None]:
CV MSE before feature selection: 3.98
__init__: 
CV MSE after feature selection: 4.60
    
CV MSE before feature selection: 3.98
__init__: 
CV MSE after feature selection: 6.33
    
    
CV MSE before feature selection: 3.98
__init__: 
CV MSE after feature selection: 6.86
In [17]:

* size =50 , features//3
CV MSE before feature selection: 3.98
__init__: 
CV MSE after feature selection: 10.70


*n_gen=4, size=1000, n_best=16, n_rand=4, n_children=100 , features//3
CV MSE before feature selection: 3.98
__init__: 
CV MSE after feature selection: 12.98

*n_gen=4, size=1000, n_best=16, n_rand=4, n_children=100, mutation_rate=0.05 , n_features//2
CV MSE before feature selection: 3.98
__init__: 
CV MSE after feature selection: 8.35

In [None]:
CV MSE after feature selection: 18.10
CV MSE after feature selection: 14.82
    
CV MSE after feature selection: 14.39
chromosome[mask] shape:  (30494,)

In [None]:
score = -1.0 * cross_val_score(clf_mnb, X[:,sel.support_], y, cv=10, scoring='accuracy')
print("CV MSE before feature selection: {:.2f}".format(np.mean(score)))

In [None]:
X[:,chromosome]

In [None]:
v = X.toarray()
print (v[2938])

In [None]:
def show_top10(classifier, vectorizer, categories):
    classifier.fit(X, y)
    y_pred = classifier.predict(X)
    print ("Predicted Class Labels:",y_pred.shape)
    
    print ("vectorizer.get_feature_names(): ", len (vectorizer.get_feature_names()))
    data_train_vectors = vectorizer.fit_transform(data_train.data)
    print ("data_train_vectors: ", data_train_vectors.shape)
    feature_names = np.asarray(vectorizer.get_feature_names())
    arr = []
    for i, category in enumerate(categories):
#         top10 = np.argsort(classifier.coef_[i])[-10:]
        labels = enumerate(y_pred)
        ll = []
        count =0
        for j, l in labels:
#             print ("j: ", j)
#             print ("l: ", l)
            if l == i and count <10:
                ll.append(j)
                count = count + 1
#         print ("ll: ", ll)
        if ll != []:
            arr.append(ll)
#             print ("arr: ", arr)
    return arr

In [None]:
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.naive_bayes import MultinomialNB
# dataset = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))

X, y = dataset.data, dataset.target
vectorizer = HashingVectorizer(stop_words='english', non_negative=True)
X_vectorized = vectorizer.transform(X)
features = features = dataset.target_names
dataset = load_boston()
X, y = dataset.data, dataset.target
features = dataset.feature_names

est = MultinomialNB(alpha=.01)

In [None]:
import random
import numpy as np
import matplotlib.pyplot as plt

from sklearn.datasets import load_boston , fetch_20newsgroups , load_wine
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import HashingVectorizer ,TfidfVectorizer


SEED = 2018
random.seed(SEED)
np.random.seed(SEED)

#==============================================================================
# Data 
#==============================================================================

# dataset = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))
cats =['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware']
dataset = fetch_20newsgroups(subset='all', categories=cats,shuffle=True, random_state=42)

X1, y = dataset.data, dataset.target
# vectorizer = HashingVectorizer(stop_words='english', non_negative=True)
vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.3, stop_words='english',smooth_idf =True)


X = vectorizer.fit_transform(X1)
features = features = dataset.target_names


# dataset = load_wine()
# X, y = dataset.data, dataset.target
# features = dataset.feature_names

#==============================================================================
# CV MSE before feature selection
#==============================================================================
# est = LinearRegression()
est = MultinomialNB(alpha=.01)
# score = -1.0 * cross_val_score(est, X, y, cv=5, scoring="neg_mean_squared_error")
# score = -1.0 *cross_val_score(est, X, y, cv=5, scoring='accuracy').mean()
# print("CV MSE before feature selection: {:.2f}".format(np.mean(score)))

# est.fit(X=train_data, y=train_labels)
print ("MultinomialNB 10-Cross Validation accuracy:",-1.0 *cross_val_score(est, X, y, cv=5, scoring='accuracy').mean())
#==============================================================================
# Class performing feature selection with genetic algorithm
#==============================================================================
class GeneticSelector():
    def __init__(self, estimator, n_gen, size, n_best, n_rand, 
                 n_children, mutation_rate):
        # Estimator 
        self.estimator = estimator
        # Number of generations
        self.n_gen = n_gen
        # Number of chromosomes in population
        self.size = size
        # Number of best chromosomes to select
        self.n_best = n_best
        # Number of random chromosomes to select
        self.n_rand = n_rand
        # Number of children created during crossover
        self.n_children = n_children
        # Probablity of chromosome mutation
        self.mutation_rate = mutation_rate
        
        if int((self.n_best + self.n_rand) / 2) * self.n_children != self.size:
            raise ValueError("The population size is not stable.")  
            
    def initilize(self):
        population = []
        for i in range(self.size):
            chromosome = np.ones(self.n_features, dtype=np.bool)
            mask = np.random.rand(len(chromosome)) < 0.3
            chromosome[mask] = False
            population.append(chromosome)
        return population

    def fitness(self, population):
        X, y = self.dataset
        scores = []
        for chromosome in population:
#             score = -1.0 * np.mean(cross_val_score(self.estimator, X[:,chromosome], y, 
#                                                        cv=5, 
#                                                        scoring="neg_mean_squared_error"))

#             self.estimator.fit(X=train_data, y=train_labels)
            score = -1.0 *cross_val_score(self.estimator, X[:,chromosome], y, cv=5, scoring='accuracy').mean()
#             print ("MultinomialNB 10-Cross Validation accuracy:",cross_val_score(self.estimator, X[:,sel.support_], y, cv=5, scoring='accuracy').mean())
            scores.append(score)
        scores, population = np.array(scores), np.array(population) 
        inds = np.argsort(scores)
        return list(scores[inds]), list(population[inds,:])

    def select(self, population_sorted):
        
        population_next = []
        for i in range(self.n_best):
            population_next.append(population_sorted[i])
        for i in range(self.n_rand):
            population_next.append(random.choice(population_sorted))
        random.shuffle(population_next)
        return population_next

    def crossover(self, population):
        population_next = []
        for i in range(int(len(population)/2)):
            for j in range(self.n_children):
                chromosome1, chromosome2 = population[i], population[len(population)-1-i]
                child = chromosome1
                mask = np.random.rand(len(child)) > 0.5
                child[mask] = chromosome2[mask]
                population_next.append(child)
        return population_next
	
    def mutate(self, population):
        population_next = []
        for i in range(len(population)):
            chromosome = population[i]
            if random.random() < self.mutation_rate:
                mask = np.random.rand(len(chromosome)) < 0.05
                chromosome[mask] = False
            population_next.append(chromosome)
        return population_next

    def generate(self, population):
        # Selection, crossover and mutation
        scores_sorted, population_sorted = self.fitness(population)
        population = self.select(population_sorted)
        population = self.crossover(population)
        population = self.mutate(population)
        # History
        self.chromosomes_best.append(population_sorted[0])
        self.scores_best.append(scores_sorted[0])
        print("score : ", scores_sorted[0])
        self.scores_avg.append(np.mean(scores_sorted))
        
        return population

    def fit(self, X, y):
 
        self.chromosomes_best = []
        self.scores_best, self.scores_avg  = [], []
        
        self.dataset = X, y
        self.n_features = X.shape[1]
        
        population = self.initilize()
        for i in range(self.n_gen):
            population = self.generate(population)
            
        return self 
    
    @property
    def support_(self):
        return self.chromosomes_best[-1]

    def plot_scores(self):
        plt.plot(self.scores_best, label='Best')
        plt.plot(self.scores_avg, label='Average')
        plt.legend()
        plt.ylabel('Scores')
        plt.xlabel('Generation')
        plt.show()
# (self.n_best + self.n_rand) / 2) * self.n_children != self.size
sel = GeneticSelector(estimator=MultinomialNB(alpha=.3), 
                       n_gen=100, size=200, n_best=20, n_rand=80, n_children=4, mutation_rate=0.01)
sel.fit(X, y)
sel.plot_scores()
# print("chromosomes_best: ",sel.chromosomes_best)
# score = -1.0 * cross_val_score(est, X[:,sel.support_], y, cv=5, scoring="neg_mean_squared_error")
# print("CV MSE after feature selection: {:.2f}".format(np.mean(score)))
# accuracy = -1.0 *cross_val_score(est, X[:,sel.support_], y, scoring='accuracy', cv = 10).mean()
# # print(accuracy)
# print ("MultinomialNB 10-Cross Validation accuracy:",accuracy)
score = -1.0 *cross_val_score(est, X[:,sel.support_], y, scoring='accuracy', cv = 5)
print("CV MSE after feature selection: {:.2f}".format(np.mean(score)))

MultinomialNB 10-Cross Validation accuracy: -0.9106842643537906
score :  -0.9197605276045913
score :  -0.9181684051039957
score :  -0.9186974479679637
score :  -0.9208393450780991
score :  -0.9211013566277824
score :  -0.921906362048
score :  -0.9219045742695972
score :  -0.9227059983829807
score :  -0.921908865706228
score :  -0.9219113770260359
score :  -0.9232511370179765
score :  -0.9235131533542255
score :  -0.9248500546172744
score :  -0.925384098095738
score :  -0.9251174371714331
score :  -0.9253826711264967
score :  -0.9259238676454971
score :  -0.9267267244743067
score :  -0.9267267244743067
score :  -0.9269966106828498
score :  -0.9267292310037053
score :  -0.9267292310037053
score :  -0.9275320878325152
score :  -0.9275320878325152
score :  -0.9280679219643957
score :  -0.9280679219643957
score :  -0.9280679219643957
score :  -0.9286005356024474
score :  -0.9286012514822725
score :  -0.9286023243404694
score :  -0.9288700638729995
score :  -0.9288697059350086
score :  -0.92

In [13]:
print("chromosomes_best: ",sel.chromosomes_best[-1])

chromosomes_best:  [ True  True False ...  True False  True]


In [None]:
1.3214408061336371
---
n_gen=100, size=200, n_best=20, n_rand=80, n_children=4, mutation_rate=0.01)
-0.9368936152715097
----

CV MSE after feature selection: 13.67
CV MSE after feature selection: 13.64

In [None]:
 n_gen=50, size=40, n_best=4, n_rand=16, n_children=4, mutation_rate=0.01)
0.923780146410006

In [None]:
 n_gen=50, size=200, n_best=20, n_rand=60, n_children=5, mutation_rate=0.05)
1.2292404091555822
-----------------
n_gen=50, size=200, n_best=20, n_rand=60, n_children=5, mutation_rate=0.01)
1.219852777047302
-----------------
n_gen=50, size=200, n_best=20, n_rand=60, n_children=5, mutation_rate=0.01
1.219852777047302
1.15
-----------------
 n_gen=50, size=200, n_best=10, n_rand=70, n_children=5, mutation_rate=0.01
score :  1.2308285914517334
    1.11
-----------------
 n_gen=50, size=320, n_best=10, n_rand=70, n_children=8, mutation_rate=0.01)
    1.2372503569489233
    1.18
 -----------------   
n_gen=50, size=320, n_best=20, n_rand=300, n_children=2, mutation_rate=0.01)
 1.265602689350406
1.15
-----------------
 n_gen=50, size=200, n_best=20, n_rand=80, n_children=4, mutation_rate=0.01)
1.2377726200104433
1.15
-----------------
n_gen=50, size=100, n_best=5, n_rand=35, n_children=5, mutation_rate=0.01)
1.2727858395525313
1.23
-----------------
n_gen=50, size=100, n_best=10, n_rand=30, n_children=5, mutation_rate=0.01)
1.3276356122636421
1.22
----------------
n_gen=50, size=400, n_best=40, n_rand=160, n_children=4, mutation_rate=0.01
1.2011529345999037
1.08
----------------
n_gen=50, size=400, n_best=20, n_rand=180, n_children=4, mutation_rate=0.01)
1.219838801132914
1.12
----------------
n_gen=50, size=400, n_best=60, n_rand=140, n_children=4, mutation_rate=0.01)
1.2128997904157237
1.10
--------------
 n_gen=50, size=400, n_best=60, n_rand=140, n_children=4, mutation_rate=0.01)
    1.10
    1.2128997904157237
--------------
n_gen=500, size=400, n_best=40, n_rand=160, n_children=4, mutation_rate=0.01)
1.1939240208372737
1.12
--------------
n_gen=50, size=400, n_best=40, n_rand=160, n_children=4, mutation_rate=0.01)
-0.933947066856045

In [None]:
out_arr = geek.random.randint(low = 0, high = 3, size = 5) 
print ("Output 1D Array filled with random integers : ", out_arr)