# Bonus - Triplets part

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
from itertools import combinations

pd.set_option('display.max_columns', 100) # show up to 100 columns when a DataFrame is displayed
pd.options.mode.chained_assignment = None # turn off pandas' chained-assignment warning (treated here as a false positive)
data = pd.read_csv('ElectionsData.csv', header=0) # load the data set; the first CSV row is used as the column names

## IMPORTANT!!!
We need to convert every column to a dtype other than 'category', i.e. either float or int.

This is because train_test_split cannot be used on columns of 'category' dtype.


In [None]:
def count_categories(attr):
    """Return how many distinct categories column `attr` of the module-level `data` frame has."""
    as_category = data[attr].astype('category')
    distinct_values = as_category.cat.categories
    return len(distinct_values)

# Object-dtype (string) columns paired with their number of distinct values.
# The builtin `object` is compared against because the `np.object` alias is
# no longer available in current NumPy releases.
obj_attr = [(col, count_categories(col)) for col in data if data[col].dtype == object]

# XXXX REMOVE THIS. THIS IS ONLY BECAUSE OF THE ASSUMPTION THAT AT THIS POINT WE DON'T HAVE NaN's
data = data.dropna()

# Turn every string column into a pandas 'category' column before encoding it
for attr, _ in obj_attr:
    data[attr] = data[attr].astype('category')

# Handle binary columns (Gender, Married, etc.): replace each by a 0/1 '<name>_Int' column
data['Gender_Int'] = data['Gender'].map({'Female': 0, 'Male': 1}).astype(int)
data['Voting_Time_Int'] = data['Voting_Time'].map({'By_16:00': 0, 'After_16:00': 1}).astype(int)

data = data.drop(['Gender', 'Voting_Time'], axis=1)

for attr in ['Married', 'Looking_at_poles_results', 'Financial_agenda_matters', 'Will_vote_only_large_party']:
    data[attr + '_Int'] = data[attr].map({'No': 0, 'Yes': 1}).astype(int)
    data = data.drop(attr, axis=1)

# Handle categorical columns and add one-hot vectors.
# The dummies are cast to int so that no bool/uint8 columns are left behind.
for attr in ['Most_Important_Issue', 'Main_transportation', 'Occupation']:
    one_hot = pd.get_dummies(data[attr], prefix=attr).astype(int)
    data = pd.concat([data, one_hot], axis=1)
    data = data.drop(attr, axis=1)

# Replace the 'Vote' labels by the floats 1.0..k (only when 'Vote' was a string column).
# k is taken from the categories present *after* dropna(), because the counts
# stored in obj_attr were computed before rows were dropped and may be larger.
if any(attr == 'Vote' for attr, _ in obj_attr):
    vote = data['Vote'].astype('category')
    vote_labels = list(range(1, len(vote.cat.categories) + 1))
    data['Vote'] = vote.cat.rename_categories(vote_labels).astype('float')

# Ordinal encoding of the age group
data['Age_group_Int'] = data['Age_group'].map({'Below_30': 0, '30-45': 1, '45_and_up': 2}).astype(int)
data = data.drop(['Age_group'], axis=1)



## Relief Algorithm

In [None]:
def find_closest(data,index,row):
    """Find the nearest same-class and nearest other-class rows to `row`.

    Distance is the squared Euclidean distance over every column of `data`
    except 'Vote'. Rows whose index label equals `index` are excluded, so the
    sample is never compared with itself.

    Parameters:
        data:  DataFrame with a 'Vote' column and numeric feature columns
               (assumed to be free of NaN -- TODO confirm with callers).
        index: index label of `row` inside `data`.
        row:   the sample, a Series indexed by the column names of `data`.

    Returns:
        (nearhit, nearmiss): the closest row with the same 'Vote' value and the
        closest row with a different one, each as a Series. Either is None when
        no such row exists. Ties go to the row that appears first in `data`.
    """
    feature_cols = [c for c in data.columns if c != 'Vote']
    others = data[data.index != index]
    if len(others) == 0:
        return None, None

    # All distances in one vectorised pass instead of a Python loop per row and column
    deltas = others[feature_cols].values.astype(float) - row[feature_cols].values.astype(float)
    dists = np.square(deltas).sum(axis=1)
    same_class = (others['Vote'] == row['Vote']).values

    def closest(mask):
        positions = np.flatnonzero(mask)
        if len(positions) == 0:
            return None
        # argmin returns the first minimum, matching a strict '<' scan
        return others.iloc[positions[np.argmin(dists[positions])]]

    return closest(same_class), closest(~same_class)
        

def relief(data, samples=0.2,tau=0):
    """Select features with the Relief algorithm.

    A random fraction `samples` of the rows is drawn. For each drawn row the
    nearest same-class row (near-hit) and nearest other-class row (near-miss)
    are looked up with find_closest, and every feature weight is updated by
    (x - nearmiss)^2 - (x - nearhit)^2.

    Parameters:
        data:    DataFrame with a 'Vote' column; all other columns are features.
        samples: fraction of rows to sample (passed to DataFrame.sample as frac).
        tau:     weight threshold; features with weight > tau are kept.

    Returns:
        List of the feature names whose final weight exceeds `tau`, in column
        order. The list is empty when no row was sampled.

    Progress (sample counter and current number of selected features) is
    printed for every sampled row.
    """
    features = [f for f in data.columns.values if f != 'Vote']
    #initialize the weights
    weights = dict.fromkeys(features, 0)
    #go over the samples
    for i, (index, row) in enumerate(data.sample(frac=samples).iterrows(), 1):
        print("i = %d" % i)
        #find nearest from class and nearest from outside class
        nearhit, nearmiss = find_closest(data, index, row)
        if nearhit is None or nearmiss is None:
            # no same-class or no other-class neighbour: this sample carries no information
            continue
        for f in features:
            #weights[f] = weights[f] + (xi-nearmiss(xi))^2 - (xi-nearhit(xi))^2
            weights[f] = weights[f] + (row[f] - nearmiss[f])**2 - (row[f] - nearhit[f])**2
        print(sum(1 for f in features if weights[f] > tau))
    # computed after the loop so the result is defined even when nothing was sampled
    return [f for f in features if weights[f] > tau]
    
# Run Relief on the rows without NaN, sampling a 0.001 fraction of them.
# The bare `attrs` expression below shows the selected feature names in the notebook.
attrs = relief(data.dropna(),0.001)
attrs

In [None]:
# Number of features Relief kept; note the value is not displayed because it is not the last expression of the cell.
len(attrs)
# Keep the Relief-selected features as a set; it is passed to bds() as `open_set` further down.
open_set = set(attrs)

## SFS

In [None]:
from sklearn.ensemble import RandomForestClassifier 
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn import metrics

def _sfs(df,learning_model,S=None,cur_max=0):
    if S is None:
        S = set(['Vote'])
    #print S
    max_feature = None
    for col in [c for c in data if c not in S]:
        S.add(col)
        noNaN = df[list(S)].dropna()
        train_data_X_noNaN = noNaN.drop(['Vote'], axis=1).values
        train_data_Y_noNaN = noNaN.Vote.values
        # Prepare train and test data using cross validation
        X_train_noNaN, X_test_noNaN, Y_train_noNaN, Y_test_noNaN = train_test_split(train_data_X_noNaN, 
                                                                                train_data_Y_noNaN)
        clf = learning_model.fit(X_train_noNaN, Y_train_noNaN)
        Y_pred_noNaN = clf.predict(X_test_noNaN)
        tmp = metrics.accuracy_score(Y_test_noNaN, Y_pred_noNaN)
        if(tmp > cur_max):
            cur_max = tmp
            max_feature = col
        S.remove(col)
    if (max_feature is not None):
        S.add(max_feature)
        #print cur_max
        S, cur_max = _sfs(df,learning_model,S,cur_max)
    return S, cur_max


def sfs(df,learning_model,iterations=1):
    """Run forward selection `iterations` times and keep the best result.

    Parameters:
        df:             dataframe, d-dimensional feature-set
        learning_model: classifier by which to measure predictive power,
                        higher = better
        iterations:     number of restarts; the learning_model may be random,
                        so the best subset out of all runs is picked

    Returns:
        (best_subset, best_score) where best_subset no longer contains 'Vote'.
    """
    # Start from the "nothing selected" result; a run replaces it only by scoring strictly higher
    winner = (set(['Vote']), 0)
    for _ in range(iterations):
        candidate = _sfs(df, learning_model)
        if candidate[1] > winner[1]:
            winner = candidate
    chosen, top_score = winner
    # 'Vote' is the label column, not a selected feature
    chosen.remove('Vote')
    return chosen, top_score

# Example usage 1
# Random forest with 15 trees, forward selection restarted 5 times; prints the chosen subset and its accuracy.
forest = RandomForestClassifier(n_estimators = 15)
S, accuracy = sfs(data,forest,5)
print S, accuracy

# Example usage 2
# Same search with an SVC (default parameters) and a single run; S and accuracy are overwritten.
svm = SVC()
S, accuracy = sfs(data,svm)
print S, accuracy

## Hybrid Feature Selection

In [None]:
from sklearn.ensemble import RandomForestClassifier 
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn import metrics

def _bds(df,learning_model,sfs_open=None):
    if open_set is None:
        sfs_open = set(df.columns.values)
        sfs_open.remove('Vote')
    sbs_open = set.copy(sfs_open)
    
    S_sfs = set()
    S_sbs = set.copy(sfs_open)
    
    S_max = None
    
    cur_max_forward = 0
    cur_max_backward = 0
    cur_max = 0
    
    train_data_Y_noNaN = df.dropna().Vote.values
    
    while S_sfs != S_sbs:
        
        max_feature = None
        min_feature = None
        changed = False
        
        print "#### FORWARD ####"
        print S_sfs
        #forward selection
        #find a feature in sfs_open and add it
        for f in sfs_open:
            S_sfs.add(f)
            noNaN = df[list(S_sfs)].dropna()
            
            train_data_X_noNaN = noNaN.values
            
            # Prepare train and test data using cross validation
            X_train_noNaN, X_test_noNaN, Y_train_noNaN, Y_test_noNaN = train_test_split(train_data_X_noNaN, 
                                                                                    train_data_Y_noNaN)
            clf = learning_model.fit(X_train_noNaN, Y_train_noNaN)
            Y_pred_noNaN = clf.predict(X_test_noNaN)
            tmp = metrics.accuracy_score(Y_test_noNaN, Y_pred_noNaN)
            
            #pick the best feature to add
            if max_feature is None:
                max_feature = f
            if tmp > cur_max_forward:
                cur_max_forward = tmp
                max_feature = f
            if tmp > cur_max:
                cur_max = tmp
                S_max = S_sfs
            S_sfs.remove(f)
        if (max_feature is not None):
            S_sfs.add(max_feature)
            #sbs can't remove feature selected by sfs
            sbs_open.remove(max_feature)
            sfs_open.remove(max_feature)
            
        print "#### BACKWARD ####"
        print S_sbs
        #backward selection
        for f in sbs_open:
            S_sbs.remove(f)
            noNaN = df[list(S_sbs)].dropna()
            
            train_data_X_noNaN = noNaN.values
            
            # Prepare train and test data using cross validation
            X_train_noNaN, X_test_noNaN, Y_train_noNaN, Y_test_noNaN = train_test_split(train_data_X_noNaN, 
                                                                                    train_data_Y_noNaN)
            clf = learning_model.fit(X_train_noNaN, Y_train_noNaN)
            Y_pred_noNaN = clf.predict(X_test_noNaN)
            tmp = metrics.accuracy_score(Y_test_noNaN, Y_pred_noNaN)
            
            #find least damaging feature to remove
            if min_feature is None:
                min_feature = f
            if(tmp > cur_max_backward):
                cur_max_backward = tmp
                min_feature = f
            if tmp > cur_max:
                cur_max = tmp
                S_max = S_sfs
            S_sbs.add(f)
        if (min_feature is not None):
            S_sbs.remove(min_feature)
            sfs_open.remove(min_feature)
            sbs_open.remove(min_feature)
    return S_max, cur_max


def bds(df,learning_model,iterations=1,open_set=None):
    """Run the bidirectional search `iterations` times and keep the best result.

    Parameters:
        df:             dataframe
        learning_model: classifier by which to measure predictive power,
                        higher = better
        iterations:     number of restarts; the learning_model may be random,
                        so the best subset out of all runs is picked
        open_set:       optional set of features the search is restricted to;
                        it is left untouched.

    Returns:
        (best_subset, best_score); (None, 0) when no run scored above 0.
    """
    best_score = 0
    best_subset = None
    #because the function is susceptible to local maximums,
    #we will run it with random restarts and take the best subset
    for _ in range(iterations):
        # Every restart gets its own copy: _bds may remove entries from the set
        # it is given, which would leave later restarts with a depleted set.
        candidates = None if open_set is None else set(open_set)
        S, score = _bds(df, learning_model, candidates)
        if score > best_score:
            best_score = score
            best_subset = S
    return best_subset, best_score

# Example usage 1
# Bidirectional search with a 15-tree random forest, one run, restricted to the
# features stored in the module-level `open_set`.
forest = RandomForestClassifier(n_estimators = 15)
S, accuracy = bds(data,forest,iterations=1,open_set=open_set)
print S, accuracy

# Example usage 2
# NOTE(review): this disabled example calls sfs(), not bds() -- presumably copied from the SFS cell; confirm before enabling.
#svm = SVC()
#S, accuracy = sfs(data,svm)
#print S, accuracy

In [None]:
# Show the feature subset currently bound to S
S

In [None]:
# Show the feature subset currently bound to S
S