In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

from collections import Counter

from itertools import combinations

from sklearn.ensemble import IsolationForest

pd.set_option('display.max_columns', 100) # to display all columns at all time
pd.options.mode.chained_assignment = None # to ignore false-positive warnings about chained assignments

In [2]:
data = pd.read_csv('ElectionsData.csv', header=0)

In [3]:
def count_categories(attr):
    return len(data[attr].astype('category').cat.categories)

obj_attr = [(col, count_categories(col))  for col in data if data[col].dtype==np.object]
obj_attr_names = map(lambda x: x[0], obj_attr)
for attr, cnt in obj_attr:
    print "%-30s %5d %s" % (attr,cnt, "BINARY" if cnt==2 else "")

Vote                              10 
Most_Important_Issue               8 
Looking_at_poles_results           2 BINARY
Married                            2 BINARY
Gender                             2 BINARY
Voting_Time                        2 BINARY
Will_vote_only_large_party         2 BINARY
Age_group                          3 
Main_transportation                4 
Occupation                         5 
Financial_agenda_matters           2 BINARY


In [4]:
# XXXX REMOVE THIS. THIS IS ONLY BECAUSE OF THE ASSUMPTION THAT AT THIS POINT WE DON'T HAVE NaN's
data = data.dropna()

# Handle binary columns (Gender, Married, etc.)
for attr,cnt in obj_attr:
        data[attr] = data[attr].astype('category')

data['Gender_Int'] = data['Gender'].map({'Female':0, 'Male':1}).astype(int)
data['Voting_Time_Int'] = data['Voting_Time'].map({'By_16:00':0, 'After_16:00':1}).astype(int)

data = data.drop(['Gender','Voting_Time'],axis=1)

for attr in ['Married','Looking_at_poles_results','Financial_agenda_matters','Will_vote_only_large_party']:
    data[attr+'_Int'] = data[attr].map({'No':0, 'Yes':1}).astype(int)
    data = data.drop(attr,axis=1)

# Handle categorical columns and add one-hot vectors
for attr in ['Most_Important_Issue','Main_transportation','Occupation']:
    data = pd.concat([data, pd.get_dummies(data[attr],prefix=attr)], axis=1)
    data = data.drop(attr,axis=1)

# For convenience, we want 'Vote_Int' to be at the beginning
for attr,cnt in obj_attr:
    if attr=='Vote':
        data[attr] = data[attr].astype('category').cat.rename_categories(range(1,cnt+1)).astype('float')

data['Age_group_Int'] = data['Age_group'].map({'Below_30':0, '30-45':1, '45_and_up':2}).astype(int)
data = data.drop(['Age_group'],axis=1)



In [5]:
data

Unnamed: 0,Vote,Occupation_Satisfaction,Avg_monthly_expense_when_under_age_21,AVG_lottary_expanses,Avg_Residancy_Altitude,Yearly_ExpensesK,Financial_balance_score_(0-1),%Of_Household_Income,Avg_government_satisfaction,Avg_education_importance,Avg_environmental_importance,Avg_Satisfaction_with_previous_vote,Avg_monthly_income_all_years,%Time_invested_in_work,Yearly_IncomeK,Avg_monthly_expense_on_pets_or_plants,Avg_monthly_household_cost,Phone_minutes_10_years,Avg_size_per_room,Weighted_education_rank,%_satisfaction_financial_policy,Garden_sqr_meter_per_person_in_residancy_area,Last_school_grades,Number_of_differnt_parties_voted_for,Political_interest_Total_Score,Number_of_valued_Kneset_members,Overall_happiness_score,Num_of_kids_born_last_10_years,Gender_Int,Voting_Time_Int,Married_Int,Looking_at_poles_results_Int,Financial_agenda_matters_Int,Will_vote_only_large_party_Int,Most_Important_Issue_Education,Most_Important_Issue_Environment,Most_Important_Issue_Financial,Most_Important_Issue_Foreign_Affairs,Most_Important_Issue_Healthcare,Most_Important_Issue_Military,Most_Important_Issue_Other,Most_Important_Issue_Social,Main_transportation_Car,Main_transportation_Foot_or_bicycle,Main_transportation_Motorcycle_or_truck,Main_transportation_Public_or_other,Occupation_Hightech,Occupation_Industry_or_other,Occupation_Public_Sector,Occupation_Services_or_Retail,Occupation_Student_or_Unemployed,Age_group_Int
0,10.0,9.0,901.074249,174664.625237,300.358083,581.521308,0.920727,0.100484,4.365483,2.161798,5.962218,6.106218,9204.0,53.981652,622.909384,7061.031848,80356.465721,338167.031649,24.958153,7.775210e+05,43.518864,90525.336049,100.0,1.0,1067.419550,14.0,897.297947,0.0,1,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,2
1,8.0,4.0,172.175470,23146.728293,57.391823,403.310559,0.502451,0.540302,3.046957,6.447056,1.486424,6.706155,6238.0,63.899160,710.857600,5454.820791,83550.459496,162659.407111,26.661913,1.011912e+06,45.259922,3361.213190,100.0,0.0,1111.093591,10.0,861.981391,0.0,0,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,1
2,9.0,3.0,408.262789,97198.945759,136.087596,714.238096,0.899025,0.053332,4.178738,1.761754,3.059453,9.383902,3912.0,29.626594,865.494572,8873.370105,100227.594707,510136.057819,29.419289,1.499705e+06,25.612407,18665.921495,40.0,1.0,1260.064170,20.0,819.390492,1.0,1,0,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,1
3,1.0,4.0,401.755874,52952.486970,133.918625,395.407936,0.778608,0.845698,3.218688,4.677225,3.427241,0.746178,4040.0,48.221850,562.297360,5078.674075,65151.482447,156347.435485,23.712810,6.333466e+05,39.306687,18078.116698,60.0,3.0,837.907521,5.0,584.598207,1.0,1,0,1,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1
4,1.0,5.0,210.935361,29798.270086,70.311787,423.801917,0.278861,0.654061,3.454708,0.886536,1.984614,1.858172,9276.0,78.740853,577.028056,5392.075284,67056.617613,179608.065018,24.021408,6.669518e+05,17.378412,5024.059163,80.0,2.0,859.308498,4.0,595.206111,1.0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,1
5,10.0,1.0,245.727791,34939.741843,81.909264,426.566426,0.956243,0.698686,0.175241,6.287463,3.886282,8.237665,4742.0,37.631324,520.626097,5306.916451,61232.104508,181958.915515,22.817232,5.430757e+05,3.325105,6801.036742,60.0,3.0,779.272032,1.0,536.103805,0.0,0,1,1,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0
6,5.0,3.0,176.199132,20881.681123,58.733044,355.535483,0.216134,0.425945,9.580903,6.754966,6.262171,1.593651,9754.0,16.757709,531.117152,4617.589132,65271.724476,126405.479608,23.045979,5.652641e+05,64.067057,3518.303489,100.0,1.0,877.433966,18.0,727.749963,0.0,1,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,2
7,4.0,4.0,658.354759,148173.767645,219.451586,675.200258,0.975477,0.066626,0.334810,4.534706,5.815745,0.664623,5166.0,50.794958,614.771717,7981.546012,75803.304283,455895.388013,24.794590,7.572321e+05,32.645866,48388.450329,60.0,0.0,950.013756,10.0,658.398370,5.0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0
8,2.0,8.0,1207.000512,152141.832727,402.333504,378.148554,0.789854,0.037650,3.168378,9.579997,2.597317,9.785732,7634.0,55.871403,790.938181,5363.361902,89437.082515,142996.328866,28.123623,1.252413e+06,94.871960,162284.581896,60.0,5.0,1178.648405,15.0,857.978374,2.0,1,1,1,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0
9,1.0,8.0,705.293705,145221.386551,235.097902,617.706009,0.554391,0.068890,7.529331,4.666677,9.170213,2.408138,6543.0,16.813587,673.995404,7525.050900,80049.900558,381560.713782,25.961421,9.098468e+05,73.582174,55516.121227,100.0,2.0,1008.133119,18.0,679.533308,1.0,0,1,0,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,2


In [6]:
outliers_fraction = 0.05

clf = IsolationForest(n_estimators=data.shape[0]/10, max_samples=0.5, contamination=outliers_fraction, n_jobs=-1)

clf.fit(data)
y_pred = clf.predict(data)
B = np.where(y_pred==-1)[0]
print B
print len(B)

[   8   33   34   43   45   54   79   85   96  100  109  125  144  164  182
  185  206  257  308  318  351  394  395  407  410  424  444  447  453  458
  459  467  474  492  497  508  549  571  575  583  584  602  616  638  664
  686  739  744  750  751  762  792  805  815  848  850  872  881  931  961
  965  991 1019 1058 1067 1088 1108 1162 1196 1202 1210 1250 1275 1318 1329
 1353 1370 1385 1396 1435 1437 1454 1497 1504 1550 1554 1561 1621 1646 1669
 1694 1799 1816 1820 1825 1841 1842 1848 1860 1893 1897 1920 1927 1987 1995
 2001 2115 2123 2126 2151 2165 2168 2201 2219 2249 2259 2312 2334 2346 2352
 2364 2405 2421 2425 2431 2438 2441 2446 2455 2456 2472 2502 2535 2542 2550
 2570 2620 2637 2699 2712 2723 2763 2854 2886 2895 2928 2930 2931 2941 2945
 2970 2984 2999 3000 3005 3017 3031 3039 3041 3061 3083 3104 3109 3123 3136
 3137 3163 3175 3182 3196 3230 3264 3274 3298 3337 3399 3403 3428 3433 3452
 3460 3494 3496 3547 3567 3597 3608 3612 3645 3652 3661 3693 3698 3759 3770
 3785 3787 3

## Normalization

In [7]:
from sklearn.preprocessing import scale
df_scaled = scale(data)

In [8]:
def count_categories(attr):
    return len(data[attr].astype('category').cat.categories)

attrs = [attr for attr in data if count_categories(attr) > 2 and attr != 'Vote']

#for attr in attrs:
#    data[attr] = scale(data[attr])

In [9]:
data

Unnamed: 0,Vote,Occupation_Satisfaction,Avg_monthly_expense_when_under_age_21,AVG_lottary_expanses,Avg_Residancy_Altitude,Yearly_ExpensesK,Financial_balance_score_(0-1),%Of_Household_Income,Avg_government_satisfaction,Avg_education_importance,Avg_environmental_importance,Avg_Satisfaction_with_previous_vote,Avg_monthly_income_all_years,%Time_invested_in_work,Yearly_IncomeK,Avg_monthly_expense_on_pets_or_plants,Avg_monthly_household_cost,Phone_minutes_10_years,Avg_size_per_room,Weighted_education_rank,%_satisfaction_financial_policy,Garden_sqr_meter_per_person_in_residancy_area,Last_school_grades,Number_of_differnt_parties_voted_for,Political_interest_Total_Score,Number_of_valued_Kneset_members,Overall_happiness_score,Num_of_kids_born_last_10_years,Gender_Int,Voting_Time_Int,Married_Int,Looking_at_poles_results_Int,Financial_agenda_matters_Int,Will_vote_only_large_party_Int,Most_Important_Issue_Education,Most_Important_Issue_Environment,Most_Important_Issue_Financial,Most_Important_Issue_Foreign_Affairs,Most_Important_Issue_Healthcare,Most_Important_Issue_Military,Most_Important_Issue_Other,Most_Important_Issue_Social,Main_transportation_Car,Main_transportation_Foot_or_bicycle,Main_transportation_Motorcycle_or_truck,Main_transportation_Public_or_other,Occupation_Hightech,Occupation_Industry_or_other,Occupation_Public_Sector,Occupation_Services_or_Retail,Occupation_Student_or_Unemployed,Age_group_Int
0,10.0,9.0,901.074249,174664.625237,300.358083,581.521308,0.920727,0.100484,4.365483,2.161798,5.962218,6.106218,9204.0,53.981652,622.909384,7061.031848,80356.465721,338167.031649,24.958153,7.775210e+05,43.518864,90525.336049,100.0,1.0,1067.419550,14.0,897.297947,0.0,1,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,2
1,8.0,4.0,172.175470,23146.728293,57.391823,403.310559,0.502451,0.540302,3.046957,6.447056,1.486424,6.706155,6238.0,63.899160,710.857600,5454.820791,83550.459496,162659.407111,26.661913,1.011912e+06,45.259922,3361.213190,100.0,0.0,1111.093591,10.0,861.981391,0.0,0,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,1
2,9.0,3.0,408.262789,97198.945759,136.087596,714.238096,0.899025,0.053332,4.178738,1.761754,3.059453,9.383902,3912.0,29.626594,865.494572,8873.370105,100227.594707,510136.057819,29.419289,1.499705e+06,25.612407,18665.921495,40.0,1.0,1260.064170,20.0,819.390492,1.0,1,0,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,1
3,1.0,4.0,401.755874,52952.486970,133.918625,395.407936,0.778608,0.845698,3.218688,4.677225,3.427241,0.746178,4040.0,48.221850,562.297360,5078.674075,65151.482447,156347.435485,23.712810,6.333466e+05,39.306687,18078.116698,60.0,3.0,837.907521,5.0,584.598207,1.0,1,0,1,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1
4,1.0,5.0,210.935361,29798.270086,70.311787,423.801917,0.278861,0.654061,3.454708,0.886536,1.984614,1.858172,9276.0,78.740853,577.028056,5392.075284,67056.617613,179608.065018,24.021408,6.669518e+05,17.378412,5024.059163,80.0,2.0,859.308498,4.0,595.206111,1.0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,1
5,10.0,1.0,245.727791,34939.741843,81.909264,426.566426,0.956243,0.698686,0.175241,6.287463,3.886282,8.237665,4742.0,37.631324,520.626097,5306.916451,61232.104508,181958.915515,22.817232,5.430757e+05,3.325105,6801.036742,60.0,3.0,779.272032,1.0,536.103805,0.0,0,1,1,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0
6,5.0,3.0,176.199132,20881.681123,58.733044,355.535483,0.216134,0.425945,9.580903,6.754966,6.262171,1.593651,9754.0,16.757709,531.117152,4617.589132,65271.724476,126405.479608,23.045979,5.652641e+05,64.067057,3518.303489,100.0,1.0,877.433966,18.0,727.749963,0.0,1,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,2
7,4.0,4.0,658.354759,148173.767645,219.451586,675.200258,0.975477,0.066626,0.334810,4.534706,5.815745,0.664623,5166.0,50.794958,614.771717,7981.546012,75803.304283,455895.388013,24.794590,7.572321e+05,32.645866,48388.450329,60.0,0.0,950.013756,10.0,658.398370,5.0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0
8,2.0,8.0,1207.000512,152141.832727,402.333504,378.148554,0.789854,0.037650,3.168378,9.579997,2.597317,9.785732,7634.0,55.871403,790.938181,5363.361902,89437.082515,142996.328866,28.123623,1.252413e+06,94.871960,162284.581896,60.0,5.0,1178.648405,15.0,857.978374,2.0,1,1,1,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0
9,1.0,8.0,705.293705,145221.386551,235.097902,617.706009,0.554391,0.068890,7.529331,4.666677,9.170213,2.408138,6543.0,16.813587,673.995404,7525.050900,80049.900558,381560.713782,25.961421,9.098468e+05,73.582174,55516.121227,100.0,2.0,1008.133119,18.0,679.533308,1.0,0,1,0,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,2


In [10]:
#S = set(['Vote','Overall_happiness_score', 'Looking_at_poles_results_Int', 'Phone_minutes_10_years', 'AVG_lottary_expanses', 'Political_interest_Total_Score', 'Avg_environmental_importance', 'Last_school_grades', 'Weighted_education_rank', 'Avg_monthly_expense_on_pets_or_plants'])
#accuracy = 0.866307541626

In [11]:
from sklearn.ensemble import RandomForestClassifier 
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn import metrics

noNaN = data.dropna()
train_data_X_noNaN = noNaN.drop(['Vote'], axis=1).values
train_data_Y_noNaN = noNaN.Vote.values



In [12]:
avg_tree = avg_svm = 0
for i in range(10):
    
    # Prepare train and test data using cross validation
    X_train_noNaN, X_test_noNaN, Y_train_noNaN, Y_test_noNaN = train_test_split(train_data_X_noNaN, 
                                                                        train_data_Y_noNaN)
    
    # Example usage 1
    forest = RandomForestClassifier(n_estimators = 10)
    clf = forest.fit(X_train_noNaN, Y_train_noNaN)
    Y_pred_noNaN = clf.predict(X_test_noNaN)
    avg_tree += metrics.accuracy_score(Y_test_noNaN, Y_pred_noNaN)

    # Example usage 2
    svm = SVC()
    clf = svm.fit(X_train_noNaN, Y_train_noNaN)
    Y_pred_noNaN = clf.predict(X_test_noNaN)
    avg_svm += metrics.accuracy_score(Y_test_noNaN, Y_pred_noNaN)
    
print avg_tree/10, avg_svm/10

0.851909892262 0.267482859941


## Remove outliers, just for now

In [13]:
df = data.drop(data.index[B])

df_noNaN = df[list(S)].dropna()
df_train_data_X_noNaN = df_noNaN.drop(['Vote'], axis=1).values
df_train_data_Y_noNaN = df_noNaN.Vote.values


NameError: name 'S' is not defined

In [None]:
avg_tree = avg_svm = 0
for i in range(10):
    
    # Prepare train and test data using cross validation
    df_X_train_noNaN, df_X_test_noNaN, df_Y_train_noNaN, df_Y_test_noNaN = train_test_split(df_train_data_X_noNaN, 
                                                                            df_train_data_Y_noNaN)

    # Example usage 1
    forest = RandomForestClassifier(n_estimators = 10)
    clf = forest.fit(df_X_train_noNaN, df_Y_train_noNaN)
    df_Y_pred_noNaN = clf.predict(df_X_test_noNaN)
    avg_tree += metrics.accuracy_score(df_Y_test_noNaN, df_Y_pred_noNaN)

    # Example usage 2
    svm = SVC()
    clf = svm.fit(df_X_train_noNaN, df_Y_train_noNaN)
    df_Y_pred_noNaN = clf.predict(df_X_test_noNaN)
    avg_svm += metrics.accuracy_score(df_Y_test_noNaN, df_Y_pred_noNaN)

print avg_tree/10, avg_svm/10