In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# **Put the data into DataFrames**

In [None]:
# Read the three source tables: referendum results, constituency statistics
# and constituency ethnicity figures.
result, con_data, con_eth = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/brexit-and-ethnicity/results.csv',
        '/kaggle/input/brexit-and-ethnicity/UK-constituency-data.csv',
        '/kaggle/input/brexit-and-ethnicity/Consituancy Ethnicity.csv',
    )
)

# Check the data and sort by constituency

In [None]:

# Report the row count, then order the results alphabetically by constituency name
print(len(result))
result = result.sort_values(by='Constituency')
result.head()

In [None]:
# Report the row count, then order the constituency data alphabetically by name
print(len(con_data))
con_data = con_data.sort_values(by='PCON14NM')
con_data.head()

In [None]:
# Report the row count, then order the ethnicity data alphabetically by constituency name
print(len(con_eth))
con_eth = con_eth.sort_values(by='ConstituencyName')
con_eth.head()

# There are 5 extra rows in con_data (the UK has 650 constituencies). Need to find these non-constituency rows and remove them

In [None]:
# Pull the constituency names out of each frame as plain lists so they can be compared
result_cons = result['Constituency'].tolist()
con_data_cons = con_data['PCON14NM'].tolist()
con_eth_cons = con_eth['ConstituencyName'].tolist()

In [None]:
# The extra entries are probably regions rather than constituencies. Compare only the
# first eight characters of each name, since some constituencies may be spelt
# differently between the two files.
short_cons = [entry[:8] for entry in result_cons]

# Print every con_data name whose eight-character prefix never appears in the results
for name in con_data_cons:
    if name[:8] not in short_cons:
        print(name)
 

In [None]:
# Remove the surplus country/UK-level rows in one pass
non_constituency_rows = ['Wales', 'UK', 'England', 'Scotland', 'Northern Ireland']
surplus_index = con_data.index[con_data['PCON14NM'].isin(non_constituency_rows)]
con_data.drop(surplus_index, inplace=True)

# Confirm the remaining row count
print(len(con_data))



# Quick check all Dataframes are aligned 

In [None]:
# Rebuild the name lists to check the constituencies line up across the three frames.
# Spellings may differ slightly between files, but each row should be recognisably the same place.
result_cons = result['Constituency'].tolist()
con_data_cons = con_data['PCON14NM'].tolist()
con_eth_cons = con_eth['ConstituencyName'].tolist()

# Every row was checked previously; print only every 20th row to keep the output short
for res, data, con in zip(result_cons[::20], con_data_cons[::20], con_eth_cons[::20]):
    print(res, data, con)

# Build the DataFrame by adding the useful data to results

In [None]:
# List the column labels available in the constituency data
con_data.columns

In [None]:
# Copy the columns of interest from con_data onto result.
# The values are assigned as plain lists, i.e. by position, so this relies on both
# frames already being sorted into the same constituency order.
for target_col, source_col in (('Salary', 'salary'),
                               ('nonukborn', 'nonukborn'),
                               ('degree', 'degree')):
    result[target_col] = con_data[source_col].to_list()

result.head()

In [None]:
# List the column labels available in the ethnicity data
con_eth.columns

In [None]:
# Add the 'PopWhiteConst%' figures to result as 'pop white'.
# The array is assigned by position, so row order must match between the two frames.
result['pop white'] = con_eth['PopWhiteConst%'].to_numpy()

result.head()


# Convert Leave and Remain from object to float64

In [None]:
# Inspect the column dtypes to see which columns still need converting to numbers
result.dtypes

In [None]:
def get_number(number):
    """Convert a percentage value such as "52.3%" to a float (52.3).

    Parameters
    ----------
    number : str or numeric
        A string with an optional trailing "%" (anything after the first "%"
        is discarded), or a value that is already numeric.

    Returns
    -------
    float
        The numeric value without the percent sign.

    Raises
    ------
    ValueError
        If the text before the "%" cannot be parsed as a float.
    """
    # Only strings carry a "%" suffix; accepting numeric input as well means the
    # conversion cell can be re-run on an already-converted column without crashing.
    if isinstance(number, str):
        number = number.split("%")[0]
    return float(number)

In [None]:
# Strip the "%" sign from the vote shares and turn them into floats
for vote_col in ('Leave', 'Remain'):
    result[vote_col] = result[vote_col].apply(get_number)

# Parse the remaining object columns as numbers, downcast to the smallest float type
for numeric_col in ('nonukborn', 'degree'):
    result[numeric_col] = pd.to_numeric(result[numeric_col], downcast="float")

result.head()

# Looking at spread of Leave votes 

In [None]:
# Histogram of the Leave vote share in equal 5-point bins from 20 to 80.
# The original edge list skipped 65, which merged 60-70 into one double-width bin
# and made that bar incomparable with its neighbours.
bins = list(range(20, 85, 5))

fig, ax = plt.subplots()
ax.hist(result.Leave, bins=bins, color='#3452eb')
ax.set_xlabel('Leave vote (%)')
ax.set_ylabel('Number of constituencies')
ax.set_title('Distribution of Leave vote share')
plt.show()

# Correlation of Leave vote % with ethnicity and with not being born in the UK

In [None]:
# For each ethnicity-related feature, print its correlation matrix with the
# Leave share and draw a scatter plot of the two.
ethnicity_columns = ['pop white', 'nonukborn']

for feature in ethnicity_columns:
    leave_vs_feature = result[['Leave', feature]]
    print(leave_vs_feature.corr())
    leave_vs_feature.plot.scatter(x=feature, y='Leave')
    
# 0.3 is considered the threshold for a correlation and this just about achieves it for non white. 
# The non UK born correlation, however, is stronger. 

In [None]:
# Repeat the correlation matrix and scatter plot for salary and degree
columns = ['Salary', 'degree']

for feature in columns:
    leave_vs_feature = result[['Leave', feature]]
    print(leave_vs_feature.corr())
    leave_vs_feature.plot.scatter(x=feature, y='Leave')

# Here the correlation is much stronger 

# Prepare  data for machine learning


In [None]:
# Work on a copy so the ML preparation does not alter `result`
remain_leave = result.copy()

# Remain share minus Leave share: zero or positive means Remain won the constituency
remain_leave['result'] = remain_leave['Remain'].sub(remain_leave['Leave'])

def remain_or_leave(x):
    """Label a Remain-minus-Leave margin: 'R' when it is zero or positive, otherwise 'L'."""
    return 'R' if x >= 0 else 'L'

# Replace each numeric margin with its 'R' / 'L' class label
remain_leave['result'] = remain_leave['result'].apply(remain_or_leave)


In [None]:
# Check: preview the first rows, including the 'result' column
remain_leave.head()

In [None]:
# Move the constituency name into the index so it is not treated as a model feature
remain_leave = remain_leave.set_index('Constituency')

In [None]:
# Target: the 'R' / 'L' label. Features: everything except the label and the
# vote shares it was derived from.
y_remain_leave = remain_leave['result']
X_remain_leave = remain_leave.drop(['result', 'Leave', 'Remain'], axis=1)

# Preliminary Classification Model using Decision Tree Classifier

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
# metrics module for the accuracy calculation
from sklearn import metrics
from sklearn.metrics import plot_confusion_matrix

# Hold out 20% of the constituencies for validation. The split is seeded so that
# the accuracies, plots and the alpha chosen further down are the same on every
# Restart & Run All instead of changing with each random split.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_remain_leave, y_remain_leave, train_size=0.8, test_size=0.2, random_state=42
)

# Create a decision tree model
clf = DecisionTreeClassifier(random_state=42)

# Train the model on the training set, then predict the validation set
clf.fit(X_train, y_train)
y_pred = clf.predict(X_valid)

# Model accuracy: how often is the classifier correct?
print("Accuracy:", metrics.accuracy_score(y_valid, y_pred))

# Side-by-side view of true and predicted labels for the first validation rows
df = pd.DataFrame(data={'true': y_valid, 'preds': y_pred})
print(df.head(10))

# NOTE(review): plot_confusion_matrix only exists in older scikit-learn releases;
# confirm the installed version still provides it.
plot_confusion_matrix(clf, X_valid, y_valid)

# initial accuracy is promising - try to improve with optimisation

# Plot the tree

In [None]:
 from sklearn import tree

# Draw the unpruned tree. The figure is bound to its own name: assigning it to
# `plt` would replace the pyplot module and break every later `plt.*` call.
fig = plt.figure(figsize=(10,10))
tree.plot_tree(clf,
         filled = True,
         rounded= True,
         class_names = ['L', 'R'],
         feature_names = X_remain_leave.columns)
plt.show()

# tree is very large. with the optimisation mentioned earlier we will prune this using alpha

# Prune the tree...

After watching 'Decision Trees in Python from Start to Finish' by Josh Starmer (well worth a watch on youtube) I have broken this part down into a similar way as it shows how the pruning is done very well. 

In [None]:
# Get the list of candidate alphas from the cost-complexity pruning path
path = clf.cost_complexity_pruning_path(X_train, y_train)

# Drop the last (largest) alpha. The original cell only displayed this slice and
# never stored it, so that alpha was still used by every later loop.
# NOTE(review): the largest alpha should correspond to the root-only tree - confirm.
ccp_alphas = path.ccp_alphas[:-1]
ccp_alphas

In [None]:
# Fit one pruned tree per candidate alpha; their accuracies are scored in the next cell
pruned_clfs = [
    DecisionTreeClassifier(ccp_alpha=alpha, random_state=42).fit(X_train, y_train)
    for alpha in ccp_alphas
]

In [None]:
# Show graphically how alpha affects accuracy on the train and validation data
import matplotlib.pyplot as plt

# Score every pruned tree on both data sets
train_scores = [model.score(X_train, y_train) for model in pruned_clfs]
test_scores = [model.score(X_valid, y_valid) for model in pruned_clfs]

fig, ax = plt.subplots(1,1)
ax.set_title('accuracy vs alphas for test and train data')
ax.set_xlabel('Alpha')
ax.set_ylabel('Acuracy')
for curve_scores, curve_label in ((train_scores, 'train'), (test_scores, 'test')):
    ax.plot(ccp_alphas, curve_scores, marker='o', label=curve_label, drawstyle='steps-post')
ax.legend()
plt.show()


In [None]:
# Is a single fixed alpha sensitive to which rows it is trained on?
# Score one candidate value (0.005) on 5 cross-validation folds and plot the spread.
from sklearn.model_selection import cross_val_score

# Seeded like the other trees in this notebook so the fold scores are repeatable
opti_clf = DecisionTreeClassifier(ccp_alpha = 0.005, random_state=42)
scores = cross_val_score(opti_clf, X_train, y_train, cv = 5)

df = pd.DataFrame(data = {'tree': range(5), 'accuracy': scores})
df.plot(x='tree', y='accuracy')

# very much so, lets use cross validation to eliminate

# Alpha is sensitive to data so need to cross validate 

In [None]:
# Cross-validate every candidate alpha (10 folds) and record the mean and
# standard deviation of the fold accuracies.
looped_alpha_values = []

for alpha in ccp_alphas:
    # Seeded like the other trees in this notebook so the scores are repeatable
    opti_clf = DecisionTreeClassifier(ccp_alpha = alpha, random_state=42)
    scores = cross_val_score(opti_clf, X_train, y_train, cv = 10)
    looped_alpha_values.append([alpha, np.mean(scores), np.std(scores)])

alpha_results = pd.DataFrame(looped_alpha_values, columns = ['alpha', 'mean accuracy', 'std'] )

# Mean accuracy per alpha, with the standard deviation drawn as error bars
alpha_results.plot(x='alpha',
                  y='mean accuracy',
                  yerr = 'std',
                  marker = 'o',
                  linestyle = '--')

In [None]:
# Pick the optimised alpha from the cross-validation results above

# Row labels of the two highest mean accuracies
top_two = alpha_results['mean accuracy'].nlargest(2)
alpha_one, alpha_two = (alpha_results.loc[label, 'alpha'] for label in top_two.index)

# The optimised alpha is the mid point between the alphas of those two rows
alpha_optimised = (alpha_one + alpha_two) / 2

print(alpha_optimised)



# Now we have the optimised alpha we can put into model 

In [None]:
# Create the pruned model, then display its confusion matrix and accuracy.
# Seeded like the other trees in this notebook so the result is repeatable.
optimised_tree = DecisionTreeClassifier(ccp_alpha = alpha_optimised, random_state=42)

optimised_tree = optimised_tree.fit(X_train,y_train)

plot_confusion_matrix(optimised_tree,
                     X_valid,
                     y_valid,
                     labels = ['R', 'L'])
print('accuracy =', optimised_tree.score(X_valid, y_valid))

# Model is much better at predicting Leave constituencies than Remain. This could be because more constituencies voted to leave, so the model
# has an inbuilt bias towards Leave

In [None]:
# Draw the pruned tree. Not surprisingly degree is at the top: it had the
# strongest correlation in the initial analysis.
plt.figure(figsize=(15,7.5))
tree.plot_tree(
    optimised_tree,
    feature_names=X_remain_leave.columns,
    class_names=['L', 'R'],
    filled=True,
    rounded=True,
)
plt.show()

# Importance of each feature


In [None]:
# Permutation importance of each feature on the validation set.
# It is computed on the pruned `optimised_tree`, the model analysed in this
# section, rather than on the unpruned baseline `clf`.
import eli5
from eli5.sklearn import PermutationImportance

perm = PermutationImportance(optimised_tree, random_state=1).fit(X_valid, y_valid)

eli5.show_weights(perm, feature_names = X_valid.columns.tolist())

In [None]:
# Partial dependence plot for every feature of the pruned tree. Degree again has
# the biggest impact; I'm not sure how to interpret the non white graph at the bottom.

from pdpbox import pdp, get_dataset, info_plots
feature_cols = X_train.columns.to_list()

for feature in feature_cols:
    # Isolate the effect of this one feature on the model's predictions, then plot it
    pdp_dist = pdp.pdp_isolate(model=optimised_tree, dataset=X_valid, model_features=feature_cols, feature=feature)
    pdp.pdp_plot(pdp_dist, feature)
    plt.show()