In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# **What is the data**

In [None]:
data = pd.read_csv("../input/electionfinance/CandidateSummaryAction1.csv")

In [None]:
data.head()

In [None]:
data.shape


# Prepare data for analysis
1. drop all columns with above 90% missing value

In [None]:
#first, visualize missing values
import missingno as msn
msn.matrix(data)

In [None]:
#parse the coverage-period dates and derive the campaign length in days

for date_col in ('cov_sta_dat', 'cov_end_dat'):
    data[date_col] = pd.to_datetime(data[date_col])

# campaign_duration = whole days between coverage start and coverage end
coverage_span = data['cov_end_dat'] - data['cov_sta_dat']
data['campaign_duration'] = coverage_span.dt.days

**Observation** 
1. data contains messy features 
2. create a function to deal with features with high percentage of missing data

In [None]:
#drop every column whose percentage of missing values reaches a threshold (e.g. 90%)

def process_missing_data(data, threshold, inplace_value):
    """Drop the columns of ``data`` whose missing-value percentage is >= ``threshold``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect.
    threshold : float
        Percentage (0-100). A column is dropped when its share of missing
        values is greater than or equal to this number.
    inplace_value : bool
        True  -> the columns are removed from ``data`` itself.
        False -> ``data`` is left untouched and the shape reported is that of
        a reduced copy.

    Returns
    -------
    tuple
        ``(rows, columns)`` of the frame after the columns were dropped.
    """
    #percentage of missing values per column, indexed by column name
    missing_percentage = (data.isna().sum() / len(data)) * 100

    #columns that meet or exceed the threshold
    drop_cols = missing_percentage[missing_percentage >= threshold].index.tolist()

    if inplace_value:
        data.drop(columns=drop_cols, inplace=True)
        return data.shape

    #not in place: drop() returns a new frame, so report the shape of that result
    #instead of the unchanged input
    return data.drop(columns=drop_cols).shape


In [None]:
#call function on data
process_missing_data(data=data, threshold=90, inplace_value=True)

# **Explore the data**

**First, we see what offices the candidates are campaigning for**


In [None]:
data['can_off'].value_counts(normalize=True, sort=True) * 100

**Basically, there are three offices namely**
* H : House of Representatives
* S : senator
* P : presidency

**create three dataframes based on the candidate office**

In [None]:
#first, convert a currency-formatted text column (here net_ope_exp) to a float data type and modify data inplace
def converter(data, data_col):
    """Convert a currency-formatted text column of ``data`` to float32, in place.

    ``$`` and ``,`` are stripped and accounting-style negatives such as
    ``(1,234.00)`` become ``-1234.00``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the column; it is modified in place.
    data_col : str
        Name of the column to convert.

    Returns
    -------
    pandas.DataFrame
        ``data.head()`` after the conversion, for a quick visual check.
    """
    column = data[data_col]

    # Re-running the cell must not fail: a column that is already numeric has no
    # .str accessor, so only the text form goes through the clean-up.
    if not pd.api.types.is_numeric_dtype(column):
        # regex=False: '$', '(' and ')' are regex metacharacters, so treat every
        # pattern as a literal regardless of the installed pandas version's default.
        column = (
            column.str.replace('$', '', regex=False)
            .str.replace(',', '', regex=False)
            .str.replace('(', '-', regex=False)
            .str.replace(')', '', regex=False)
        )

    data[data_col] = column.astype('float32')
    return data.head()

#call the function on the net_ope_exp feature (converted in place on `data`)
converter(data=data, data_col="net_ope_exp")

**replace all the NaN values in the winner column with N, as they represent the losers**

In [None]:
data['winner'] = data['winner'].fillna('N')

**inference**
1. we are making use of the net_ope_exp (net operating expenditure) feature, as this is the feature that represents the total expenses of each candidate leading up to the election period

In [None]:
H_df = data.loc[data['can_off'] == "H"] 
S_df = data.loc[data['can_off'] == "S"]
P_df = data.loc[data['can_off'] == "P"]

In [None]:
#check the shape of the data

print(f'The shape of the House of assembly data is {H_df.shape}')
print(f'The shape of the senate data is {S_df.shape}')
print(f'The shape of the presidential data is {P_df.shape}')

# let's start by analyzing the house of representative data

**Since we are interested in the finances of the campaign, lets see how much is spent on campaign in each district in a state**


In [None]:
Amt_per_sta_ds = H_df.groupby(['can_off_sta', 'can_off_dis'])['net_ope_exp'].sum().to_frame(name = "total_dis_sum").reset_index()

In [None]:
Amt_per_sta_ds.head()

In [None]:
#visualize the state with high spending

plt.figure(figsize=(20,10))

# Amt_per_sta_ds holds one row per (state, district). With several rows per state,
# seaborn's default estimator draws the MEAN of the district totals for each state
# (with an error bar), not the state-wide sum - the axis label says so explicitly.
ax = sns.barplot(x="can_off_sta", y="total_dis_sum", data=Amt_per_sta_ds)
ax.set(
    xlabel="state (can_off_sta)",
    ylabel="mean district total of net_ope_exp",
    title="House campaigns: average district spending per state",
)

We can clearly see that the state with the highest House spending per district is MT (for each state the bar plot shows the mean of its district totals, and MT has a single district). Let's now analyze this state further.

In [None]:
mt_comp = H_df.loc[H_df['can_off_sta'] == 'MT']
mt_comp

In [None]:
ax = sns.barplot(x='can_nam', y='net_ope_exp', hue = 'winner',data=mt_comp)

**Observation**
1. **we can see the state has just one district with only two competitors, yet they have the highest rate of spending. This can be due to various reasons that are sadly not contained in the data. Suggestions include:**
    * The cost of getting things done in that state is relatively high in comparison to other states
    * Being in direct competition creates just one collision point for the candidates 
2. **we can also see that the candidate with the highest spending won the election**


    

we can clearly see that for state MT, with just one district, the highest spender won the vote; however, before we conclude, let's check how long each candidate campaigned for.
* the reason behind this is that time plays a role in the maturity of an investment. So, assuming the campaign is the investment and net_ope_exp is the invested capital, how long will it take for that investment to mature?
* naturally, the longer you keep your investment, the higher your profit gets; therefore, it should hold that the longer you campaign, the more likely you are to win. We check whether this assumption holds.

In [None]:
mt_comp

In [None]:
ax = sns.barplot(x='can_nam', y='campaign_duration',hue='winner', data=mt_comp)

**Observation**
* The investment assumption holds

**Next, we seek to find out the number of candidates vying for a seat in each district (competitors) in general**

In [None]:
competitors = H_df.groupby(['can_off_sta', 'can_off_dis'])['can_id'].count().to_frame(name = "num_of_comp").reset_index()
#eliminate data points where num_of_comp <= 1
#this means that these positions are unopposed
competitors = competitors[competitors['num_of_comp'] > 1]

In [None]:
competitors.head()

In [None]:
plt.figure(figsize=(20,10))

ax = sns.barplot(x="can_off_dis", y="num_of_comp", data=competitors)

**Observation**
* here we can see every district represented in our dataset clearly

### Before generalizing, let's pick a state at random and test the result of our early analysis

In [None]:
al_comp = H_df.loc[H_df['can_off_sta'] == 'AL']
al_comp.shape

In [None]:
plt.figure(figsize=(20,10))
ax = sns.barplot(x='can_id', y='net_ope_exp', hue = 'winner',data=al_comp)

**Inference**
* assumption on the relationship between how much a candidate spends on election and winning holds
* visualize to see what the range of highest amount spent is

In [None]:
sns.set(rc={'figure.figsize':(12,8)})
sns.set(style="white", color_codes=True)
sns.jointplot(x=H_df["net_ope_exp"], y=H_df["votes"], kind='kde', color="skyblue")

**by this inspection, we can see the concentration of where the most money is spent and the expected voters**

# Having explored the House of Representatives dataset, we move on to explore senatorial campaigns

In [None]:
S_df.head()

**Insights**
* **Article I, section 3 of the USA Constitution states that** 
* The Senate of the United States shall be composed of two Senators from each State, chosen by the Legislature thereof, for six Years; and each Senator shall have one Vote. Immediately after they shall be assembled in Consequence of the first Election, they shall be divided as equally as may be into three Classes.

**Focus**
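* our aim is to check for a relationship between campaign finance and voting turnout, so we proceed along that line
* by research, the votes column of the dataframe should be empty, as the election is not open to public voting (note: the Seventeenth Amendment, ratified in 1913, replaced selection by state legislatures with direct popular election of senators, so this assumption must be verified against the data)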

**inference**
* check to ascertain second focus

In [None]:
# prf1: number of MISSING vote values; prf2: total number of Senate rows.
# isna().sum() counts the True entries; isna().count() would always equal the row
# count (it also counts the False entries), which made the comparison tautological.
prf1 = S_df['votes'].isna().sum()
prf2 =  len(S_df['votes'])

print(prf1)
print(prf2)

**Observation**
* research holds so we proceed to drop the votes feature in our dataframe


In [None]:
# S_df is a row subset of `data`; rebinding to the result of drop() (instead of
# inplace=True) avoids SettingWithCopyWarning, and errors='ignore' keeps the cell
# re-runnable once the column is already gone.
S_df = S_df.drop(columns='votes', errors='ignore')

In [None]:
S_df.head()

In [None]:
#first, we check amount spent per state 
Amt_per_sta_ds = S_df.groupby(['can_off_sta', 'can_off_dis'])['net_ope_exp'].sum().to_frame(name = "total_dis_sum").reset_index()

In [None]:
Amt_per_sta_ds.head()

In [None]:
#visualize to see the highest spending state

# plt.figure (lower case) creates and activates the figure that the bar plot is
# drawn on; plt.Figure only builds a detached object, so its figsize was never applied.
plt.figure(figsize=(20,10))
ax = sns.barplot(x='can_off_sta', y='total_dis_sum', data=Amt_per_sta_ds)

**Observation**
* three states show high total money spent which are **FL, PA and WI**
* analyze the winners of these three states to observe trends

In [None]:
#create the three dataframes, one per state examined below
# NOTE(review): the markdown above names WI as the third high-spending state,
# but NV is selected here - confirm which one is intended.
fl_comp, pa_comp, nv_comp = (
    S_df.loc[S_df['can_off_sta'] == state_code] for state_code in ('FL', 'PA', 'NV')
)

In [None]:
#check winners in FL

ax = sns.barplot(x='can_nam', y='net_ope_exp', hue='winner', data=fl_comp)
plt.xticks(
    rotation=45, 
    horizontalalignment='right',
    fontweight='light',  
)

**Observation**
* the highest spender wasn't the winner, to check why this is happening, we inspect with our investment analogy and check how long they have been campaigning for


In [None]:
fl_comp.head()

In [None]:
ax = sns.barplot(x='can_nam', y='campaign_duration', hue='winner', data=fl_comp)
plt.xticks(
    rotation=45, 
    horizontalalignment='right',
    fontweight='light',  
)

**Observation**
* our investment analogy still comes up short; next, we check the party affiliations of the candidates in this state

In [None]:
ax = sns.barplot(x='can_nam', y='net_ope_exp', hue='can_par_aff', data=fl_comp)
plt.xticks(
    rotation=45, 
    horizontalalignment='right',
    fontweight='light',  
)

**observation**
* we can see a variation in their party affiliations , lets check if they are from the majority or minority party for that state


In [None]:
fl_comp['can_par_aff'].value_counts(normalize=True, sort=True).plot()

**Observation**
* we have our inlier insight as we can see, the party affiliation plays a big role in the voting outcome

In [None]:
#check winners in PA

ax = sns.barplot(x='can_nam', y='net_ope_exp', hue='winner', data=pa_comp)
plt.xticks(
    rotation=45, 
    horizontalalignment='right',
    fontweight='light',  
)

* here we can see that, right off the bat, the candidate with the highest spending won the election; we will test the other assumptions of investment time and party affiliation next


In [None]:

# 'seaborn-white' was renamed 'seaborn-v0_8-white' in matplotlib 3.6 and the old
# name is absent from newer releases, so use whichever name this install provides
white_style = 'seaborn-v0_8-white' if 'seaborn-v0_8-white' in plt.style.available else 'seaborn-white'
plt.style.use(white_style)

# left panel: campaign duration per PA candidate, coloured by election outcome
plt.subplot(121)
ax = sns.barplot(x='can_nam', y='campaign_duration', hue='winner', data=pa_comp)
plt.xticks(
    rotation=45, 
    horizontalalignment='right',
    fontweight='light',  
)
plt.title("campaign duration")


# right panel: share of PA candidates per party affiliation
plt.subplot(122)
pa_comp['can_par_aff'].value_counts(normalize=True, sort=True).plot()
plt.xticks(
    rotation=45, 
    horizontalalignment='right',
    fontweight='light',  
)
plt.title("dominant party")




In [None]:

# spending per PA candidate, coloured by party affiliation
ax = sns.barplot(x='can_nam', y='net_ope_exp', hue='can_par_aff', data=pa_comp)
plt.xticks(
    rotation=45, 
    horizontalalignment='right',
    fontweight='light',  
)
# the y axis is net_ope_exp, not campaign_duration - the previous title was a copy-paste leftover
plt.title("net operating expenditure by party affiliation")


**Inference**
* assumption fails. this can be a function of the coefficient of that variable 

In [None]:
#check winners in NV (the plot below uses nv_comp)

ax = sns.barplot(x='can_nam', y='net_ope_exp', hue='winner', data=nv_comp)
plt.xticks(
    rotation=45, 
    horizontalalignment='right',
    fontweight='light',  
)

**Observation**
* holds to our first assumption, as we can see
* test investment and party assumption

In [None]:
# 'seaborn-white' was renamed 'seaborn-v0_8-white' in matplotlib 3.6 and the old
# name is absent from newer releases, so use whichever name this install provides
white_style = 'seaborn-v0_8-white' if 'seaborn-v0_8-white' in plt.style.available else 'seaborn-white'
plt.style.use(white_style)

# left panel: campaign duration per NV candidate, coloured by election outcome
plt.subplot(121)
ax = sns.barplot(x='can_nam', y='campaign_duration', hue='winner', data=nv_comp)
plt.xticks(
    rotation=45, 
    horizontalalignment='right',
    fontweight='light',  
)
plt.title("campaign duration")


# right panel: share of NV candidates per party affiliation
plt.subplot(122)
nv_comp['can_par_aff'].value_counts(normalize=True, sort=True).plot()
plt.xticks(
    rotation=45, 
    horizontalalignment='right',
    fontweight='light',  
)
plt.title("dominant party")


In [None]:
# spending per NV candidate, coloured by party affiliation
ax = sns.barplot(x='can_nam', y='net_ope_exp', hue='can_par_aff', data=nv_comp)
plt.xticks(
    rotation=45, 
    horizontalalignment='right',
    fontweight='light',  
)
# the y axis is net_ope_exp, not campaign_duration - the previous title was a copy-paste leftover
plt.title("net operating expenditure by party affiliation")

# Presidential Data Analysis

In [None]:
P_df.head()

In [None]:
P_df.shape

In [None]:
#create dataframe grouped by total amount spent
Amt_per_sta_ds = P_df.groupby(['can_nam', 'winner', 'can_par_aff', 'campaign_duration'])['net_ope_exp'].sum().to_frame(name = "total_dis_sum")

In [None]:
#sort result
Amt_per_sta_ds = Amt_per_sta_ds.sort_values(by = ['total_dis_sum'], ascending=False).reset_index()

In [None]:
#create visualization to reach an assumption on which section of the data points could be candidates to win the election
Amt_per_sta_ds['total_dis_sum'].plot()


In [None]:
# select first 20 data points as candidates
Amt_per_sta_ds = Amt_per_sta_ds.iloc[:20, :]

In [None]:
Amt_per_sta_ds

In [None]:
#check winner
ax = sns.barplot(x='can_nam', y = 'total_dis_sum', hue='winner', data=Amt_per_sta_ds)
plt.xticks(
    rotation=45, 
    horizontalalignment='right',
    fontweight='light',  
)


**observation**
* first assumption did not hold, lets find if our data has the ability to give insight to why this is by analyzing the campaign duration and party affiliation

In [None]:
# 'seaborn-white' was renamed 'seaborn-v0_8-white' in matplotlib 3.6 and the old
# name is absent from newer releases, so use whichever name this install provides
white_style = 'seaborn-v0_8-white' if 'seaborn-v0_8-white' in plt.style.available else 'seaborn-white'
plt.style.use(white_style)

# left panel: campaign duration of the selected presidential candidates, coloured by outcome
plt.subplot(121)
ax = sns.barplot(x='can_nam', y='campaign_duration', hue='winner', data=Amt_per_sta_ds)
plt.xticks(
    rotation=45, 
    horizontalalignment='right',
    fontweight='light',  
)
plt.title("campaign duration")


# right panel: share of the selected candidates per party affiliation
plt.subplot(122)
Amt_per_sta_ds['can_par_aff'].value_counts(normalize=True, sort=True).plot()
plt.xticks(
    rotation=45, 
    horizontalalignment='right',
    fontweight='light',  
)
plt.title("dominant party")

In [None]:
# total spending of the selected presidential candidates, coloured by party affiliation
ax = sns.barplot(x='can_nam', y='total_dis_sum', hue='can_par_aff', data=Amt_per_sta_ds)
plt.xticks(
    rotation=45, 
    horizontalalignment='right',
    fontweight='light',  
)
# the y axis is total_dis_sum (summed net_ope_exp), not campaign_duration - the previous title was a copy-paste leftover
plt.title("total net operating expenditure by party affiliation")

**Observation**
* MYSTERY SOLVED!!! Party affiliations solved the problem

# MODEL BUILDING

* The features that we've analyzed to matter include
    * net_ope_exp
    * winner
    * votes
    * can_par_aff
    * can_off
    * can_off_dis
    * can_off_sta
    * can_inc_cha_ope_sea
    * campaign_duration
    
* create two dataframes for classification and regression tasks
    * create two subframes from the original frames for granularity of prediction
        * Regression_data
            * H_model_data_reg
            * P_model_data_reg
            * S_model_data_reg
        * Classification_data
            * H_model_data_cla
            * P_model_data_cla
            * S_model_data_cla


In [None]:
# .copy() gives each modelling frame its own data: the column assignments made
# on these frames later would otherwise act on a slice of `data` and raise
# SettingWithCopyWarning.

#create regression data (target: votes, kept as the last column)
Regression_data = data[['can_off', 'can_off_sta', 'can_off_dis', 'can_inc_cha_ope_sea', 'net_ope_exp', 'can_par_aff','campaign_duration','votes']].copy()


#create classification data (target: winner, kept as the last column)
Classification_data = data[['can_off', 'can_off_sta', 'can_off_dis', 'can_inc_cha_ope_sea', 'net_ope_exp', 'can_par_aff','campaign_duration','winner']].copy()


# **Check regression analysis possibility**

In [None]:
Regression_data.isna().sum()/len(Regression_data)

**Inference**
* approximately 80% of the votes data is missing, this will make regression analysis inaccurate, therefore, we will not look into extracting data for regression analysis

# Proceed to check classification possibility

In [None]:
Classification_data.isna().sum()

**Handle missing data in classification data**

In [None]:
from sklearn.impute import SimpleImputer

imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
imp_mode = SimpleImputer(missing_values=np.nan, strategy='most_frequent')

# work on an explicit copy so the column assignments below never target a slice of `data`
Classification_data = Classification_data.copy()

# fill these columns with their most frequent value; fit_transform returns an
# (n, 1) array, so flatten it before assigning it back as a single column
for mode_col in ['can_off_dis', 'can_inc_cha_ope_sea', 'can_par_aff']:
    Classification_data[mode_col] = np.asarray(
        imp_mode.fit_transform(Classification_data[[mode_col]])
    ).ravel()

# A huge negative sentinel (-1e23) would dominate MinMaxScaler's range and squash
# every real expenditure to ~1.0, so missing amounts get the column median instead.
# NOTE(review): if a missing net_ope_exp actually means "no spending", fill with 0.
Classification_data['net_ope_exp'] = Classification_data['net_ope_exp'].fillna(
    Classification_data['net_ope_exp'].median()
)

In [None]:
Classification_data.isna().sum()

remove a single uninformative data point that affects the pipeline

In [None]:
Classification_data = Classification_data[Classification_data.can_par_aff != 'PPT']

In [None]:
#make respective dataframes
H_model_data_cla = Classification_data.loc[Classification_data['can_off'] == 'H']
P_model_data_cla = Classification_data.loc[Classification_data['can_off'] == 'P']
S_model_data_cla = Classification_data.loc[Classification_data['can_off'] == 'S']

# Build classification pipeline

In [None]:
from sklearn.model_selection import train_test_split

# features: every column except the target (selected by name rather than by position,
# so the split keeps working if the column order changes)
X = H_model_data_cla.drop(columns='winner')

# target as integer codes in sorted label order ('N' -> 0, the other label -> 1).
# NOTE(review): recent XGBoost releases appear to reject string class labels in
# XGBClassifier.fit and expect 0..n_classes-1 - confirm against the installed version.
y = H_model_data_cla['winner'].astype('category').cat.codes

In [None]:
# determine categorical and numerical features

# include='number' matches every numeric dtype; the previous ['int64', 'float64']
# list skipped float32 columns, and net_ope_exp is cast to float32 by converter(),
# so the spending feature was silently dropped by the ColumnTransformer.
numerical_ix = X.select_dtypes(include='number').columns
categorical_ix = X.select_dtypes(include=['object', 'bool']).columns


# define the data preparation for the columns
# handle_unknown='ignore': a category that only occurs in the validation split is
# encoded as all zeros instead of raising at predict time
t = [('cat', OneHotEncoder(handle_unknown='ignore'), categorical_ix), ('num', MinMaxScaler(), numerical_ix)]
col_transform = ColumnTransformer(transformers=t)

In [None]:
# define the base model

model = XGBClassifier(min_child_weight=100, learning_rate=0.1)

# chain the data preparation and the base model into a single pipeline
steps = [('prep', col_transform), ('m', model)]
pipeline = Pipeline(steps=steps)

## Test pipeline and base model on House of rep data

In [None]:
X.isna().sum()

In [None]:
#divide data into train and test split

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = 0.2, random_state=0)

In [None]:
pipeline.fit(X_train, y_train)

In [None]:
y_pred = pipeline.predict(X_val)

In [None]:
print(confusion_matrix(y_true=y_val, y_pred=y_pred))
print(f'accuracy of the base model on house of rep election is {accuracy_score(y_val, y_pred) * 100}%')

# Create function to test different models

In [None]:
def test_model_(models_dict, X_train, y_train, X_val, y_val):
    """
    a function that takes in a dictionary of models along with train and test data
    to calculate the f1_score and accuracy score of the built pipeline then return a dataframe as the output
    
    """
    metrics = {}
    for i in models_dict:
        model_name = str(i)
        model = models_dict[i]
        
        pipeline = Pipeline(steps=[('prep',col_transform), ('m', model) ])
        pipeline.fit(X_train, y_train)
        test_pred = pipeline.predict(X_val)
        metric_1 = accuracy_score(y_val, test_pred) * 100
        metric_2 = f1_score(y_val, test_pred, average='weighted')
        metrics[i] = metric_1, metric_2
        
    metrics_df = pd.DataFrame.from_dict(metrics, orient='index', columns=['Accuracy score', 'f1_score'])
    return metrics_df
        
        


In [None]:
#create a dictionary of classification models
# random_state is fixed on the stochastic models (same seed as the train/test split)
# so the comparison table is reproducible across runs
candidate_models = {'xgboost':XGBClassifier(random_state=0), 'log_reg': LogisticRegression(), 'svm':SVC(), 'random forest': RandomForestClassifier(random_state=0) }

#call the test_model_ function
test_model_(candidate_models, X_train, y_train, X_val, y_val)

## kindly upvote or comment, which ever you feel obliged to do 
## Also feel free to copy and reuse as you wish