In [1]:
import pandas as pd
from pathlib import Path
from sklearn import tree
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.decomposition import PCA

from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report


## Setups

Importing libraries and stating data locations 

In [2]:
# Importing the remaining libraries
import pandas as pd
import numpy as np
from sklearn import preprocessing
from imblearn.metrics import classification_report_imbalanced
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import accuracy_score

from collections import Counter

In [3]:
# Relative paths to the training and testing CSV files
training_data, testing_data = (
    r"data/train_potus_by_county.csv",
    r"data/test_potus_by_county.csv",
)

## Reading in the data

In [4]:
# Load the training CSV into a DataFrame
df = pd.read_csv(filepath_or_buffer=training_data)

In [5]:
# Preview the first five rows (head() defaults to n=5)
df.head()

Unnamed: 0,Total population,Median age,% BachelorsDeg or higher,Unemployment rate,Per capita income,Total households,Average household size,% Owner occupied housing,% Renter occupied housing,% Vacant housing,Median home value,Population growth,House hold growth,Per capita income growth,Winner
0,9278,37.9,12.6,21.3,13992.0,3802,2.42,51.9,16.6,31.6,63959.0,-0.69,-0.49,0.71,Barack Obama
1,18594,36.3,9.7,14.3,14622.0,6764,2.55,63.7,16.2,20.1,74330.0,-0.13,0.03,0.85,Barack Obama
2,662628,37.9,27.9,12.1,23909.0,267862,2.41,57.0,28.8,14.2,112687.0,-0.09,0.0,0.55,Barack Obama
3,21292,38.9,14.1,15.7,16829.0,8547,2.47,63.5,17.1,19.4,73643.0,-0.59,-0.43,0.57,Barack Obama
4,13252,34.5,15.0,15.8,13012.0,5222,2.47,53.7,20.7,25.6,56642.0,-1.16,-1.03,0.69,Barack Obama


In [6]:
# Column dtypes and non-null counts; a non-null count below the number of
# entries would indicate missing values in that column
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1213 entries, 0 to 1212
Data columns (total 15 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Total population           1213 non-null   int64  
 1   Median age                 1213 non-null   float64
 2   % BachelorsDeg or higher   1213 non-null   float64
 3   Unemployment rate          1213 non-null   float64
 4   Per capita income          1213 non-null   float64
 5   Total households           1213 non-null   int64  
 6   Average household size     1213 non-null   float64
 7   % Owner occupied housing   1213 non-null   float64
 8   % Renter occupied housing  1213 non-null   float64
 9   % Vacant housing           1213 non-null   float64
 10  Median home value          1213 non-null   float64
 11  Population growth          1213 non-null   float64
 12  House hold growth          1213 non-null   float64
 13  Per capita income growth   1213 non-null   float

In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) per numeric column
df.describe()

Unnamed: 0,Total population,Median age,% BachelorsDeg or higher,Unemployment rate,Per capita income,Total households,Average household size,% Owner occupied housing,% Renter occupied housing,% Vacant housing,Median home value,Population growth,House hold growth,Per capita income growth
count,1213.0,1213.0,1213.0,1213.0,1213.0,1213.0,1213.0,1213.0,1213.0,1213.0,1213.0,1213.0,1213.0,1213.0
mean,101924.8,39.954493,19.18648,9.837593,21118.305853,38051.29,2.493817,60.339324,21.881946,17.779225,118892.893652,0.255754,0.340956,2.03066
std,365905.5,4.652347,8.405849,3.865796,5046.038916,124613.0,0.203625,8.929037,7.398402,10.125619,71060.359705,0.933166,0.939244,0.742566
min,324.0,24.5,6.4,1.0,7908.0,98.0,1.84,15.0,4.6,2.6,29622.0,-2.09,-2.12,0.05
25%,11629.0,37.3,13.4,7.4,17989.0,4555.0,2.36,55.8,17.3,10.8,78313.0,-0.36,-0.25,1.68
50%,25916.0,40.0,17.1,9.6,20265.0,10008.0,2.46,61.8,20.6,15.0,100381.0,0.07,0.16,2.13
75%,67430.0,42.8,22.2,12.1,23217.0,25830.0,2.58,66.2,25.0,21.9,135466.0,0.72,0.78,2.57
max,10240500.0,56.6,61.6,26.5,51818.0,3292577.0,3.71,81.9,74.2,65.9,815417.0,6.07,6.42,4.18


In [8]:
# Count the rows per winner to see how balanced the two classes are
df.groupby("Winner").size()

Winner
Barack Obama    264
Mitt Romney     949
dtype: int64

## Binary Encode the Winner column

In [9]:
# Learn the mapping from winner name to integer code
le = preprocessing.LabelEncoder()
le.fit(y=df['Winner'])

LabelEncoder()

In [10]:
# Replace the winner names with their integer codes.
# The guard keeps this cell idempotent: once the column is numeric, running
# the cell again would otherwise make le.transform raise on the already
# encoded values, which are not among the labels the encoder was fitted on.
if not pd.api.types.is_numeric_dtype(df['Winner']):
    df['Winner'] = le.transform(df['Winner'])

# Encoded value i corresponds to le.classes_[i]
le.classes_

array(['Barack Obama', 'Mitt Romney'], dtype=object)

In [11]:
# Confirm the Winner column now holds only the encoded integer labels
df['Winner'].unique()

array([0, 1])

## Performing a train test split

In [12]:
# Importing the preparation libraries
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import train_test_split

In [13]:
# Every column except the encoded label is used as a model feature
features_col = [name for name in df.columns if name != "Winner"]

# Convert the label and feature columns to NumPy arrays
y = np.array(df['Winner'])
X = np.array(df[features_col])

In [14]:
from sklearn.preprocessing import StandardScaler

# Standardise every feature column.
# NOTE(review): the scaler is fitted on the full X, i.e. before the
# train/test split that happens in a later cell.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# fit() returns the estimator itself, so X_scaler is the same fitted object
X_scaler = scaler

In [16]:
# Fit a 5-component PCA on the standardised features.
# NOTE(review): the fitted `pca` is not referenced again in the visible
# cells; confirm whether it is still needed.
pca = PCA(n_components=5)
pca.fit(X=X_scaled)

PCA(n_components=5)

In [14]:
# Stratified 75/25 train/test split; random_state pins the shuffle
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.75,
    stratify=y,
    random_state=1,
)

# The modelling cells read X_train_scaled / X_test_scaled, which no cell
# defined, so a fresh "Restart & Run All" stopped with a NameError.
# Fit the scaler on the training rows only, so that no test-set statistics
# leak into the features, then apply it to both partitions.
split_scaler = StandardScaler().fit(X_train)
X_train_scaled = split_scaler.transform(X_train)
X_test_scaled = split_scaler.transform(X_test)

In [25]:
# Compare several learning rates for an otherwise identical
# gradient-boosting model.
# NOTE(review): the rates are compared on the test split, which is also the
# split used for the final evaluation further down.
learning_rates = [0.05, 0.1, 0.25, 0.5, 0.75, 1]
for learning_rate in learning_rates:
    classifier = GradientBoostingClassifier(
        learning_rate=learning_rate,
        n_estimators=100,
        max_depth=3,
        max_features=2,
        random_state=0,
    )

    # Train on the scaled training partition
    classifier.fit(X_train_scaled, y_train.ravel())
    print("Learning rate: ", learning_rate)

    # Report mean accuracy on each partition
    train_accuracy = classifier.score(X_train_scaled, y_train.ravel())
    print("Accuracy score (train): {0:.3f}".format(train_accuracy))
    test_accuracy = classifier.score(X_test_scaled, y_test.ravel())
    print("Accuracy score (test): {0:.3f}".format(test_accuracy))
    print()

Learning rate:  0.05
Accuracy score (train): 0.915
Accuracy score (test): 0.819

Learning rate:  0.1
Accuracy score (train): 0.936
Accuracy score (test): 0.829

Learning rate:  0.25
Accuracy score (train): 0.981
Accuracy score (test): 0.826

Learning rate:  0.5
Accuracy score (train): 1.000
Accuracy score (test): 0.803

Learning rate:  0.75
Accuracy score (train): 1.000
Accuracy score (test): 0.796

Learning rate:  1
Accuracy score (train): 1.000
Accuracy score (test): 0.819



In [26]:
# Refit the model with learning_rate=0.1 on the scaled training partition
classifier = GradientBoostingClassifier(
    learning_rate=0.1,
    n_estimators=100,
    max_depth=3,
    max_features=2,
    random_state=0,
).fit(X_train_scaled, y_train.ravel())

# Predict the labels of the scaled test partition
predictions = classifier.predict(X_test_scaled)

# Show the first 20 predictions next to the true labels
pd.DataFrame(
    data={"Prediction": predictions, "Actual": y_test.ravel()}
).head(n=20)

Unnamed: 0,Prediction,Actual
0,1,1
1,1,1
2,1,1
3,1,1
4,1,0
5,1,1
6,0,0
7,1,0
8,1,1
9,1,1


In [27]:
# Fraction of test rows whose predicted label equals the true label
acc_score = accuracy_score(y_true=y_test, y_pred=predictions)
print(f"Accuracy Score : {acc_score}")

Accuracy Score : 0.8289473684210527


In [28]:
# Confusion matrix of true labels (rows) against predicted labels (columns)
cm = confusion_matrix(y_true=y_test, y_pred=predictions)
cm_df = pd.DataFrame(
    data=cm,
    columns=["Predicted 0", "Predicted 1"],
    index=["Actual 0", "Actual 1"],
)

# Render the labelled table
display(cm_df)

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,21,45
Actual 1,7,231


In [29]:
# Per-class precision, recall and F1 for the test-set predictions
print(
    "Classification Report",
    classification_report(y_true=y_test, y_pred=predictions),
    sep="\n",
)

Classification Report
              precision    recall  f1-score   support

           0       0.75      0.32      0.45        66
           1       0.84      0.97      0.90       238

    accuracy                           0.83       304
   macro avg       0.79      0.64      0.67       304
weighted avg       0.82      0.83      0.80       304

