## Setup

Import the required libraries and define the data file locations.

In [1]:
# importing the libraries
import pandas as pd
import numpy as np
from sklearn import preprocessing


In [2]:
# Relative locations of the two CSV files used by this notebook
training_data = "data/train_potus_by_county.csv"  # loaded into `df` below
testing_data = "data/test_potus_by_county.csv"    # not read in the cells shown here

## Reading in the data

In [3]:
# Load the training CSV into a DataFrame
df = pd.read_csv(filepath_or_buffer=training_data)

In [4]:
# Preview the first five rows to see which columns are available
df.head(n=5)

Unnamed: 0,Total population,Median age,% BachelorsDeg or higher,Unemployment rate,Per capita income,Total households,Average household size,% Owner occupied housing,% Renter occupied housing,% Vacant housing,Median home value,Population growth,House hold growth,Per capita income growth,Winner
0,9278,37.9,12.6,21.3,13992.0,3802,2.42,51.9,16.6,31.6,63959.0,-0.69,-0.49,0.71,Barack Obama
1,18594,36.3,9.7,14.3,14622.0,6764,2.55,63.7,16.2,20.1,74330.0,-0.13,0.03,0.85,Barack Obama
2,662628,37.9,27.9,12.1,23909.0,267862,2.41,57.0,28.8,14.2,112687.0,-0.09,0.0,0.55,Barack Obama
3,21292,38.9,14.1,15.7,16829.0,8547,2.47,63.5,17.1,19.4,73643.0,-0.59,-0.43,0.57,Barack Obama
4,13252,34.5,15.0,15.8,13012.0,5222,2.47,53.7,20.7,25.6,56642.0,-1.16,-1.03,0.69,Barack Obama


In [5]:
# Print each column's dtype and non-null count; a non-null count lower than
# the number of rows would indicate missing values in that column
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1213 entries, 0 to 1212
Data columns (total 15 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Total population           1213 non-null   int64  
 1   Median age                 1213 non-null   float64
 2   % BachelorsDeg or higher   1213 non-null   float64
 3   Unemployment rate          1213 non-null   float64
 4   Per capita income          1213 non-null   float64
 5   Total households           1213 non-null   int64  
 6   Average household size     1213 non-null   float64
 7   % Owner occupied housing   1213 non-null   float64
 8   % Renter occupied housing  1213 non-null   float64
 9   % Vacant housing           1213 non-null   float64
 10  Median home value          1213 non-null   float64
 11  Population growth          1213 non-null   float64
 12  House hold growth          1213 non-null   float64
 13  Per capita income growth   1213 non-null   float64

In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) for every
# numeric column, as a quick look at the value ranges and scales
df.describe()

Unnamed: 0,Total population,Median age,% BachelorsDeg or higher,Unemployment rate,Per capita income,Total households,Average household size,% Owner occupied housing,% Renter occupied housing,% Vacant housing,Median home value,Population growth,House hold growth,Per capita income growth
count,1213.0,1213.0,1213.0,1213.0,1213.0,1213.0,1213.0,1213.0,1213.0,1213.0,1213.0,1213.0,1213.0,1213.0
mean,101924.8,39.954493,19.18648,9.837593,21118.305853,38051.29,2.493817,60.339324,21.881946,17.779225,118892.893652,0.255754,0.340956,2.03066
std,365905.5,4.652347,8.405849,3.865796,5046.038916,124613.0,0.203625,8.929037,7.398402,10.125619,71060.359705,0.933166,0.939244,0.742566
min,324.0,24.5,6.4,1.0,7908.0,98.0,1.84,15.0,4.6,2.6,29622.0,-2.09,-2.12,0.05
25%,11629.0,37.3,13.4,7.4,17989.0,4555.0,2.36,55.8,17.3,10.8,78313.0,-0.36,-0.25,1.68
50%,25916.0,40.0,17.1,9.6,20265.0,10008.0,2.46,61.8,20.6,15.0,100381.0,0.07,0.16,2.13
75%,67430.0,42.8,22.2,12.1,23217.0,25830.0,2.58,66.2,25.0,21.9,135466.0,0.72,0.78,2.57
max,10240500.0,56.6,61.6,26.5,51818.0,3292577.0,3.71,81.9,74.2,65.9,815417.0,6.07,6.42,4.18


In [7]:
# Count the rows per winner to see how balanced the two classes are
df.groupby(by="Winner").size()

Winner
Barack Obama    264
Mitt Romney     949
dtype: int64

## Binary Encode the Winner column

In [8]:
# Binary-encode the Winner column: learn the label -> integer mapping here,
# then echo the fitted encoder as the cell output
le = preprocessing.LabelEncoder().fit(df["Winner"])
le

LabelEncoder()

In [9]:
# Replace the candidate names in 'Winner' with the encoder's integer codes.
# The guard keeps this cell idempotent: once the column already holds
# integers, calling le.transform again would raise ValueError because the
# codes are not among the fitted (string) classes, so a re-run must skip it.
if not pd.api.types.is_integer_dtype(df['Winner']):
    df['Winner'] = le.transform(df['Winner'])

# Show the fitted labels; each label's position in this array is its code
le.classes_

array(['Barack Obama', 'Mitt Romney'], dtype=object)

In [10]:
# Confirm that the Winner column now contains only the integer codes
pd.unique(df["Winner"])

array([0, 1])

## Performing a train test split

In [11]:
# Importing the preparation libraries
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import train_test_split

In [12]:
# Every column except the target is used as a model input
features_col = [column for column in df.columns if column != "Winner"]

# Pull the features and the labels out of the frame as NumPy arrays
X = np.array(df.loc[:, features_col])
y = np.array(df.loc[:, "Winner"])

In [13]:
# Hold out part of the data for evaluation. Stratifying on y keeps the class
# ratio the same in both partitions; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, shuffle=True, random_state=1
)

In [14]:
# # Setting up the Stratified Shuffle Split
# sss = StratifiedShuffleSplit(n_splits=7, test_size=0.5, random_state=0)
# scaler = StandardScaler()

In [15]:
# # Use the shuffle
# for train_index, test_index in sss.split(X, y):
#     X_train, X_test = X[train_index], X[test_index]
#     y_train, y_test = y[train_index], y[test_index]
#     print(len(y_train))
#     print(y_train.sum())
#     print(train_index[:5])
#     print('-'*20)

In [16]:
from imblearn.over_sampling import SMOTE

# Oversample the training split only, so the held-out split keeps its
# original class distribution. sampling_strategy=1.0 requests a 1:1
# minority-to-majority ratio after resampling.
oversampler = SMOTE(sampling_strategy=1.0, random_state=1)
X_resampled, y_resampled = oversampler.fit_resample(X_train, y_train)

In [21]:
from sklearn.preprocessing import StandardScaler

# Learn the per-feature scaling statistics from the oversampled training
# data, then echo the fitted scaler as the cell output
scaler = StandardScaler().fit(X_resampled)
scaler

StandardScaler()

In [22]:
# Apply the scaler fitted on X_resampled to that same data, giving the
# standardized version of the oversampled training features
X_resampled_standardized = scaler.transform(X_resampled)

In [23]:
# Number of training rows after oversampling and scaling
X_resampled_standardized.shape[0]

1422

In [24]:
from sklearn.svm import SVC
from imblearn.pipeline import make_pipeline

# Bundle oversampling, standardization and the linear SVM into one estimator.
#
# A bare SVC fit on the raw split would ignore the SMOTE/StandardScaler
# preparation done above and would see features on wildly different scales
# (population in the millions next to growth rates near zero), which a linear
# SVM handles poorly. With the pipeline, `model.fit(X_train, y_train)`
# oversamples and scales the training data before fitting the SVM, and
# `model.predict(X_test)` applies the same fitted scaling to new data. The
# imblearn pipeline runs the sampler during fit only, so the held-out data
# is never resampled.
#
# SMOTE and StandardScaler are imported in the earlier resampling/scaling
# cells; the settings below mirror the ones used there.
model = make_pipeline(
    SMOTE(random_state=1, sampling_strategy=1.0),
    StandardScaler(),
    SVC(kernel='linear'),
)

In [None]:
# Train the classifier on the training portion of the split.
# NOTE(review): X_train / y_train are the untouched outputs of
# train_test_split; the SMOTE-resampled, standardized arrays built earlier
# are not used here unless `model` applies those steps itself - confirm
# this is intended.
model.fit(X_train, y_train)

In [None]:
from imblearn.metrics import classification_report_imbalanced

# Predict on the held-out split and print the imbalance-aware metrics report
y_pred = model.predict(X_test)
report = classification_report_imbalanced(y_test, y_pred)
print(report)

In [None]:
from sklearn.linear_model import 