# Logistic Regression Model

Train Logistic Regression models on features_v3 (normalized dataset with state dummy variables) on different downsampling rates on republican counties since the dataset is imbalanced.

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import joblib
import warnings
warnings.filterwarnings("ignore")

In [2]:
# import dataset

data = pd.read_csv('features_v3.csv')
data

Unnamed: 0,party,totalpop,income,service,office,drive,carpool,transit,workathome,meancommute,...,SD,TN,TX,UT,VA,VT,WA,WI,WV,WY
0,0.0,1.000000,0.354074,0.446203,0.654952,0.755932,0.331104,0.110211,0.137097,0.593023,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.521624,0.344999,0.417722,0.648562,0.632768,0.290970,0.298217,0.112903,0.659884,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.433955,0.337373,0.392405,0.619808,0.825989,0.371237,0.047002,0.088710,0.540698,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.400262,0.335184,0.424051,0.738019,0.795480,0.367893,0.038898,0.158602,0.462209,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.262875,0.228581,0.506329,0.769968,0.800000,0.307692,0.089141,0.115591,0.590116,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3040,1.0,0.000017,0.335923,0.170886,0.760383,0.467797,0.324415,0.000000,0.607527,0.651163,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3041,1.0,0.000063,0.137758,0.493671,0.252396,0.696045,0.521739,0.000000,0.102151,0.215116,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3042,1.0,0.000018,0.192528,0.186709,0.000000,0.520904,0.521739,0.000000,0.534946,0.287791,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3043,0.0,0.000030,0.164322,0.920886,0.523962,1.000000,0.000000,0.000000,0.000000,0.203488,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [3]:
def generate_dataset(downsample_rate):
    '''
    generate dataset with republican county downsampling
    '''
    
    # separate dataset by counties
    dem_df = data[data.party == 0]
    rep_df = data[data.party == 1]

    # num republican counties to sample
    rep_df = rep_df.sample(frac=downsample_rate)
    
    #concat dem and rep dfs
    final_df = pd.concat([dem_df, rep_df], 0)
    
    return final_df

In [4]:
# train model on different downsample rates for republican counties and record accuracy scores

downsample_rate_list = [0.05, 0.15, 0.25, 0.50, 0.75, 1.0]

score_df = pd.DataFrame(columns = ['downsample_rate', 'score'])
for rate in downsample_rate_list:

    print('Training with downsample rate ' + str(rate))
    final_df = generate_dataset(rate)

    X = final_df[['totalpop', 'income', 'service', 'office', 'drive', 'carpool',
           'transit', 'workathome', 'meancommute', 'unemployment',
           'perc_men', 'perc_white', 'perc_private_work', 'perc_citizen', 'AL',
           'AR', 'AZ', 'CA', 'CO', 'CT', 'DC', 'DE', 'FL', 'GA', 'HI', 'IA', 'ID',
           'IL', 'IN', 'KS', 'KY', 'MA', 'MD', 'ME', 'MI', 'MN', 'MO', 'MS', 'MT',
           'NC', 'ND', 'NE', 'NH', 'NJ', 'NM', 'NV', 'NY', 'OH', 'OK', 'OR', 'PA',
           'RI', 'SC', 'SD', 'TN', 'TX', 'UT', 'VA', 'VT', 'WA', 'WI', 'WV', 'WY']]
    y = final_df['party']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

    model = LogisticRegression()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    score = model.score(X_test, y_test)
    
    score_df = score_df.append(pd.DataFrame({'downsample_rate': [rate], 'score': [score]}), 0)
    
score_df

Training with downsample rate 0.05
Training with downsample rate 0.15
Training with downsample rate 0.25
Training with downsample rate 0.5
Training with downsample rate 0.75
Training with downsample rate 1.0


Unnamed: 0,downsample_rate,score
0,0.05,0.842975
0,0.15,0.843931
0,0.25,0.892857
0,0.5,0.886364
0,0.75,0.904366
0,1.0,0.91133


Most accurate model is with 100% downsampling rate. These are preliminary accuracy scores. More evaluation will be done next.

In [16]:
# generate model again with downsampling 100%

final_df = generate_dataset(1)

X = final_df[['totalpop', 'income', 'service', 'office', 'drive', 'carpool',
       'transit', 'workathome', 'meancommute', 'unemployment',
       'perc_men', 'perc_white', 'perc_private_work', 'perc_citizen', 'AL',
       'AR', 'AZ', 'CA', 'CO', 'CT', 'DC', 'DE', 'FL', 'GA', 'HI', 'IA', 'ID',
       'IL', 'IN', 'KS', 'KY', 'MA', 'MD', 'ME', 'MI', 'MN', 'MO', 'MS', 'MT',
       'NC', 'ND', 'NE', 'NH', 'NJ', 'NM', 'NV', 'NY', 'OH', 'OK', 'OR', 'PA',
       'RI', 'SC', 'SD', 'TN', 'TX', 'UT', 'VA', 'VT', 'WA', 'WI', 'WV', 'WY']]
y = final_df['party']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

model = LogisticRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
score = model.score(X_test, y_test)
score

0.9261083743842364

The above score is a measure of accuracy or the true positive rate plus true negative rate. 92% accuracy is a good measure, so this is our final model.

In [17]:
# save model

filename = 'finalized_election_model.sav'
joblib.dump(model, filename)

['finalized_election_model.sav']