In [1]:
# Numerical, tabular and plotting stack used throughout the notebook.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()  # apply seaborn's default theme so plain matplotlib plots pick it up too
import warnings
# NOTE(review): this silences every warning for the whole session, including
# pandas' chained-assignment and deprecation warnings -- consider narrowing it.
warnings.filterwarnings('ignore')
rand_state = 1000  # presumably a random seed for later steps; not used in the cells shown here

In [2]:
# Load the semi-raw dataset; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='gubernatorial_data_semiraw.csv')

In [3]:
# Print dtypes and non-null counts to see which columns contain missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41101 entries, 0 to 41100
Data columns (total 27 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   geoid             41101 non-null  int64  
 1   state             41101 non-null  object 
 2   DesignatedOZ      7801 non-null   float64
 3   county            41101 non-null  object 
 4   Type              41101 non-null  object 
 5   dec_score         40825 non-null  float64
 6   SE_Flag           1012 non-null   float64
 7   Population        41068 non-null  float64
 8   medhhincome       40954 non-null  float64
 9   PovertyRate       41058 non-null  float64
 10  unemprate         41058 non-null  float64
 11  medvalue          40106 non-null  float64
 12  medrent           40844 non-null  float64
 13  pctown            41027 non-null  float64
 14  severerentburden  41012 non-null  float64
 15  vacancyrate       41032 non-null  float64
 16  pctwhite          41068 non-null  float6

In [4]:
# Report how many distinct non-null values each column holds.
# Columns with a single distinct value are 1-or-missing indicator flags.
for column_name, distinct_count in df.nunique().items():
    print(column_name, distinct_count)

geoid 41101
state 50
DesignatedOZ 1
county 1769
Type 2
dec_score 10
SE_Flag 1
Population 8154
medhhincome 24231
PovertyRate 96
unemprate 63
medvalue 5403
medrent 1818
pctown 101
severerentburden 85
vacancyrate 93
pctwhite 101
pctBlack 101
pctHispanic 101
pctAAPIalone 88
pctunder18 63
pctover64 90
HSorlower 97
BAorhigher 97
Metro 1
Micro 1
NoCBSAType 1


In [5]:
# Count missing values per column before any filling or row removal.
df.isna().sum() 

geoid                   0
state                   0
DesignatedOZ        33300
county                  0
Type                    0
dec_score             276
SE_Flag             40089
Population             33
medhhincome           147
PovertyRate            43
unemprate              43
medvalue              995
medrent               257
pctown                 74
severerentburden       89
vacancyrate            69
pctwhite               33
pctBlack               33
pctHispanic            33
pctAAPIalone           33
pctunder18             33
pctover64              33
HSorlower              34
BAorhigher             34
Metro                8975
Micro               36403
NoCBSAType          36824
dtype: int64

In [6]:
# Add the `repubstate` indicator column, initialised to 0 for every row.
df = df.assign(repubstate=0)

In [7]:
# State names whose rows get repubstate = 1.
# NOTE(review): presumably the states with a Republican governor for the study
# period -- confirm the reference year against the data source.
repubstates = [
    'Alabama', 'Arizona', 'Arkansas', 'Florida', 'Georgia', 'Idaho',
    'Indiana', 'Illinois', 'Iowa', 'Kansas', 'Kentucky', 'Maine',
    'Maryland', 'Massachusetts', 'Michigan', 'Mississippi', 'Missouri',
    'Nebraska', 'Nevada', 'New Hampshire', 'New Mexico', 'North Dakota',
    'Ohio', 'Oklahoma', 'South Carolina', 'South Dakota', 'Tennessee',
    'Texas', 'Utah', 'Vermont', 'West Virginia', 'Wisconsin', 'Wyoming',
]

In [8]:
# Set repubstate to 1 for rows whose state is listed in `repubstates`, 0 otherwise.
#
# This is a single vectorized column assignment on purpose: element-wise chained
# assignment (df['repubstate'][x] = 1) writes through an intermediate Series,
# which raises SettingWithCopyWarning and is a silent no-op under pandas
# Copy-on-Write. The vectorized form also does not require a 0..n-1 index and
# avoids a Python-level loop over every row.
df['repubstate'] = df['state'].isin(repubstates).astype('int64')

In [9]:
# DesignatedOZ has a single distinct non-null value in the raw file, so a
# missing entry is recoded as 0 to obtain a complete 0/1 indicator.
df = df.fillna({'DesignatedOZ': 0})

In [10]:
# Metro has a single distinct non-null value in the raw file; recode missing as 0.
df = df.fillna({'Metro': 0})

In [11]:
# Micro has a single distinct non-null value in the raw file; recode missing as 0.
df = df.fillna({'Micro': 0})

In [12]:
# NoCBSAType has a single distinct non-null value in the raw file; recode missing as 0.
df = df.fillna({'NoCBSAType': 0})

In [13]:
# SE_Flag has a single distinct non-null value in the raw file; recode missing as 0.
df = df.fillna({'SE_Flag': 0})

In [14]:
# Convert the label and indicator columns to pandas' categorical dtype in a
# single astype call, then print the schema to confirm the new dtypes.
categorical_cols = ['state', 'county', 'Type', 'DesignatedOZ', 'dec_score', 'SE_Flag', 'Metro', 'Micro', 'NoCBSAType']
df = df.astype({name: 'category' for name in categorical_cols})
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41101 entries, 0 to 41100
Data columns (total 28 columns):
 #   Column            Non-Null Count  Dtype   
---  ------            --------------  -----   
 0   geoid             41101 non-null  int64   
 1   state             41101 non-null  category
 2   DesignatedOZ      41101 non-null  category
 3   county            41101 non-null  category
 4   Type              41101 non-null  category
 5   dec_score         40825 non-null  category
 6   SE_Flag           41101 non-null  category
 7   Population        41068 non-null  float64 
 8   medhhincome       40954 non-null  float64 
 9   PovertyRate       41058 non-null  float64 
 10  unemprate         41058 non-null  float64 
 11  medvalue          40106 non-null  float64 
 12  medrent           40844 non-null  float64 
 13  pctown            41027 non-null  float64 
 14  severerentburden  41012 non-null  float64 
 15  vacancyrate       41032 non-null  float64 
 16  pctwhite          4106

In [15]:
# Preview the first five rows after the zero-fill and dtype conversion.
df.head()

Unnamed: 0,geoid,state,DesignatedOZ,county,Type,dec_score,SE_Flag,Population,medhhincome,PovertyRate,...,pctHispanic,pctAAPIalone,pctunder18,pctover64,HSorlower,BAorhigher,Metro,Micro,NoCBSAType,repubstate
0,2198000300,Alaska,0.0,Prince of Wales-Hyder Census Area,Low-Income Community,,0.0,,,,...,,,,,,,0.0,0.0,1.0,0
1,6037980014,California,1.0,Los Angeles County,Low-Income Community,9.0,0.0,,,,...,,,,,,,1.0,0.0,0.0,0
2,6037980004,California,0.0,Los Angeles County,Low-Income Community,1.0,0.0,,,,...,,,,,,,1.0,0.0,0.0,0
3,6037980021,California,0.0,Los Angeles County,Low-Income Community,,0.0,,,,...,,,,,,,1.0,0.0,0.0,0
4,12086980800,Florida,1.0,Miami-Dade County,Low-Income Community,8.0,0.0,,,,...,,,,,,,1.0,0.0,0.0,1


In [16]:
# Re-count missing values per column now that the indicator columns are zero-filled.
df.isna().sum() 

geoid                 0
state                 0
DesignatedOZ          0
county                0
Type                  0
dec_score           276
SE_Flag               0
Population           33
medhhincome         147
PovertyRate          43
unemprate            43
medvalue            995
medrent             257
pctown               74
severerentburden     89
vacancyrate          69
pctwhite             33
pctBlack             33
pctHispanic          33
pctAAPIalone         33
pctunder18           33
pctover64            33
HSorlower            34
BAorhigher           34
Metro                 0
Micro                 0
NoCBSAType            0
repubstate            0
dtype: int64

In [17]:
# (rows, columns) before incomplete rows are removed.
df.shape

(41101, 28)

In [18]:
# Keep only rows with no missing value in any column.
# The original index labels are kept, so the index has gaps afterwards.
df = df.dropna(axis='index', how='any')

In [19]:
# (rows, columns) after incomplete rows are removed.
df.shape

(39897, 28)

In [20]:
# Confirm that no column has missing values left.
df.isna().sum() 

geoid               0
state               0
DesignatedOZ        0
county              0
Type                0
dec_score           0
SE_Flag             0
Population          0
medhhincome         0
PovertyRate         0
unemprate           0
medvalue            0
medrent             0
pctown              0
severerentburden    0
vacancyrate         0
pctwhite            0
pctBlack            0
pctHispanic         0
pctAAPIalone        0
pctunder18          0
pctover64           0
HSorlower           0
BAorhigher          0
Metro               0
Micro               0
NoCBSAType          0
repubstate          0
dtype: int64

In [22]:
# One-hot encode `state` and `dec_score`; drop_first omits the first level of
# each so the remaining dummies are measured against that baseline.
# NOTE(review): `repubstate` is derived purely from `state`, so it is fully
# determined by the state dummies -- check for collinearity before modelling.
dummy_source_cols = ['state', 'dec_score']
df = pd.get_dummies(df, columns=dummy_source_cols, drop_first=True)

In [23]:
# Report the number of distinct non-null values per column after encoding;
# every indicator / dummy column should now show exactly 2.
for column_name, distinct_count in df.nunique().items():
    print(column_name, distinct_count)

geoid 39897
DesignatedOZ 2
county 1764
Type 2
SE_Flag 2
Population 7981
medhhincome 23731
PovertyRate 87
unemprate 52
medvalue 5393
medrent 1749
pctown 98
severerentburden 81
vacancyrate 89
pctwhite 101
pctBlack 101
pctHispanic 101
pctAAPIalone 87
pctunder18 62
pctover64 89
HSorlower 96
BAorhigher 95
Metro 2
Micro 2
NoCBSAType 2
repubstate 2
state_Alaska 2
state_Arizona 2
state_Arkansas 2
state_California 2
state_Colorado 2
state_Connecticut 2
state_Delaware 2
state_Florida 2
state_Georgia 2
state_Hawaii 2
state_Idaho 2
state_Illinois 2
state_Indiana 2
state_Iowa 2
state_Kansas 2
state_Kentucky 2
state_Louisiana 2
state_Maine 2
state_Maryland 2
state_Massachusetts 2
state_Michigan 2
state_Minnesota 2
state_Mississippi 2
state_Missouri 2
state_Montana 2
state_Nebraska 2
state_Nevada 2
state_New Hampshire 2
state_New Jersey 2
state_New Mexico 2
state_New York 2
state_North Carolina 2
state_North Dakota 2
state_Ohio 2
state_Oklahoma 2
state_Oregon 2
state_Pennsylvania 2
state_Rhode Island

In [24]:
# Remove every row whose Type is 'Non-LIC Contiguous', keeping all other rows.
is_non_lic_contiguous = df['Type'] == 'Non-LIC Contiguous'
df = df[~is_non_lic_contiguous]

In [25]:
# (rows, columns) after removing the 'Non-LIC Contiguous' rows.
df.shape

(29754, 84)

In [26]:
# Discard the `county` column; it is not carried into the cleaned frame.
df = df.drop(columns=['county'])

In [27]:
# Preview the first five rows of the cleaned, one-hot-encoded frame.
df.head()

Unnamed: 0,geoid,DesignatedOZ,Type,SE_Flag,Population,medhhincome,PovertyRate,unemprate,medvalue,medrent,...,state_Wyoming,dec_score_2.0,dec_score_3.0,dec_score_4.0,dec_score_5.0,dec_score_6.0,dec_score_7.0,dec_score_8.0,dec_score_9.0,dec_score_10.0
22,36005046201,1.0,Low-Income Community,0.0,29256.0,45770.0,0.07,0.09,24900.0,937.0,...,0,1,0,0,0,0,0,0,0,0
25,6107001003,1.0,Low-Income Community,0.0,22748.0,80068.0,0.21,0.05,272800.0,1083.0,...,0,0,0,0,0,0,0,0,1,0
30,13089023418,0.0,Low-Income Community,0.0,20092.0,49930.0,0.17,0.12,125100.0,993.0,...,0,0,1,0,0,0,0,0,0,0
33,12097041100,1.0,Low-Income Community,0.0,19834.0,36756.0,0.24,0.15,115600.0,1118.0,...,0,0,0,0,0,0,0,0,0,0
34,12097042900,1.0,Low-Income Community,0.0,19187.0,43750.0,0.17,0.06,150600.0,1079.0,...,0,0,0,0,0,0,1,0,0,0


In [110]:
# Export of the cleaned frame is currently disabled; uncomment the line below to write it.
# df.to_csv('thesis_clean.csv', index=False)