In [1]:
import geopandas as gpd
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import xlrd as xlrd

In [5]:
fema = gpd.read_file("Data\\FEMA_6\\FEMA_6.shp")
fema = fema.sort_values(by='CTFIPS')

In [43]:
def ag_count(state):
    data = gpd.read_file("Data/SoVI2010_" + state + "/SoVI0610_" + state + ".shp")
    
    ## Creating CTFIPS, which FEMA data uses for counties
    data['CTFIPS'] = data['STATEFP10'] + data['COUNTYFP10']
    
    ## Aggregating SOVI data from block to county level
    data = data.groupby('CTFIPS').agg({"SOVI0610" + state: "mean", "P0010001": "sum"})
   
    ## Combining fema and sovi data for each county
    data = fema.merge(data, on="CTFIPS", how="right")
    
    ## Renaming the columns to things that make sense
    data = data.rename(columns={
        "SOVI0610" + state: "SOVI",
        "P0010001": "POPULATION",
        "TOTAL_DECL": "TOTAL_DECLARATIONS"
    })
    
    data = data.sort_values(by='CTFIPS')
    
    ## Creating a scale of the SOVI and FEMA data from 1 - number of counties in a state
    data['SOVI_SCORE'] = data['SOVI'].rank()
    data['FEMA_SCORE'] = data['TOTAL_DECLARATIONS'].rank()
    sovi = []
    fema_ranks = []
    rank = []
    n = len(data)
    
    ## Taking the SOV_SCORE and FEMA_SCORE, assign a SOVI_RANK and FEMA_RANK
    ## Rank = the correlation of SOV_SCORE and FEMA_SCORE, matching highs and lows and finding where it is different
    for index, row in data.iterrows():
        if row.SOVI_SCORE < (n/3):
            ## Low - Low = 1
            if row.FEMA_SCORE < (n/3):
                rank.append(1)
                fema_ranks.append(1)
            ## Low - High = 2
            elif row.FEMA_SCORE > (2 * (n/3)):
                rank.append(2)
                fema_ranks.append(3)
            ## Medium, don't care for rank
            else:
                rank.append(-1)
                fema_ranks.append(2)
            sovi.append(1)
        elif row.SOVI_SCORE > (2 * (n/3)):
            ## High - High = 4
            if row.FEMA_SCORE > (2 * (n/3)):
                rank.append(4)
                fema_ranks.append(3)
            ## High - Low = 3
            elif row.FEMA_SCORE < (n/3):
                rank.append(3)
                fema_ranks.append(1)
            ## Medium, don't care for rank
            else:
                rank.append(-1)
                fema_ranks.append(2)
            sovi.append(3)
        else:
            rank.append(-1)
            sovi.append(2)
            if row.FEMA_SCORE > (2 * (n/3)):
                fema_ranks.append(3)
            elif row.FEMA_SCORE < (n/3):
                fema_ranks.append(1)
            else:
                fema_ranks.append(2)
    ## Attaching the ranks to dataframe
    data['RANK'] = rank
    data['SOVI_RANK'] = sovi
    data['FEMA_RANK'] = fema_ranks
    return data

CA = ag_count("CA")
FL = ag_count("FL")
IN = ag_count("IN")
LA = ag_count("LA")
ME = ag_count("ME")
WA = ag_count("WA")

    OBJECTID STFIPS ST_ABBR CTFIPS       STATE           COUNTY GEO_SUFFIX  \
0         58     06      CA  06001  California          Alameda     County   
1         39     06      CA  06003  California           Alpine     County   
2         21     06      CA  06005  California           Amador     County   
3         53     06      CA  06007  California            Butte     County   
4         22     06      CA  06009  California        Calaveras     County   
5         16     06      CA  06011  California           Colusa     County   
6         50     06      CA  06013  California     Contra Costa     County   
7        111     06      CA  06015  California        Del Norte     County   
8         56     06      CA  06017  California        El Dorado     County   
9         38     06      CA  06019  California           Fresno     County   
10        41     06      CA  06021  California            Glenn     County   
11       114     06      CA  06023  California         Humboldt 

In [44]:
## Combining all of the states into one dataframe
## The rank columns don't show because there is 52 columns, so they are hidden
sovi = CA.append([FL, IN, LA, ME, WA])
sovi = sovi.sort_values(by='CTFIPS')
sovi['RANK']


0    -1
1    -1
2     3
3    -1
4     3
5    -1
6    -1
7    -1
8     1
9     3
10   -1
11    4
12    3
13    3
14   -1
15   -1
16   -1
17   -1
18   -1
19   -1
20    2
21    3
22    4
23   -1
24    3
25    1
26    2
27    2
28   -1
29    2
     ..
9    -1
10   -1
11   -1
12   -1
13    4
14   -1
15   -1
16    2
17   -1
18   -1
19   -1
20   -1
21   -1
22    4
23    3
24    4
25   -1
26   -1
27   -1
28   -1
29   -1
30    2
31   -1
32   -1
33    2
34    4
35   -1
36    2
37   -1
38   -1
Name: RANK, Length: 336, dtype: int64

In [19]:
poverty = pd.read_excel("Data\\PovertyRate6States_Counties.xlsx")
#poverty.head()

In [29]:
convert_poverty = {'CTFIPS': str}

poverty = poverty.astype(convert_poverty)
#print(poverty.dtypes)
#print(sovi.dtypes)

poverty = sovi.merge(poverty, on="CTFIPS", how="right")
poverty = poverty.sort_values(by='CTFIPS')
#poverty.columns

In [42]:
X = poverty['RANK']
X.dropna()
#Y = poverty['PERCENT']
#Y.dropna()

0     -1.0
1      1.0
2     -1.0
3      1.0
4     -1.0
5     -1.0
6     -1.0
7      4.0
8     -1.0
9      1.0
10     4.0
11    -1.0
12    -1.0
13    -1.0
14    -1.0
15    -1.0
16     3.0
17     2.0
18     3.0
19    -1.0
20    -1.0
21     2.0
22    -1.0
23     1.0
24    -1.0
25    -1.0
26    -1.0
27    -1.0
28    -1.0
29     3.0
      ... 
248   -1.0
249   -1.0
250   -1.0
251   -1.0
252    4.0
253   -1.0
254   -1.0
255    2.0
256   -1.0
257   -1.0
258   -1.0
259   -1.0
260   -1.0
261    4.0
262    3.0
263    4.0
264   -1.0
265   -1.0
266   -1.0
267   -1.0
268   -1.0
269    2.0
270   -1.0
271   -1.0
272    2.0
273    4.0
274   -1.0
275    2.0
276   -1.0
277   -1.0
Name: RANK, Length: 278, dtype: float64

In [36]:
from sklearn import linear_model
lm = linear_model.LinearRegression()
model = lm.fit(X.dropna, Y.dropna)

ValueError: Expected 2D array, got scalar array instead:
array=<bound method Series.dropna of 0     -1.0
1      1.0
2     -1.0
3      1.0
4     -1.0
5     -1.0
6     -1.0
7      4.0
8     -1.0
9      1.0
10     4.0
11    -1.0
12    -1.0
13    -1.0
14    -1.0
15    -1.0
16     3.0
17     2.0
18     3.0
19    -1.0
20    -1.0
21     2.0
22    -1.0
23     1.0
24    -1.0
25    -1.0
26    -1.0
27    -1.0
28    -1.0
29     3.0
      ... 
306    NaN
307    NaN
308    NaN
309    NaN
310    NaN
311    NaN
312    NaN
313    NaN
314    NaN
315    NaN
316    NaN
317    NaN
318    NaN
319    NaN
320    NaN
321    NaN
322    NaN
323    NaN
324    NaN
325    NaN
326    NaN
327    NaN
328    NaN
329    NaN
330    NaN
331    NaN
332    NaN
333    NaN
334    NaN
335    NaN
Name: RANK, Length: 336, dtype: float64>.
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.