In [1]:
import pysal as ps

# IO

Right now, we can do tabular reading in one of two ways. First is through the `pdio.read_files` command, which only processes shapefile/dbf pairs. 

In [2]:
# Resolve the on-disk locations of example datasets that ship with PySAL.
columbus_shp = ps.examples.get_path('columbus.shp')
columbus_dbf = ps.examples.get_path('columbus.dbf')
# A CSV path; it is only referenced by the commented-out read_files call below.
pci = ps.examples.get_path('usjoin.csv')

In [3]:
# read_files is given the .shp path; the resulting table carries a 'geometry'
# column holding PySAL polygon objects (see the df.head() output below).
df = ps.pdio.read_files(columbus_shp)
#ps.pdio.read_files(pci) # will fail: pci is a CSV, and read_files only handles shapefile/dbf pairs

In [4]:
df.head()

Unnamed: 0,AREA,PERIMETER,COLUMBUS_,COLUMBUS_I,POLYID,NEIG,HOVAL,INC,CRIME,OPEN,...,DISCBD,X,Y,NSA,NSB,EW,CP,THOUS,NEIGNO,geometry
0,0.309441,2.440629,2,5,1,5,80.467003,19.531,15.72598,2.850747,...,5.03,38.799999,44.07,1.0,1.0,1.0,0.0,1000.0,1005.0,<pysal.cg.shapes.Polygon object at 0x7eff47e51...
1,0.259329,2.236939,3,1,2,1,44.567001,21.232,18.801754,5.29672,...,4.27,35.619999,42.380001,1.0,1.0,0.0,0.0,1000.0,1001.0,<pysal.cg.shapes.Polygon object at 0x7eff47e51...
2,0.192468,2.187547,4,6,3,6,26.35,15.956,30.626781,4.534649,...,3.89,39.82,41.18,1.0,1.0,1.0,0.0,1000.0,1006.0,<pysal.cg.shapes.Polygon object at 0x7eff47e51...
3,0.083841,1.427635,5,2,4,2,33.200001,4.477,32.38776,0.394427,...,3.7,36.5,40.52,1.0,1.0,0.0,0.0,1000.0,1002.0,<pysal.cg.shapes.Polygon object at 0x7eff47e51...
4,0.488888,2.997133,6,7,5,7,23.225,11.252,50.73151,0.405664,...,2.83,40.009998,38.0,1.0,1.0,1.0,0.0,1000.0,1007.0,<pysal.cg.shapes.Polygon object at 0x7eff47e51...


Also, any of our file handlers that inherit from DataTable now have a `to_df()` method:

In [5]:
# Open the dbf with PySAL's generic file handler and convert the table to a
# dataframe; the geometry column is attached separately in the next cell.
df2 = ps.open(columbus_dbf).to_df()

**Remaining:** option to pair this `to_df` with a `shp2series` call

In [6]:
# Read the shapes from the .shp file and store them as the 'geometry' column.
# NOTE(review): assumes the shapefile records align row-for-row with the dbf
# records loaded into df2 — confirm.
df2['geometry'] = ps.pdio.shp.shp2series(columbus_shp)

In [7]:
df2.head()

Unnamed: 0,AREA,PERIMETER,COLUMBUS_,COLUMBUS_I,POLYID,NEIG,HOVAL,INC,CRIME,OPEN,...,DISCBD,X,Y,NSA,NSB,EW,CP,THOUS,NEIGNO,geometry
0,0.309441,2.440629,2,5,1,5,80.467003,19.531,15.72598,2.850747,...,5.03,38.799999,44.07,1.0,1.0,1.0,0.0,1000.0,1005.0,<pysal.cg.shapes.Polygon object at 0x7eff47bab...
1,0.259329,2.236939,3,1,2,1,44.567001,21.232,18.801754,5.29672,...,4.27,35.619999,42.380001,1.0,1.0,0.0,0.0,1000.0,1001.0,<pysal.cg.shapes.Polygon object at 0x7eff47bab...
2,0.192468,2.187547,4,6,3,6,26.35,15.956,30.626781,4.534649,...,3.89,39.82,41.18,1.0,1.0,1.0,0.0,1000.0,1006.0,<pysal.cg.shapes.Polygon object at 0x7eff47bab...
3,0.083841,1.427635,5,2,4,2,33.200001,4.477,32.38776,0.394427,...,3.7,36.5,40.52,1.0,1.0,0.0,0.0,1000.0,1002.0,<pysal.cg.shapes.Polygon object at 0x7eff4e74d...
4,0.488888,2.997133,6,7,5,7,23.225,11.252,50.73151,0.405664,...,2.83,40.009998,38.0,1.0,1.0,1.0,0.0,1000.0,1007.0,<pysal.cg.shapes.Polygon object at 0x7eff47c02...


# Weights 

Weights now have a new method, `from_file`, that calls to the appropriate reader in PySAL, and `to_WSP`/`from_WSP`.

Contiguity weights & Distance weights classes gain some new classmethods:

- (contiguity) `from_iterable`: construct contiguity weights from an arbitrary iterable of geo-interfaced shapes
- (distance) `from_array`: construct distance weights from an array of points
- `from_shapefile`: construct weights from a shapefile 
- `from_dataframe`: construct weights from a dataframe

In [8]:
# W1 is built with the new W.from_file classmethod; Wref uses the existing
# open(...).read() pattern on the same .gal file so the two can be compared.
W1 = ps.W.from_file(ps.examples.get_path('columbus.gal'))
Wref = ps.open(ps.examples.get_path('columbus.gal')).read()

In [9]:
Wref.neighbors == W1.neighbors

True

I've added a deprecation warning to the constructors in `weights/user.py`, in hopes of transitioning people to using the classmethods.

Things that we might not want to transition are things like the threshold stuff. But, I think it might make sense to bake those into the classes, too. Not sure where others think that should go.

All of the classmethods are tested in unittests.

In [10]:
# New classmethod constructor on the Rook class.
R1 = ps.weights.Rook.from_shapefile(columbus_shp)
# Older module-level constructor; calling it emits the deprecation warning
# shown in this cell's output.
Rref = ps.weights.rook_from_shapefile(columbus_shp)
# Both constructors should yield the same neighbor dictionary.
R1.neighbors == Rref.neighbors

  Warn('This function is deprecated. Please use the Rook or Queen classes')


True

Notably, I brought in `@jlaura`'s optimized contiguity builder:

In [11]:
print(ps.weights._contW_lists.ContiguityWeightsLists.__doc__)


    Contiguity for a collection of polygons using high performance
    list, set, and dict containers
    


In [12]:
%timeit ps.weights.Queen.from_shapefile(columbus_shp, method='binning')

100 loops, best of 3: 10.8 ms per loop


In [13]:
%timeit ps.weights.Queen.from_shapefile(columbus_shp, method='lists')

100 loops, best of 3: 3.08 ms per loop


In [14]:
# Build the same Queen weights with each contiguity builder ('binning' and
# 'lists') so their neighbor structures can be compared in the next cell.
Qbin = ps.weights.Queen.from_shapefile(columbus_shp, method='binning')
Qlist = ps.weights.Queen.from_shapefile(columbus_shp, method='lists')

Of course, when we do this, we need to consider the fact that the weights may be ordered differently.

In [15]:
# Compare the neighbor sets observation-by-observation. Looking each key up in
# both dictionaries avoids relying on the two dicts iterating in the same
# order, and avoids comparing `map` results directly (on Python 3 `map`
# returns lazy iterators, so `map(...) == map(...)` is always False).
(set(Qbin.neighbors) == set(Qlist.neighbors) and
 all(set(Qbin.neighbors[key]) == set(Qlist.neighbors[key])
     for key in Qbin.neighbors))

True

The lists implementation is enabled by default.

Distance weights also get these methods.

In [16]:
# Kernel weights built from the dataframe via the new classmethod ...
K1 = ps.weights.Kernel.from_dataframe(df)
# ... and from the shapefile via the older module-level function.
Kref = ps.weights.kernelW_from_shapefile(columbus_shp)
# Both the neighbor structure and the weight values must agree.
print(K1.neighbors == Kref.neighbors and K1.weights == Kref.weights)

True


Since there was some dissatisfaction with recalculation of the KDTree when using KNN weights, I made them their own class and gave them a special reweight function. This could be applied to Kernel weights as well, or any function that uses the kdtree pattern. 

In [17]:
# k-nearest-neighbor weights (k=5) from the dataframe via the new KNN class.
KNN = ps.weights.KNN.from_dataframe(df, k=5)
# Reference built with the older function, which emits the deprecation warning
# shown in this cell's output.
KNNref = ps.weights.knnW_from_shapefile(columbus_shp, k=5)
KNN.neighbors == KNNref.neighbors

  Warn('This function is deprecated. Please use pysal.weights.KNN')


True

In [18]:
# Re-weight the existing KNN object to k=9. The return value is discarded and
# KNN itself is compared below, so this call is relied on to update KNN in place.
KNN.reweight(k=9)
# Fresh reference weights at k=9 for comparison.
KNNref = ps.weights.knnW_from_shapefile(columbus_shp, k=9)
KNN.neighbors == KNNref.neighbors

True

In [19]:
# Ten synthetic 2-D integer points in [0, 10). A seeded, local RandomState
# keeps the example reproducible across kernel restarts without touching
# NumPy's global random state.
new_data = ps.common.np.random.RandomState(0).randint(0, 10, size=(10, 2))

In [20]:
# inplace=False returns a new weights object instead of modifying KNN (the
# histogram printed below still reports k=9 for KNN).
# NOTE(review): the new object's histogram reports 59 observations, which
# suggests new_data is appended to the original 49 points — confirm.
new_KNN = KNN.reweight(new_data = new_data, k=12, inplace=False)

In [21]:
print(new_KNN.histogram, KNN.histogram)

([(12, 59)], [(9, 49)])


# Geotable

For the following, you'll need Shapely and Geopandas. 

In [22]:
from pysal.contrib.geotable import ops as GIS
import pysal.contrib.shapely_ext as she

Shapely will be used to show the geotable operations, and geopandas will just be used to show interop.

First, recall the dataframe

In [23]:
df.head()

Unnamed: 0,AREA,PERIMETER,COLUMBUS_,COLUMBUS_I,POLYID,NEIG,HOVAL,INC,CRIME,OPEN,...,DISCBD,X,Y,NSA,NSB,EW,CP,THOUS,NEIGNO,geometry
0,0.309441,2.440629,2,5,1,5,80.467003,19.531,15.72598,2.850747,...,5.03,38.799999,44.07,1.0,1.0,1.0,0.0,1000.0,1005.0,<pysal.cg.shapes.Polygon object at 0x7eff47e51...
1,0.259329,2.236939,3,1,2,1,44.567001,21.232,18.801754,5.29672,...,4.27,35.619999,42.380001,1.0,1.0,0.0,0.0,1000.0,1001.0,<pysal.cg.shapes.Polygon object at 0x7eff47e51...
2,0.192468,2.187547,4,6,3,6,26.35,15.956,30.626781,4.534649,...,3.89,39.82,41.18,1.0,1.0,1.0,0.0,1000.0,1006.0,<pysal.cg.shapes.Polygon object at 0x7eff47e51...
3,0.083841,1.427635,5,2,4,2,33.200001,4.477,32.38776,0.394427,...,3.7,36.5,40.52,1.0,1.0,0.0,0.0,1000.0,1002.0,<pysal.cg.shapes.Polygon object at 0x7eff47e51...
4,0.488888,2.997133,6,7,5,7,23.225,11.252,50.73151,0.405664,...,2.83,40.009998,38.0,1.0,1.0,1.0,0.0,1000.0,1007.0,<pysal.cg.shapes.Polygon object at 0x7eff47e51...


## Interop

We can get in and out of geopandas dataframes easily. Right now, since PySAL polygons aren't projected, we clobber crs and spatial index information coming to/from geopandas. 

In addition, the multipolygon with holes issue is not resolved, and we will generate invalid polygons when processing multipolygons with holes.

In [24]:
# Convert the pandas frame with PySAL geometries into a geopandas GeoDataFrame.
gdf = GIS.tabular.to_gdf(df)

In [25]:
type(gdf)

geopandas.geodataframe.GeoDataFrame

In [26]:
# Convert back to a plain pandas frame. Note that this rebinds `df`: from here
# on, `df` is the frame that was round-tripped through geopandas.
df = GIS.tabular.to_df(gdf)

In [27]:
type(df)

pandas.core.frame.DataFrame

In [28]:
type(df.geometry[0])

pysal.cg.shapes.Polygon

I tried to make all of the weights construction operation somewhat monadic. So, consider the following equality:

In [29]:
# from_dataframe accepts both the GeoDataFrame and the pandas frame of PySAL
# shapes; the resulting neighbor dictionaries should be identical.
(ps.weights.Rook.from_dataframe(gdf).neighbors == 
 ps.weights.Rook.from_dataframe(df).neighbors)

True

I only convert to PySAL shapes if needed, and I do that exactly once: in `from_iterable`. So, all methods "aim" at generating an iterable of PySAL shapes. In Haskell-like type notation:

`from_shapefile :: string -> File Handler -> Just Polygons  -> W`

`from_dataframe :: DataFrame -> Maybe Polygons -> Just Polygons -> W`

`from_iterable :: Maybe Polygons -> Just Polygons -> W`

`Rook.__init__ :: Just Polygons -> W`

So, you can see that the "higher-level" constructors are just chained transforms to correctly call `Rook.__init__`. `asShape` does the casting from `Maybe` to `Just`, and only actually *does* anything if it encounters a polygon that's not a PySAL polygon.

This means that, if `asShape` can get it to a PySAL shape, then any iterable can be weighted by `from_iterable`. 

Now, a good example of the GIS module might be the atomic operations. This module contains all of the attributes on PySAL shapes and the atomic `shapely_ext` operations.

In [30]:
print([x for x in GIS.atomic.__dict__ if not x.startswith('_')])

['relate', 'within', 'is_simple', 'bounding_box', 'cascaded_union', 'touches', 'intersection', 'area', 'union', 'segments', 'contains', 'representative_point', 'is_valid', 'symmetric_difference', 'get_attr', 'overlaps', 'unary_union', 'almost_equals', 'envelope', 'crosses', 'bbox', 'difference', 'distance', 'equals_exact', 'bounds', 'project', 'intersects', 'disjoint', 'convex_hull', 'buffer', 'parts', 'centroid', 'boundary', 'perimeter', 'equals', 'is_ring', 'simplify', 'is_empty', 'interpolate', 'has_z', 'to_wkb', 'k', 'holes', 'vertices', 'len', 'length', 'to_wkt']


All of the `atomic` operations derive from the same pattern:

In [85]:
# Call the underscore-prefixed helper directly to show the pattern: `_func` is
# the shapely_ext operation (here `she.area`) and `geom_col` names the column
# it is run over; inplace=False returns the result instead of modifying the frame.
GIS.atomic._atomic_op(df.head(), geom_col='geometry', inplace=False, _func=she.area) #, **kw)

0    0.309440
1    0.259328
2    0.192469
3    0.083842
4    0.488887
Name: geometry, dtype: float64

Everything in `atomic` is done over the geometry column using `df.geometry.apply`. 

If you're interested in computing a *relational* atomic operation, like something in the DE9IM, you have to specify the `other` shape, which gets passed through the `**kw` to the underlying shapely operation.

In [95]:
# Relational op: test each of the first five geometries against the first
# polygon, which is supplied through the `other` keyword.
GIS.atomic.touches(df.head(), other=df.geometry[0])

0    False
1     True
2     True
3    False
4    False
Name: geometry, dtype: bool

This would build the basics of how we could do spatial join without using the GEOS ops, if we can make fast indexes & fast touches. 

But, right now, we can spatial join using geopandas, and get back the pysal frames we need. If you plan on doing a ton of these ops, you should probably keep the frame in GeoPandas.

First, we'll need to construct a point dataset. I'm doing this now, but if you had a point-based dataset, you could skip these steps.

In [31]:
# Keep only the 'POLYID' and 'geometry' columns by dropping every other column
# (axis=1 drops columns, not rows).
df_centroids = df.drop([col for col in df.columns if col not in ['geometry', 'POLYID']],axis=1)

In [32]:
df_centroids.head()

Unnamed: 0,POLYID,geometry
0,1,<pysal.cg.shapes.Polygon object at 0x7eff46c76...
1,2,<pysal.cg.shapes.Polygon object at 0x7eff47c02...
2,3,<pysal.cg.shapes.Polygon object at 0x7eff47bab...
3,4,<pysal.cg.shapes.Polygon object at 0x7eff40819...
4,5,<pysal.cg.shapes.Polygon object at 0x7eff4082a...


In [33]:
# Replace each polygon with its centroid, then wrap each centroid in a PySAL
# Point so the geometry column holds point shapes.
df_centroids['geometry'] = GIS.atomic.centroid(df_centroids)
df_centroids['geometry'] = df_centroids.geometry.apply(ps.cg.Point)
# Attach one standard-normal value per row. A seeded, local RandomState keeps
# the example reproducible, and a 1-D draw is the natural shape for one column.
df_centroids['special_data'] = ps.common.np.random.RandomState(0).normal(0, 1, size=df_centroids.shape[0])

Now, we can merge the two pandas dataframes with geometry columns by using the `tabular.spatial_join` function, which dispatches to `geopandas.tools.sjoin`. 

In [34]:
# Spatially join the polygon frame (left) with the centroid-point frame
# (right); the result comes back as a plain pandas DataFrame.
joined_df = GIS.tabular.spatial_join(df, df_centroids)

In [35]:
type(joined_df)

pandas.core.frame.DataFrame

In [36]:
joined_df.head()[['geometry', 'special_data', 'POLYID_left']]

Unnamed: 0,geometry,special_data,POLYID_left
0,<pysal.cg.shapes.Polygon object at 0x7eff407d6...,-0.462174,1
1,<pysal.cg.shapes.Polygon object at 0x7eff407dc...,-0.159133,2
2,<pysal.cg.shapes.Polygon object at 0x7eff40799...,0.62066,3
3,<pysal.cg.shapes.Polygon object at 0x7eff40823...,1.512632,4
4,<pysal.cg.shapes.Polygon object at 0x7eff4080f...,0.089255,5


### Other semi-Tabular operations

Reductions, like dissolves, might seem like they should be tabular. And, I think it might be a good idea to have "cascaded union" remain in `atomic`, but put wrappers around the reductions in `tabular`. So, something like:

In [39]:
# Passing a groupby object yields one unioned geometry per group (here, one
# polygon for each NSB value).
GIS.atomic.cascaded_union(df.groupby('NSB'))

NSB
0.0    <pysal.cg.shapes.Polygon object at 0x7eff4073e...
1.0    <pysal.cg.shapes.Polygon object at 0x7eff4070c...
Name: geometry, dtype: object

**Remaining:** wrap this and put it in `tabular`.

In [87]:
def dissolve(df, on='', geom_col='geometry', **grouper_kws):
    """
    Union the geometries of `df` within each group defined by `on`.

    Parameters
    ----------
    df            : object with a `groupby` method (e.g. a dataframe)
    on            : grouping key(s), forwarded to `df.groupby`
    geom_col      : name of the geometry column.
                    NOTE(review): currently unused by this wrapper — confirm
                    whether it should be forwarded to the union.
    **grouper_kws : extra keyword arguments forwarded to `df.groupby`

    Returns
    -------
    The result of `GIS.atomic.cascaded_union` applied to the grouped frame.
    """
    grouped = df.groupby(on, **grouper_kws)
    unioned = GIS.atomic.cascaded_union(grouped)
    return unioned

In [88]:
dissolve(df, on='NSB')

NSB
0.0    <pysal.cg.shapes.Polygon object at 0x7eff40473...
1.0    <pysal.cg.shapes.Polygon object at 0x7eff403a3...
Name: geometry, dtype: object

**Remaining:** implement `cascaded_intersection`, which isn't too difficult, but poses some challenges when considering how to handle the fact that shapely returns an empty geometry collection on an empty intersection, instead of respecting the input shape types. Should be pretty straightforward

In [89]:
import pysal.contrib.shapely_ext as she
def cascaded_intersection(shapes):
    """
    Intersect an iterable of shapes cumulatively, left to right.

    Parameters
    ----------
    shapes : iterable of shapes accepted by `she.intersection`

    Returns
    -------
    The running intersection of all shapes. A single-element input is returned
    unchanged. As soon as the running intersection reports `is_empty`, it is
    returned immediately.

    Raises
    ------
    ValueError : if `shapes` is empty (instead of leaking a bare
                 `StopIteration`, which can silently end an enclosing
                 generator or loop).
    Any exception raised by `she.intersection` propagates unchanged.
    """
    iterator = iter(shapes)
    try:
        outshape = next(iterator)
    except StopIteration:
        raise ValueError('cascaded_intersection requires at least one shape')
    # `i` counts the intersections performed, so 0 refers to the second shape.
    for i, shape in enumerate(iterator):
        print('on shape {}'.format(i))
        outshape = she.intersection(outshape, shape)
        if outshape.is_empty:
            # no further intersections will yield nonempty shapes
            return outshape
    return outshape

In [90]:
# Collect each NSB group's geometries into a list. The two-name unpacking
# relies on NSB having exactly two distinct values (0.0 and 1.0 in the
# outputs above).
a,b = df.groupby('NSB').geometry.apply(list)

In [91]:
# Demonstrate the known limitation: the intersection call raises
# NotImplementedError here (see the output below), which is caught and printed
# so the notebook keeps running. Other exception types are not caught.
try:
    cascaded_intersection(a)
except NotImplementedError as e:
    print('failed! \n{}'.format(e))

on shape 0
failed! 
geometrycollection is not supported at this time.


In [92]:
# Convert the first two shapes of group `a` to shapely geometries so their
# intersection can be inspected with shapely directly.
sha = she.shapely.geometry.asShape(a[0])
shb = she.shapely.geometry.asShape(a[1])

In [93]:
sha.intersection(shb).is_empty

True

In [94]:
type(sha.intersection(shb))

shapely.geometry.collection.GeometryCollection