# **Module 2: Vector Data in Python**

In [None]:
import matplotlib.pyplot as plt
from shapely.geometry.polygon import Point, LineString, Polygon
import geopandas as gpd
import pandas as pd
import seaborn as sns
import random
import numpy as np
# Use seaborn's "whitegrid" style for the figures drawn in this notebook.
sns.set_style(style="whitegrid")

## **Exercises**
### Data
- `gw_provinces_extra.shp` - Minnesota Groundwater Provinces shapefile;
- `mn_rivers.shp` - Minnesota river shapefile;
- `soil_samp.shp` - Soil temperature stations shapefile.

**Question 1. Find and plot all groundwater provinces in Minnesota that have `var1` equal to `"a"` and `var3` greater than 100.**

In [None]:
# Load the groundwater provinces layer.
gw_provinces_extra_gdf = gpd.read_file("./data-module-2/gw_provinces_extra.shp")

# Keep only the provinces where var1 is "a" and var3 exceeds 100.
is_a = gw_provinces_extra_gdf["var1"] == "a"
above_100 = gw_provinces_extra_gdf["var3"] > 100
subset = gw_provinces_extra_gdf.loc[is_a & above_100]

# Draw every province as a pale backdrop, then highlight the matches on top.
fig, ax = plt.subplots(figsize=(14, 7))
gw_provinces_extra_gdf.plot(ax=ax, color="whitesmoke", edgecolor="grey")
subset.plot(ax=ax, color="palegreen", edgecolor="grey")

**Question 2. Create a simple `DataFrame` with the following code:**
```python
random.seed(0) 
data = {"ProvID": list(range(1,7)),
        "var4":["yes", "no", "no", "yes", "np", "yes"],
        "var5": random.sample(range(10, 50), 6)}
gw_prov_df = pd.DataFrame(data)
```
**Merge `gw_provinces_extra_gdf` with `gw_prov_df` into a new object called `gw_prov_stats`. Which columns were used as keys for the join? Pay attention to the data type of these columns.**

In [None]:
# Seed the generator so the sampled var5 values are the same on every run.
random.seed(0)

# One row per province ID 1..6, with a label column and a sampled numeric column.
data = dict(
    ProvID=list(range(1, 7)),
    var4=["yes", "no", "no", "yes", "np", "yes"],
    var5=random.sample(range(10, 50), 6),
)
gw_prov_df = pd.DataFrame(data)
gw_prov_df

In [None]:
# Preview the first five rows of the provinces layer.
gw_provinces_extra_gdf.head(n=5)

In [None]:
# Show the dtype of each candidate join key. The labels now name the object
# each value actually comes from (they were previously swapped: PROVINCE lives
# in the GeoDataFrame, ProvID in the plain DataFrame).
print("dtype from gdf is ", gw_provinces_extra_gdf["PROVINCE"].dtype)
print("dtype from df is ", gw_prov_df["ProvID"].dtype)

In [None]:
# Cast the GeoDataFrame key to int before joining on it.
# NOTE(review): presumably PROVINCE is not loaded as an integer column — confirm its dtype.
province_as_int = gw_provinces_extra_gdf["PROVINCE"].astype(int)
gw_provinces_extra_gdf["PROVINCE"] = province_as_int

# Inner join: PROVINCE on the left matches ProvID on the right.
gw_prov_stats = gw_provinces_extra_gdf.merge(
    gw_prov_df,
    how="inner",
    left_on="PROVINCE",
    right_on="ProvID",
)
gw_prov_stats.head()

**Question 3. Dissolve `var3` as `sum` by `var1`.**

In [None]:
# Dissolve the geometries by var1, aggregating var3 with "sum".
dissolve_columns = ["var3", "geometry", "var1"]
gw_provinces_extra_gdf_agg = gw_provinces_extra_gdf[dissolve_columns].dissolve(
    by="var1",
    aggfunc="sum",
)
gw_provinces_extra_gdf_agg

**Question 4. Find and plot the rivers found (even partially) in groundwater PROVINCE 5.**

In [None]:
# Load the river layer.
mn_rivers_gdf = gpd.read_file("./data-module-2/mn_rivers.shp")

# Rows of the provinces layer whose PROVINCE value equals 5.
province5 = gw_provinces_extra_gdf.loc[gw_provinces_extra_gdf["PROVINCE"] == 5]

# Keep every river feature that intersects province 5, even partially.
rivers5 = gpd.sjoin(
    left_df=mn_rivers_gdf,
    right_df=province5,
    how="inner",
    predicate="intersects",
)

In [None]:
fig, axs = plt.subplots(1, 2, figsize=(9, 9))
left_ax, right_ax = axs

# Both panels share the same pale province backdrop.
for panel in (left_ax, right_ax):
    gw_provinces_extra_gdf.plot(ax=panel, edgecolor="dimgrey", facecolor="whitesmoke")

# Left panel: every river in the layer.
mn_rivers_gdf.plot(ax=left_ax, edgecolor="skyblue")
left_ax.set_title("Rivers in Minnesota", weight="bold")

# Right panel: province 5 shaded, with only the rivers joined to it.
province5.plot(ax=right_ax, edgecolor="dimgrey", facecolor="lightgrey")
rivers5.plot(ax=right_ax, edgecolor="skyblue")
right_ax.set_title("Rivers in Minnesota, PROVINCE 5", weight="bold")

plt.tight_layout()

**Question 5. Which groundwater province is the Lamberton (soil sample) site located in?**

In [None]:
# Load the soil sample sites and pick out the one named "Lamberton".
mda_gdf = gpd.read_file("./data-module-2/soil_samp.shp")
lamberton = mda_gdf.loc[mda_gdf["name"] == "Lamberton"]

# Attach the attributes of the province polygon that contains the site.
lamberton_in_prov = gpd.sjoin(
    left_df=lamberton,
    right_df=gw_provinces_extra_gdf,
    how="inner",
    predicate="within",
)
lamberton_in_prov

**Question 6. Find the soil sample sites that are within 10 km of the Mississippi river.**

In [None]:
# Select the river feature(s) named "Mississippi River".
mississippi = mn_rivers_gdf[mn_rivers_gdf["KITTLE_NAM"] == "Mississippi River"]

# Copy the selection and swap each geometry for its buffer polygon.
# NOTE(review): 10000 is expressed in CRS units, so this is 10 km only if the
# layer uses a projected CRS in metres — confirm with mn_rivers_gdf.crs.
mississippi_buffer = mississippi.copy()
mississippi_buffer["geometry"] = mississippi.buffer(10000)

# Sites lying inside any of the buffer polygons.
mda_gdf_within_miss = gpd.sjoin(mda_gdf, mississippi_buffer, predicate="within")

# The join yields one row per (site, buffer) pair. If the river is stored as
# several features whose buffers overlap, the same site would be listed more
# than once, so keep only the first match for each site index.
mda_gdf_within_miss = mda_gdf_within_miss[
    ~mda_gdf_within_miss.index.duplicated(keep="first")
]
mda_gdf_within_miss

**Question 7. Find the 10-km buffer for the Mississippi river, the centroid for the Mississippi river, and the Mississippi river segments that overlap PROVINCE 4 (use the `overlay` operation). Display the results on a figure with 3 subplots.**

In [None]:
# Rows of the provinces layer whose PROVINCE value equals 4.
province4 = gw_provinces_extra_gdf.loc[gw_provinces_extra_gdf["PROVINCE"] == 4]

# Copy the river selection and swap each geometry for its centroid.
mississippi_centroid = mississippi.copy()
mississippi_centroid["geometry"] = mississippi.centroid

# Intersection overlay: the parts of the river that lie in province 4.
mississippi_segments = gpd.overlay(mississippi, province4, how="intersection")

In [None]:
fig, axs = plt.subplots(1, 3, figsize=(12, 8))

# Panel 1: draw the filled buffer first and the river on top of it.
# Plotting the buffer last would cover the river line with the red fill.
mississippi_buffer.plot(ax=axs[0], color="red")
mississippi.plot(ax=axs[0], edgecolor="skyblue")
axs[0].set_title("Mississippi 10-km buffer", weight="bold")

# Panel 2: the river with its centroid point(s) marked in red.
mississippi.plot(ax=axs[1], edgecolor="skyblue")
mississippi_centroid.plot(ax=axs[1], color="red")
axs[1].set_title("Mississippi centroid", weight="bold")

# Panel 3: the whole river, with the parts inside province 4 highlighted in red.
mississippi.plot(ax=axs[2], edgecolor="skyblue")
mississippi_segments.plot(ax=axs[2], color="red")
axs[2].set_title("Mississippi in PROVINCE 4", weight="bold")

# Same layout call as the earlier two-panel figure.
plt.tight_layout()