In [None]:
!pip install pysal
import pandas as pd
import geopandas as gpd
from pysal.lib import weights
from esda import moran
import numpy as np

# Imported before any work starts so every dependency of this cell is visible up front.
import folium

# Load the datasets
lung_cancer = pd.read_csv('lung_cancer_data.csv')
pm25 = pd.read_csv('pm25_data.csv')

# Load the shapefile
parishes_shapefile = gpd.read_file('Louisiana_Parishes.shp')

# Attach each parish polygon to the lung cancer rows (inner join on the parish name)
lung_cancer = pd.merge(lung_cancer, parishes_shapefile[['Name', 'geometry']], left_on='Parish', right_on='Name')

# Treat the '^' placeholder in 'Rate' as missing. This happens BEFORE combined_data is
# built so the cleaned values actually reach the analysis. `np.nan` replaces `pd.np.nan`:
# the `pd.np` alias was deprecated in pandas 1.0 and removed in pandas 2.0.
lung_cancer['Rate'] = lung_cancer['Rate'].replace('^', np.nan)

# Attach each parish polygon to the PM2.5 rows as well
pm25 = pd.merge(pm25, parishes_shapefile[['Name', 'geometry']], left_on='Parish', right_on='Name')

# Force PM2.5 readings to numeric (unparseable entries become NaN), then average them
# per parish and year
pm25['pm2.5'] = pd.to_numeric(pm25['pm2.5'], errors='coerce')
pm25_grouped = pm25.groupby(['Parish', 'Year'])['pm2.5'].mean().reset_index()

# Join the lung cancer rows with the per-parish/per-year PM2.5 means
combined_data = pd.merge(lung_cancer, pm25_grouped, on=['Parish', 'Year'])

# Promote the merged table to a GeoDataFrame so spatial operations can use 'geometry'
combined_data = gpd.GeoDataFrame(combined_data, geometry='geometry')

# Make 'Rate' numeric; anything that is still non-numeric is coerced to NaN
combined_data['Rate'] = pd.to_numeric(combined_data['Rate'], errors='coerce')

# Keep only rows with a usable rate. The explicit .copy() makes this an independent
# frame, so the assignment below does not raise SettingWithCopyWarning.
combined_data_no_nan = combined_data.dropna(subset=['Rate']).copy()
combined_data_no_nan['Rate'] = combined_data_no_nan['Rate'].astype(float)

# Build one Queen-contiguity weights matrix per frame. Each Moran call must use the
# matrix built from the SAME rows as its values; mixing them is what produces
# "ValueError: dimension mismatch".
# NOTE(review): combined_data comes from a merge on (Parish, Year), so the same polygon
# may appear once per year. Repeated polygons would be treated as neighbours of each
# other, which can make Moran's I degenerate -- TODO confirm whether the statistic
# should instead be computed on one row per parish.
weights_matrix = weights.Queen.from_dataframe(combined_data, ids=combined_data.index)
weights_matrix_no_nan = weights.Queen.from_dataframe(combined_data_no_nan, ids=combined_data_no_nan.index)

# Calculate Moran's I for lung cancer rate (rows without a rate are excluded)
lung_cancer_moran = moran.Moran(combined_data_no_nan['Rate'], weights_matrix_no_nan)

# Calculate Moran's I for PM2.5 value (all combined rows)
pm25_moran = moran.Moran(combined_data['pm2.5'], weights_matrix)

print(f"Moran's I for lung cancer rate: {lung_cancer_moran.I}")
print(f"Moran's I for PM2.5 value: {pm25_moran.I}")

# Visualize the spatial distribution

# Centre the base map on the mean of the geometry centroids (computed once, used for
# both coordinates)
parish_centroids = combined_data.geometry.centroid
base_map = folium.Map(location=[parish_centroids.y.mean(), parish_centroids.x.mean()], zoom_start=8)

# Add two choropleth layers with distinct colour schemes:
#   - lung cancer rate on a yellow-to-red scale
#   - PM2.5 value on a purple-to-blue scale
folium.Choropleth(
    geo_data=combined_data,
    data=combined_data,
    columns=['Parish', 'Rate'],
    key_on='feature.properties.Parish',
    fill_color='YlOrRd',
    legend_name='Lung Cancer Rate'
).add_to(base_map)

folium.Choropleth(
    geo_data=combined_data,
    data=combined_data,
    columns=['Parish', 'pm2.5'],
    key_on='feature.properties.Parish',
    fill_color='PuBu',
    legend_name='PM2.5 Value'
).add_to(base_map)

# Display the map
base_map



  lung_cancer['Rate'] = lung_cancer['Rate'].replace('^', pd.np.nan)


ValueError: dimension mismatch

**Spatial Correlation and Moran's I Calculation**

In [None]:
import pandas as pd
import geopandas as gpd
from pysal.lib import weights
from esda import moran
import numpy as np

# Imported before any work starts so every dependency of this cell is visible up front.
import folium

# Load the datasets
lung_cancer = pd.read_csv('lung_cancer_data.csv')
pm25 = pd.read_csv('pm25_data.csv')

# Load the shapefile (a package of files describing geographic features; here, the LA parishes)
parishes_shapefile = gpd.read_file('Louisiana_Parishes.shp')

# Merge the lung cancer data with the shapefile so each row carries its parish geometry
lung_cancer = pd.merge(lung_cancer, parishes_shapefile[['Name', 'geometry']], left_on='Parish', right_on='Name')

# Replace the '^' character with NaN in the 'Rate' column.
# Some entries hold '^' instead of a number; turning them into NaN (not a number) lets
# them be handled as missing values. This happens BEFORE combined_data is built so the
# cleaned values actually reach the analysis. `np.nan` replaces `pd.np.nan`: the `pd.np`
# alias was deprecated in pandas 1.0 and removed in pandas 2.0.
lung_cancer['Rate'] = lung_cancer['Rate'].replace('^', np.nan)

# Merge the PM2.5 data with the shapefile to combine the PM2.5 readings with the
# spatial geometry of each parish
pm25 = pd.merge(pm25, parishes_shapefile[['Name', 'geometry']], left_on='Parish', right_on='Name')

# Prepare the PM2.5 data for merging with the lung cancer data:
# first make sure 'pm2.5' is numeric (unparseable entries become NaN) ...
pm25['pm2.5'] = pd.to_numeric(pm25['pm2.5'], errors='coerce')
# ... then compute the mean PM2.5 value for each parish/year combination
pm25_grouped = pm25.groupby(['Parish', 'Year'])['pm2.5'].mean().reset_index()

# Merge the lung cancer data with the grouped PM2.5 data, matching rows on the
# 'Parish' and 'Year' columns of both dataframes
combined_data = pd.merge(lung_cancer, pm25_grouped, on=['Parish', 'Year'])

# Convert combined_data to a GeoDataFrame. The 'geometry' column holds the parish
# polygons, which allows the combined data to be used for visualization.
combined_data = gpd.GeoDataFrame(combined_data, geometry='geometry')

# Convert 'Rate' to numeric; any remaining non-numeric value is coerced to NaN
combined_data['Rate'] = pd.to_numeric(combined_data['Rate'], errors='coerce')

# Create a new dataframe without the rows whose 'Rate' is NaN. The explicit .copy()
# makes it an independent frame, so the assignment below does not raise
# SettingWithCopyWarning.
combined_data_no_nan = combined_data.dropna(subset=['Rate']).copy()

# Convert 'Rate' to float so the numerical values are in a consistent format for calculations
combined_data_no_nan['Rate'] = combined_data_no_nan['Rate'].astype(float)

# Create one Queen-contiguity weights matrix per dataframe. Both are built here, inside
# this cell, so the cell does not depend on a variable left over from another cell and
# survives Restart & Run All. Each Moran call below must use the matrix built from the
# same rows as its values.
# NOTE(review): combined_data comes from a merge on (Parish, Year), so the same polygon
# may appear once per year. Repeated polygons would be treated as neighbours of each
# other, which can make Moran's I degenerate -- TODO confirm whether the statistic
# should instead be computed on one row per parish.
weights_matrix = weights.Queen.from_dataframe(combined_data, ids=combined_data.index)
weights_matrix_no_nan = weights.Queen.from_dataframe(combined_data_no_nan, ids=combined_data_no_nan.index)

# Calculate Moran's I for lung cancer rate: a measure of the spatial autocorrelation
# of the rate across the parishes.
# weights_matrix_no_nan defines the spatial relationship between the rows;
# combined_data_no_nan provides the lung cancer rate for each row.
lung_cancer_moran = moran.Moran(combined_data_no_nan['Rate'], weights_matrix_no_nan)

# Calculate Moran's I for PM2.5 over all combined rows, then print both statistics
pm25_moran = moran.Moran(combined_data['pm2.5'], weights_matrix)

print(f"Moran's I for lung cancer rate: {lung_cancer_moran.I}")
print(f"Moran's I for PM2.5 value: {pm25_moran.I}")

# Visualize the spatial distribution

# Create a base map using folium, centred on the mean of the geometry centroids
# (computed once, used for both coordinates)
parish_centroids = combined_data.geometry.centroid
base_map = folium.Map(location=[parish_centroids.y.mean(), parish_centroids.x.mean()], zoom_start=8)

# Add choropleth layers: lung cancer rate (yellow/red) and PM2.5 (purple/blue)
folium.Choropleth(
    geo_data=combined_data,
    data=combined_data,
    columns=['Parish', 'Rate'],
    key_on='feature.properties.Parish',
    fill_color='YlOrRd',
    legend_name='Lung Cancer Rate'
).add_to(base_map)

folium.Choropleth(
    geo_data=combined_data,
    data=combined_data,
    columns=['Parish', 'pm2.5'],
    key_on='feature.properties.Parish',
    fill_color='PuBu',
    legend_name='PM2.5 Value'
).add_to(base_map)

# Display the map
base_map

  lung_cancer['Rate'] = lung_cancer['Rate'].replace('^', pd.np.nan)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)
  self.seI_norm = self.VI_norm ** (1 / 2.0)
  self.seI_rand = VIR ** (1 / 2.0)


Moran's I for lung cancer rate: -0.07142857142857144
Moran's I for PM2.5 value: -0.06666666666666668
