In [None]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
import geopandas as gpd
import seaborn as sb
plt.rcParams['figure.figsize'] = (20, 12)

# ETL (extract, transform, load)

## load csv, shp file

In [None]:
#load into main df_csv
df_csv = pd.read_csv('rm_crop_yields_1938_2021.csv')

In [None]:
df_csv.info()

In [None]:
df_csv.rename(columns={"Winter Wheat": "WinterWheat", "Spring Wheat": "SpringWheat","Fall Rye":"FallRye",\
             "Canary Seed":"CanarySeed","Spring Rye":"SpringRye","Tame Hay":"TameHay"},inplace=True)

In [None]:
# Unit normalisation. Per the notes kept with this cell, five crops are
# reported in lbs/ac and are divided by their bushel weight to give bu/ac;
# Tame Hay stays in tons/ac and all remaining crops are already bu/ac.
# Bushel weights (lbs/bu) taken from:
#   https://saskpulse.com
#   https://www.rayglen.com/grain-conversion-calculator/
# NOTE: the columns are overwritten, so running this cell twice divides twice.
lbs_per_bushel = {
    'Mustard': 50,
    'Sunflowers': 30,
    'Lentils': 60,
    'CanarySeed': 50,
    'Chickpeas': 60,
}
for crop_name, bushel_weight in lbs_per_bushel.items():
    df_csv[crop_name] = df_csv[crop_name] / bushel_weight

In [None]:
#load shp data
gdf = gpd.read_file('Rural Municipality.shp')

#drop columns that won't be using
gdf.drop(['PPID','EFFDT','EXPDT','FEATURECD','SHAPE_AREA','SHAPE_LEN'],axis=1,inplace=True)

#rename column to match with main df
gdf.rename(columns=
{   'RMNO': 'RM',
    'RMNM': 'Municipality'
}, inplace=True)

#match data type between df_csv and gdf
gdf['RM']=gdf['RM'].astype('int')
gdf['Municipality']=gdf['Municipality'].astype('string')



In [None]:
gdf.info()

# EDA (Exploratory Data Analysis)

#### check for unique values

In [None]:
# Only the last expression of a cell is displayed: the unique() array is
# computed and discarded, and the cell shows the nunique() count.
gdf['RM'].unique()
gdf['RM'].nunique()
# Recorded result: 298 unique RM

#### check for duplicated values

In [None]:
# Count of rows in the boundary data that are exact duplicates of an earlier row.
gdf.duplicated().sum()

In [None]:
# Summary statistics of df_csv, kept in df_describe and displayed below.
df_describe = df_csv.describe().copy()
df_describe
# Observations recorded from this output:
# Total of 25017 rows
# Year from 1938 to 2021 ~ 84 years
# 299 RM from 1 to 622 
# Spring Wheat, Barley and Oats have the most rows -> more complete data?
# Oats, Winter Wheat, Barley have the highest mean -> most yield
# Tame Hay, Spring Rye, Flax have the lowest mean -> least yield


In [None]:
#count unique RM
df_csv['RM'].unique()
df_csv['RM'].nunique()

In [None]:
#check for duplicated rows
df_csv.duplicated().sum()

In [None]:
#check how many records there are each year
#-->not all have data for all years
temp_df= df_csv.groupby('Year').count()['RM']
temp_df = temp_df[temp_df!=299]
temp_df

In [None]:
#check for RM with less than 84 years of data
temp_df= df_csv.groupby('RM').count()['Year']
temp_df = temp_df[temp_df!=84]
temp_df

In [None]:
#Municipality with less than 84 years of data

pd.merge(temp_df,gdf,on='RM')

In [None]:
#check for RM not in geodata

temp_df= df_csv.groupby('RM').count().index
temp_df = temp_df[~temp_df.isin(gdf['RM'])]
temp_df

#278 Kutawa, Prairie No. 408, Greenfield No. 529

In [None]:
#check for geodata RM not in main data set

temp_df= df_csv.groupby('RM').count().index
temp_gdf = gdf[~gdf['RM'].isin(temp_df)]
temp_gdf


# Feature Selection

In [None]:
# Annotated heatmap of the pairwise correlation between all df_csv columns.
sb.heatmap(
    df_csv.corr(),
    annot=True)

# Observations recorded from the heatmap:
#Strong correlation between Canola and Spring Wheat, Durum, Oats, Peas, Barley, Flax
#Strong correlation between Spring Wheat and Barley, Canola, Durum, Oats, Flax
#Strong correlation between Durum and Spring Wheat, Barley, Flax
#Strong correlation between Sunflower and Tame Hay
#Strong correlation between Oats and Spring Wheat, Durum, Barley, Flax
#Strong correlation between Peas and Canola, Spring Wheat, Barley
#Strong correlation between Barley and Oats, Canola, Spring Wheat, Fall Rye, Flax
#Strong correlation between Fall Rye and Spring Wheat, Barley, Flax
#Strong correlation between Tame Hay and Sunflowers
#Strong correlation between Flax and Spring Wheat, Oats, Barley, Canola, Durum

In [None]:
#Average yield per Crop across all years (mean)
fig=df_describe.iloc[1, 2:18].sort_values(axis=0).plot(kind='barh',xlabel ='Yield(bu/acre)', figsize=(4,4),title='Crop Yield (1938 - 2021)').get_figure()
fig.savefig('fig1- Crop Yield 1938 to 2021 (mean).jpg')

In [None]:
df_csv.isna().sum().plot(kind='bar')

In [None]:
temp_df= df_csv.groupby('Year').mean()
temp_df.index = pd.to_datetime(temp_df.index, format='%Y')
temp_df[['WinterWheat']].plot(xlim=[pd.Timestamp('1938-01-01'), pd.Timestamp('2021-01-01')])


In [None]:
# Total crop yield per year: sum every crop column over all RMs for each year.
# RM is an identifier, so it is dropped before summing; Year becomes the index.
df_total_per_year = df_csv.drop(columns='RM').groupby('Year').sum()
# 'Total' adds up the crop columns at positions 0..15. The slice starts at 0
# because Year is the index here, so position 0 is already the first crop;
# starting at 1 would leave that crop out of the total.
df_total_per_year['Total'] = df_total_per_year.iloc[:, 0:16].sum(axis=1)

In [None]:
df_total_per_year.sort_values('Total',ascending=False)

In [None]:
fig = df_total_per_year.plot(y=["Total"], kind="bar", ylabel = 'Yield (bu/acre)', figsize=(10, 10), title = 'Total Crop Yield per Year').get_figure()
fig.savefig('fig2 - Total Crop per Year.jpg')

In [None]:
#Total Crop per Year per Crop
fig = df_total_per_year.iloc[:,0:16].plot(ylabel = 'Yield (bu/acre)', figsize=(10, 10), title = 'Crop Yield per Year').get_figure()
fig.savefig('fig3 - Crop Yield per Year.jpg')

In [None]:
df_total_mean_per_RM = df_csv.copy()
df_total_mean_per_RM = df_total_mean_per_RM.groupby(['RM']).mean()
df_total_mean_per_RM.drop('Year',axis=1,inplace=True)
df_total_mean_per_RM['Total'] = df_total_mean_per_RM.iloc[:,0:16].sum(axis=1)

fig=df_total_mean_per_RM.iloc[:,0:16].plot(ylabel = 'Yield (bu/acre)', figsize=(10, 10), title = 'Crop Yield per RM (1938 - 2021)').get_figure()
fig.savefig('Total Mean per RM.jpg')

In [None]:
df_total_sum_per_RM = df_csv.copy()
df_total_sum_per_RM = df_total_sum_per_RM.groupby(['RM']).sum()
df_total_sum_per_RM.drop('Year',axis=1,inplace=True)
df_total_sum_per_RM['Total'] = df_total_sum_per_RM.iloc[:,0:16].sum(axis=1)

fig=df_total_sum_per_RM.iloc[:,0:16].plot(ylabel = 'Yield (bu/acre)', figsize=(10, 10), title = 'Crop Yield per RM (1938 - 2021)').get_figure()
fig.savefig('Total Sum per RM.jpg')

In [None]:
# Best crop per year: for every year, the name of the crop column (positions
# 0..15 of df_total_per_year) holding the largest summed yield.
# idxmax(axis=1) is used rather than sort_values(by=1, axis=1): with axis=1,
# `by` must be a row label, and the row labels here are years, so 1 is not one.
# NOTE(review): assumes the goal is one winning crop per year — confirm.
df_yr_best_crop = (
    df_total_per_year.iloc[:, 0:16]
    .idxmax(axis=1)
    .to_frame(name='Crop')
)

## 20-year period snapshots

In [None]:
#df_total_per_RM_38to60
df_total_per_RM_38to60 = df_csv[df_csv['Year'].isin(range(1938,1960))]
df_total_per_RM_38to60 = df_total_per_RM_38to60.groupby(['RM']).mean()
df_total_per_RM_38to60.drop('Year',axis=1,inplace=True)
df_total_per_RM_38to60['Total'] = df_total_per_RM_38to60.iloc[:,0:16].sum(axis=1)


In [None]:
m=gpd.GeoDataFrame(pd.merge(
    df_total_per_RM_38to60,
    gdf,
    on='RM'
)).explore(
    column='Total', tooltip = ['Municipality','Total']
)

m.save("RMMean38to60.html")

In [None]:
#df_total_per_RM_61to80
df_total_per_RM_61to80 = df_csv[df_csv['Year'].isin(range(1961,1980))]
df_total_per_RM_61to80 = df_total_per_RM_61to80.groupby(['RM']).mean()
df_total_per_RM_61to80.drop('Year',axis=1,inplace=True)
df_total_per_RM_61to80['Total'] = df_total_per_RM_61to80.iloc[:,0:16].sum(axis=1)


In [None]:
m=gpd.GeoDataFrame(pd.merge(
    df_total_per_RM_61to80,
    gdf,
    on='RM'
)).explore(
    column='Total', tooltip = ['Municipality','Total']
)
m.save("RMMean61to80.html")

In [None]:
#df_total_per_RM_81to00
df_total_per_RM_81to00 = df_csv[df_csv['Year'].isin(range(1981,2000))]
df_total_per_RM_81to00 = df_total_per_RM_81to00.groupby(['RM']).mean()
df_total_per_RM_81to00.drop('Year',axis=1,inplace=True)
df_total_per_RM_81to00['Total'] = df_total_per_RM_81to00.iloc[:,0:16].sum(axis=1)


In [None]:
m=gpd.GeoDataFrame(pd.merge(
    df_total_per_RM_81to00,
    gdf,
    on='RM'
)).explore(
    column='Total', tooltip = ['Municipality','Total']
)
m.save("RMMean81to00.html")

In [None]:

#df_total_per_RM_01to21
df_total_per_RM_01to21 = df_csv[df_csv['Year'].isin(range(2001,2021))]
df_total_per_RM_01to21 = df_total_per_RM_01to21.groupby(['RM']).mean()
df_total_per_RM_01to21.drop('Year',axis=1,inplace=True)
df_total_per_RM_01to21['Total'] = df_total_per_RM_01to21.iloc[:,0:16].sum(axis=1)


In [None]:
m=gpd.GeoDataFrame(pd.merge(
    df_total_per_RM_01to21,
    gdf,
    on='RM'
)).explore(
    column='Total', tooltip = ['Municipality','Total']
)
m.save("RMMean01to21.html")

## 10 year period snapshot from 2001 to 2021

In [None]:
#df_total_per_RM_01to21
df_total_per_RM_01to10 = df_csv[df_csv['Year'].isin(range(2001,2010))]
df_total_per_RM_01to10 = df_total_per_RM_01to10.groupby(['RM']).mean()
df_total_per_RM_01to10.drop('Year',axis=1,inplace=True)
df_total_per_RM_01to10['Total'] = df_total_per_RM_01to10.iloc[:,0:16].sum(axis=1)


In [None]:
m=gpd.GeoDataFrame(pd.merge(
    df_total_per_RM_01to10,
    gdf,
    on='RM'
)).explore(
    column='Total', tooltip = ['Municipality','Total']
)
m.save("RMMean01to10.html")

In [None]:
#df_total_per_RM_01to21
df_total_per_RM_11to21 = df_csv[df_csv['Year'].isin(range(2011,2021))]
df_total_per_RM_11to21 = df_total_per_RM_11to21.groupby(['RM']).mean()
df_total_per_RM_11to21.drop('Year',axis=1,inplace=True)
df_total_per_RM_11to21['Total'] = df_total_per_RM_11to21.iloc[:,0:16].sum(axis=1)


In [None]:
m=gpd.GeoDataFrame(pd.merge(
    df_total_per_RM_11to21,
    gdf,
    on='RM'
)).explore(
    column='Total', tooltip = ['Municipality','Total']
)
m.save("RMMean11to21.html")

In [None]:
#fig = df_total_per_RM_11to21.plot(kind='bar', stacked=True, figsize = (16,8))

In [None]:
# Disabled draft: overlay the ten RMs with the largest 2011-2021 'Total' on map `m`.
# Every line stays commented out. Leaving the indented continuation lines of
# the explore() call live is a syntax error that stops the whole cell.
#df_RM_Marker = df_total_per_RM_11to21[df_total_per_RM_11to21.index.isin(df_total_per_RM_11to21.nlargest(10,'Total').index)]
#df_total_per_RM['Marker'] = df_total_per_RM['RM'].apply(lambda x: 'circle' if x in df_RM_Marker['RM'] else 'marker')
#
#gpd.GeoDataFrame(pd.merge(
#    df_RM_Marker,
#    gdf,
#    on='RM'
#)).explore(
#    m=m, marker_type="circle")


# Ten RMs with the largest 2011-2021 'Total' (this bare expression is not the
# cell's last statement, so it is evaluated but not displayed).
df_total_per_RM_11to21.nlargest(n=10, columns='Total')

df_total_RM_top10 = df_total_per_RM_11to21.nlargest(n=10, columns='Total')

# Stacked bars: one bar per top-10 RM, one segment per crop (columns 0..15).
top10_crop_means = df_total_RM_top10.iloc[:, 0:16]
top10_crop_means.plot(kind='bar', stacked=True, figsize=(16, 8))

# Mean yield per year (averaged over RMs) for 2011-2021, end years included.
# Series.between is inclusive on both sides; range(2011, 2021) stops at 2020.
df_total_per_Year_11to21 = (
    df_csv[df_csv['Year'].between(2011, 2021)]
    .groupby(['Year'])
    .mean()
    .drop(columns='RM')
)
# 'Total' = sum of the per-crop means at column positions 0..15.
df_total_per_Year_11to21['Total'] = df_total_per_Year_11to21.iloc[:, 0:16].sum(axis=1)


# Column-wise sum over the selected years.
df_total_per_Year_11to21.sum()

# NOTE(review): as exported, the statements below share one cell, so only the
# value of the final explore() call (Mustard) would be displayed; the earlier
# maps and tables are built and discarded. Confirm whether these were meant
# to be separate cells.

# Map of the ten RMs with the largest 2011-2021 'Total'.
gpd.GeoDataFrame(pd.merge(
    df_total_RM_top10,
    gdf,
    on='RM'
)).explore(
    column='Total', tooltip = ['Municipality','Total']
)

# Line plot of the per-crop yearly means (columns 0..15), saved to disk.
# NOTE(review): the file name reads "Toal" (probably "Total") and has no
# extension — left unchanged in case something depends on the current name.
fig=df_total_per_Year_11to21.iloc[:,0:16].plot().get_figure()
fig.savefig("Toal Crop Yield per Year 2011 to 21")

# Ten RMs with the highest mean Chickpeas yield, as a table and then on a map.
df_total_per_RM_11to21.nlargest(10,'Chickpeas')

gpd.GeoDataFrame(pd.merge(
    df_total_per_RM_11to21.nlargest(10,'Chickpeas'),
    gdf,
    on='RM'
)).explore(
    column='Total', tooltip = ['Municipality','Total']
)

# Ten RMs with the highest mean Lentils yield (coloured by 'Total').
gpd.GeoDataFrame(pd.merge(
    df_total_per_RM_11to21.nlargest(10,'Lentils'),
    gdf,
    on='RM'
)).explore(
    column='Total', tooltip = ['Municipality','Total']
)

# Ten RMs with the highest mean Canola yield (coloured by 'Total').
gpd.GeoDataFrame(pd.merge(
    df_total_per_RM_11to21.nlargest(10,'Canola'),
    gdf,
    on='RM'
)).explore(
    column='Total', tooltip = ['Municipality','Total']
)

# Ten RMs with the highest mean Mustard yield (coloured by 'Total').
gpd.GeoDataFrame(pd.merge(
    df_total_per_RM_11to21.nlargest(10,'Mustard'),
    gdf,
    on='RM'
)).explore(
    column='Total', tooltip = ['Municipality','Total']
)

## Clustering

In [None]:
new_df = df_csv[df_csv['Year']>=2011].copy()

In [None]:
new_df.reset_index(drop=True,inplace=True)
new_df.drop('Year',axis=1,inplace=True)
sb.heatmap(new_df.corr(), annot=True)
plt.show()

#### Canola

In [None]:
new_df = df_csv[df_csv['Year']>=2011]
new_df.reset_index(inplace=True)

In [None]:
df_mean=new_df.groupby(['RM'])['Canola'].mean()\
    .to_frame()

In [None]:
df_std=new_df.groupby(['RM'])['Canola'].std()\
    .to_frame()

In [None]:
df_mean.reset_index(inplace=True)
df_std.reset_index(inplace=True)

In [None]:
df_ms=pd.merge(
    df_mean.rename(columns={'Canola': 'MeanYield'}),
    df_std.rename(columns={'Canola': 'StdYield'}),
    on='RM'
)

In [None]:
from sklearn.cluster import KMeans

# Define the features to be used for clustering
X = df_ms[['MeanYield', 'StdYield']]

# Elbow method to determine the optimal number of clusters
sse = []
for k in range(1, 11):
    kmeans = KMeans(n_clusters=k)
    kmeans.fit(X)
    sse.append(kmeans.inertia_)
    
# Plot the elbow graph
plt.plot(range(1, 11), sse)
plt.xlabel('Number of Clusters')
plt.ylabel('Sum of Squared Distances')
plt.savefig("elbow")
plt.show()


In [None]:
# Fit the KMeans model with the optimal number of clusters
kmeans = KMeans(n_clusters=3)
kmeans.fit(X)

# Predict the cluster labels for each data point
labels = kmeans.predict(X)

# Plot the data points and cluster centers
plt.scatter(X['MeanYield'], X['StdYield'], c=labels)
plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], marker='x', s=200, linewidths=3, color='r')
plt.xlabel('MeanYield')
plt.ylabel('StdYield')
plt.savefig("cluster")
plt.show()


In [None]:
df_ms['Labels']=labels

In [None]:
df_ms.groupby('Labels').mean()\
    .sort_values('MeanYield')

In [None]:
df_ms.replace(to_replace={
    0:2,
    1:0,
    2:1}, inplace=True)


In [None]:
# Join the cluster table onto the municipality polygons.
clusters_gdf = pd.merge(
    gdf[['RM', 'Municipality', 'geometry']],
    df_ms,
    on='RM',
)
# NOTE(review): recent folium/xyzservices releases reportedly no longer ship
# the 'Stamen Terrain' tiles — confirm this still renders in the target env.
m = clusters_gdf.explore(
    column='Labels',          # value that drives the colouring
    cmap='YlGn',              # colour ramp
    k=3,                      # number of colour classes (3 clusters)
    scheme='naturalbreaks',   # classification scheme used for the legend
    tiles='Stamen Terrain',   # background tiles
    tooltip=['Municipality', 'MeanYield', 'Labels'],  # fields shown on hover
)
m.save("Clustering_Canola.html")

#### Chickpeas

In [None]:
new_df = df_csv[df_csv['Year']>=2011].copy()
new_df.reset_index(inplace=True)

In [None]:
new_df['Chickpeas'].isna().sum()

In [None]:
new_df.dropna(subset=['Chickpeas'],inplace=True)

In [None]:
new_df['Chickpeas'].isna().sum()

In [None]:
df_mean=new_df.groupby(['RM'])['Chickpeas'].mean()\
    .to_frame()

In [None]:
df_std=new_df.groupby(['RM'])['Chickpeas'].std()\
    .to_frame().fillna(0)

In [None]:
df_mean.reset_index(inplace=True)
df_std.reset_index(inplace=True)

In [None]:
df_std['Chickpeas'].isna().sum()

In [None]:
df_ms=pd.merge(
    df_mean.rename(columns={'Chickpeas': 'MeanYield'}),
    df_std.rename(columns={'Chickpeas': 'StdYield'}),
    on='RM'
)

In [None]:
from sklearn.cluster import KMeans

# Define the features to be used for clustering
X = df_ms[['MeanYield', 'StdYield']]

# Elbow method to determine the optimal number of clusters
sse = []
for k in range(1, 11):
    kmeans = KMeans(n_clusters=k)
    kmeans.fit(X)
    sse.append(kmeans.inertia_)
    
# Plot the elbow graph
plt.plot(range(1, 11), sse)
plt.xlabel('Number of Clusters')
plt.ylabel('Sum of Squared Distances')
plt.show()

In [None]:
# Fit the KMeans model with the optimal number of clusters
kmeans = KMeans(n_clusters=3)
kmeans.fit(X)

# Predict the cluster labels for each data point
labels = kmeans.predict(X)

# Plot the data points and cluster centers
plt.scatter(X['MeanYield'], X['StdYield'], c=labels)
plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], marker='x', s=200, linewidths=3, color='r')
plt.xlabel('MeanYield')
plt.ylabel('StdYield')
plt.show()

In [None]:
df_ms['Labels']=labels

In [None]:
df_ms.groupby('Labels').mean()\
    .sort_values('MeanYield')

In [None]:
#df_ms.replace(to_replace={
#    0:1,
#    1:0}, inplace=True
#)

In [None]:
# Join the cluster table onto the municipality polygons.
clusters_gdf = pd.merge(
    gdf[['RM', 'Municipality', 'geometry']],
    df_ms,
    on='RM',
)
# NOTE(review): recent folium/xyzservices releases reportedly no longer ship
# the 'Stamen Terrain' tiles — confirm this still renders in the target env.
m = clusters_gdf.explore(
    column='Labels',          # value that drives the colouring
    cmap='YlGn',              # colour ramp
    k=3,                      # number of colour classes (3 clusters)
    scheme='naturalbreaks',   # classification scheme used for the legend
    tiles='Stamen Terrain',   # background tiles
    tooltip=['Municipality', 'MeanYield', 'Labels'],  # fields shown on hover
)
m.save("Clustering_Chickpeas.html")

#### Lentils

In [None]:
new_df = df_csv[df_csv['Year']>=2011].copy()
new_df.reset_index(inplace=True)

In [None]:
new_df['Lentils'].isna().sum()

In [None]:
new_df.dropna(subset=['Lentils'],inplace=True)

In [None]:
new_df['Lentils'].isna().sum()

In [None]:
df_mean=new_df.groupby(['RM'])['Lentils'].mean()\
    .to_frame()

In [None]:
df_std=new_df.groupby(['RM'])['Lentils'].std()\
    .to_frame().fillna(0)

In [None]:
df_mean.reset_index(inplace=True)
df_std.reset_index(inplace=True)

In [None]:
df_std['Lentils'].isna().sum()

In [None]:
df_ms=pd.merge(
    df_mean.rename(columns={'Lentils': 'MeanYield'}),
    df_std.rename(columns={'Lentils': 'StdYield'}),
    on='RM'
)

In [None]:
from sklearn.cluster import KMeans

# Define the features to be used for clustering
X = df_ms[['MeanYield', 'StdYield']]

# Elbow method to determine the optimal number of clusters
sse = []
for k in range(1, 11):
    kmeans = KMeans(n_clusters=k)
    kmeans.fit(X)
    sse.append(kmeans.inertia_)
    
# Plot the elbow graph
plt.plot(range(1, 11), sse)
plt.xlabel('Number of Clusters')
plt.ylabel('Sum of Squared Distances')
plt.show()

In [None]:
# Fit the KMeans model with the optimal number of clusters
kmeans = KMeans(n_clusters=3)
kmeans.fit(X)

# Predict the cluster labels for each data point
labels = kmeans.predict(X)

# Plot the data points and cluster centers
plt.scatter(X['MeanYield'], X['StdYield'], c=labels)
plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], marker='x', s=200, linewidths=3, color='r')
plt.xlabel('MeanYield')
plt.ylabel('StdYield')
plt.show()

In [None]:
df_ms['Labels']=labels

In [None]:
df_ms.groupby('Labels').mean()\
    .sort_values('MeanYield')

In [None]:
df_ms.replace(to_replace={
    2:1,
#    0:1,
    1:2}, inplace=True
)

In [None]:
# Join the cluster table onto the municipality polygons.
clusters_gdf = pd.merge(
    gdf[['RM', 'Municipality', 'geometry']],
    df_ms,
    on='RM',
)
# NOTE(review): recent folium/xyzservices releases reportedly no longer ship
# the 'Stamen Terrain' tiles — confirm this still renders in the target env.
m = clusters_gdf.explore(
    column='Labels',          # value that drives the colouring
    cmap='YlGn',              # colour ramp
    k=3,                      # number of colour classes (3 clusters)
    scheme='naturalbreaks',   # classification scheme used for the legend
    tiles='Stamen Terrain',   # background tiles
    tooltip=['Municipality', 'MeanYield', 'Labels'],  # fields shown on hover
)
m.save("Clustering_Lentils.html")

#### Mustard

In [None]:
new_df = df_csv[df_csv['Year']>=2011].copy()
new_df.reset_index(inplace=True)


In [None]:
new_df['Mustard'].isna().sum()

In [None]:
new_df.dropna(subset=['Mustard'],inplace=True)

In [None]:
new_df['Mustard'].isna().sum()

In [None]:
df_mean=new_df.groupby(['RM'])['Mustard'].mean()\
    .to_frame()

In [None]:
df_std=new_df.groupby(['RM'])['Mustard'].std()\
    .to_frame().fillna(0)

In [None]:
df_mean.reset_index(inplace=True)
df_std.reset_index(inplace=True)

In [None]:
df_std['Mustard'].isna().sum()

In [None]:
df_ms=pd.merge(
    df_mean.rename(columns={'Mustard': 'MeanYield'}),
    df_std.rename(columns={'Mustard': 'StdYield'}),
    on='RM'
)

In [None]:
from sklearn.cluster import KMeans

# Define the features to be used for clustering
X = df_ms[['MeanYield', 'StdYield']]

# Elbow method to determine the optimal number of clusters
sse = []
for k in range(1, 11):
    kmeans = KMeans(n_clusters=k)
    kmeans.fit(X)
    sse.append(kmeans.inertia_)
    
# Plot the elbow graph
plt.plot(range(1, 11), sse)
plt.xlabel('Number of Clusters')
plt.ylabel('Sum of Squared Distances')
plt.show()

In [None]:
# Fit the KMeans model with the optimal number of clusters
kmeans = KMeans(n_clusters=3)
kmeans.fit(X)

# Predict the cluster labels for each data point
labels = kmeans.predict(X)

# Plot the data points and cluster centers
plt.scatter(X['MeanYield'], X['StdYield'], c=labels)
plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], marker='x', s=200, linewidths=3, color='r')
plt.xlabel('MeanYield')
plt.ylabel('StdYield')
plt.show()

In [None]:
df_ms['Labels']=labels

In [None]:
df_ms.groupby('Labels').mean()\
    .sort_values('MeanYield')

In [None]:
df_ms.replace(to_replace={
    0:1,
    1:2,
    2:0}, inplace=True
)

In [None]:
# Join the cluster table onto the municipality polygons.
clusters_gdf = pd.merge(
    gdf[['RM', 'Municipality', 'geometry']],
    df_ms,
    on='RM',
)
# NOTE(review): recent folium/xyzservices releases reportedly no longer ship
# the 'Stamen Terrain' tiles — confirm this still renders in the target env.
m = clusters_gdf.explore(
    column='Labels',          # value that drives the colouring
    cmap='YlGn',              # colour ramp
    k=3,                      # number of colour classes (3 clusters)
    scheme='naturalbreaks',   # classification scheme used for the legend
    tiles='Stamen Terrain',   # background tiles
    tooltip=['Municipality', 'MeanYield', 'Labels'],  # fields shown on hover
)
m.save("Clustering_Mustard.html")

#### Canary Seed

In [None]:
new_df = df_csv[df_csv['Year']>=2011].copy()
new_df.reset_index(drop=True, inplace=True)

In [None]:
new_df

In [None]:
#sb.boxplot(new_df['Canary Seed'])
new_df.boxplot(column='Canary Seed',figsize=(4,4))
#plt.boxplot(new_df['Canary Seed'])
#plt.show()


In [None]:
sb.displot(new_df['Canary Seed'])

In [None]:
new_df['Canary Seed'].isna().sum()

In [None]:
new_df.dropna(subset=['Canary Seed'],inplace=True)

In [None]:
new_df['Canary Seed'].isna().sum()

In [None]:
df_mean=new_df.groupby(['RM'])['Canary Seed'].mean()\
    .to_frame()

In [None]:
df_std=new_df.groupby(['RM'])['Canary Seed'].std()\
    .to_frame().fillna(0)

In [None]:
df_mean.reset_index(inplace=True)
df_std.reset_index(inplace=True)

In [None]:
df_std['Canary Seed'].isna().sum()

In [None]:
df_ms=pd.merge(
    df_mean.rename(columns={'Canary Seed': 'MeanYield'}),
    df_std.rename(columns={'Canary Seed': 'StdYield'}),
    on='RM'
)

In [None]:
from sklearn.cluster import KMeans

# Define the features to be used for clustering
X = df_ms[['MeanYield', 'StdYield']]

# Elbow method to determine the optimal number of clusters
sse = []
for k in range(1, 11):
    kmeans = KMeans(n_clusters=k)
    kmeans.fit(X)
    sse.append(kmeans.inertia_)
    
# Plot the elbow graph
plt.plot(range(1, 11), sse)
plt.xlabel('Number of Clusters')
plt.ylabel('Sum of Squared Distances')
plt.show()

In [None]:
# Fit the KMeans model with the optimal number of clusters
kmeans = KMeans(n_clusters=3)
kmeans.fit(X)

# Predict the cluster labels for each data point
labels = kmeans.predict(X)

# Plot the data points and cluster centers
plt.scatter(X['MeanYield'], X['StdYield'], c=labels)
plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], marker='x', s=200, linewidths=3, color='r')
plt.xlabel('MeanYield')
plt.ylabel('StdYield')
plt.show()

In [None]:
df_ms['Labels']=labels

In [None]:
df_ms.groupby('Labels').mean()\
    .sort_values('MeanYield')

In [None]:
df_ms.replace(to_replace={
    0:1,
    1:0}, inplace=True
)

In [None]:
# Join the cluster table onto the municipality polygons.
clusters_gdf = pd.merge(
    gdf[['RM', 'Municipality', 'geometry']],
    df_ms,
    on='RM',
)
# NOTE(review): recent folium/xyzservices releases reportedly no longer ship
# the 'Stamen Terrain' tiles — confirm this still renders in the target env.
m = clusters_gdf.explore(
    column='Labels',          # value that drives the colouring
    cmap='YlGn',              # colour ramp
    k=3,                      # number of colour classes (3 clusters)
    scheme='naturalbreaks',   # classification scheme used for the legend
    tiles='Stamen Terrain',   # background tiles
    tooltip=['Municipality', 'MeanYield', 'Labels'],  # fields shown on hover
)
m.save("Clustering_CanarySeed.html")

In [None]:
# RM codes that have at least one non-missing Chickpeas value.
df_csv[~df_csv['Chickpeas'].isna()]['RM'].unique()

In [None]:
# The four bare expressions below are evaluated but not displayed, because
# only the last expression of a cell (the map) becomes its output.
df_csv['Chickpeas'].sum()
df_csv['Chickpeas'].count()
df_csv[~df_csv['Chickpeas'].isna()]['RM']
df_csv['RM'].nunique()
# Map of the mean Chickpeas yield per RM, using only the rows where a
# Chickpeas value is present.
gpd.GeoDataFrame(pd.merge(
    df_csv[~df_csv['Chickpeas'].isna()].groupby('RM').mean(),
    gdf,
    on='RM'
)).explore(
    column='Chickpeas', tooltip = ['Municipality','Chickpeas']
)

In [None]:
# Grand total per crop over every RM and year: drop the two identifier
# columns, sum the rest, and draw the sorted totals as horizontal bars.
df_total_sum = df_csv.drop(columns=['Year', 'RM']).sum()
total_sum_ax = df_total_sum.sort_values(axis=0).plot(
    kind='barh',
    figsize=(8, 8),
    title='Total Sum of Crop Yield (1938 - 2021)',
    xlabel='Yield(bu/acre)',
)
fig = total_sum_ax.get_figure()
fig.savefig('Total Sum of Crop Yield 1938 to 2021')

