In [None]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
import geopandas as gpd
import seaborn as sb
plt.rcParams['figure.figsize'] = (4, 4)

# ETL (extract, transform, load)

## load csv, shp file

In [None]:
# Read the RM-level crop yield table into the main working frame.
csv_path = 'rm_crop_yields_1938_2021.csv'
df_csv = pd.read_csv(csv_path)

In [None]:
df_csv.info()

In [None]:
# Remove the space from multi-word crop names so each crop column is a
# single token.
crop_renames = {
    "Winter Wheat": "WinterWheat",
    "Spring Wheat": "SpringWheat",
    "Fall Rye": "FallRye",
    "Canary Seed": "CanarySeed",
    "Spring Rye": "SpringRye",
    "Tame Hay": "TameHay",
}
df_csv = df_csv.rename(columns=crop_renames)

In [None]:
# Bushel-weight sources:
#   https://saskpulse.com
#   https://www.rayglen.com/grain-conversion-calculator/
#
# Mustard, Sunflowers, Lentils, Canary Seed and Chickpeas are recorded in
# lbs/ac; dividing by the crop's bushel weight (lbs/bu) converts them to bu/ac.
# Tame Hay is in tons/ac and is left alone; all remaining crops are already
# in bu/ac.
#
# NOTE: this rescales df_csv in place, so running the cell a second time
# divides the columns again. Reload the CSV before re-running it.
lbs_per_bushel = {
    'Mustard': 50,
    'Sunflowers': 30,
    'Lentils': 60,
    'CanarySeed': 50,
    'Chickpeas': 60,
}
for crop, bushel_weight in lbs_per_bushel.items():
    df_csv[crop] = df_csv[crop] / bushel_weight

In [None]:
# Read the rural-municipality shapefile.
gdf = gpd.read_file('Rural Municipality.shp')

# Discard the attribute columns that are not used afterwards.
unused_columns = ['PPID', 'EFFDT', 'EXPDT', 'FEATURECD', 'SHAPE_AREA', 'SHAPE_LEN']
gdf = gdf.drop(columns=unused_columns)

# Give the key and name columns the same labels as the main frame.
gdf = gdf.rename(columns={'RMNO': 'RM', 'RMNM': 'Municipality'})

# Cast the shared columns so their dtypes line up with df_csv.
for column, dtype in {'RM': 'int', 'Municipality': 'string'}.items():
    gdf[column] = gdf[column].astype(dtype)


In [None]:
gdf.info()

# EDA (Exploratory Data Analysis)

#### check for unique values

In [None]:
# Number of distinct RM ids in the shapefile. Only the last expression of a
# cell is displayed, so the count is the single thing evaluated here.
gdf['RM'].nunique()
#298 unique RM

#### check for duplicated values

In [None]:
gdf.duplicated().sum()

In [None]:
# Summary statistics of the main frame; describe() already returns a new
# object, so no extra copy is needed.
df_describe = df_csv.describe()
df_describe
# Notes from the summary table:
# - 25017 rows in total
# - Years span 1938 to 2021 (~84 years)
# - 299 RMs, numbered from 1 to 622
# - Spring Wheat, Barley and Oats have the highest counts -> more complete data?
# - Oats, Winter Wheat and Barley have the highest means -> highest yield
# - Tame Hay, Spring Rye and Flax have the lowest means -> lowest yield

In [None]:
# Count the distinct RM ids in the yield table. Only the last expression of a
# cell is displayed, so the count is the single thing evaluated here.
df_csv['RM'].nunique()

In [None]:
#check for duplicated rows
df_csv.duplicated().sum()

In [None]:
# Number of RM records present in each year, showing only the years that do
# not cover every RM (i.e. not all RMs have data for all years).
# The expected count is derived from the data instead of being hard-coded.
rm_total = df_csv['RM'].nunique()
records_per_year = df_csv.groupby('Year')['RM'].count()
records_per_year[records_per_year != rm_total]

In [None]:
# RMs whose number of yearly records differs from the number of distinct
# years in the table (derived from the data instead of being hard-coded).
# The result stays in `temp_df` because the following cell merges it with gdf.
year_total = df_csv['Year'].nunique()
temp_df = df_csv.groupby('RM')['Year'].count()
temp_df = temp_df[temp_df != year_total]
temp_df

In [None]:
#Municipality with less than 84 years of data

pd.merge(temp_df,gdf,on='RM')

In [None]:
# RM ids that occur in the yield table but have no shape in the geodata.

rm_in_csv = df_csv.groupby('RM').size().index
temp_df = rm_in_csv[~rm_in_csv.isin(gdf['RM'])]
temp_df

#278 Kutawa, Prairie No. 408, Greenfield No. 529

In [None]:
# Shapes in the geodata whose RM id never appears in the yield table.

temp_df = df_csv.groupby('RM').size().index
has_yield_data = gdf['RM'].isin(temp_df)
temp_gdf = gdf[~has_yield_data]
temp_gdf

In [None]:

# Mean of every crop column per year (averaged over all RMs), drawn as one
# figure per crop.
yearly_mean = (
    df_csv
    .assign(Year=pd.to_datetime(df_csv['Year'], format='%Y'))
    .groupby('Year')
    .mean()
    .drop(columns=['RM'])  # RM is an identifier, not a yield
)

for idx, crop in enumerate(yearly_mean.columns):
    fig, ax = plt.subplots()
    # Colours come from matplotlib's default cycle ("C0".."C9") so the
    # figures look the same on every run.
    yearly_mean[crop].plot(
        ax=ax,
        title='Average Crop per Year (1938-2021)',
        color=f'C{idx % 10}',
    )
    ax.tick_params(axis='x', rotation=90)
    ax.legend()
    plt.show()

# Feature Selection

In [None]:
# Annotated heatmap of the pairwise correlations between df_csv columns.
corr_matrix = df_csv.corr()
sb.heatmap(corr_matrix, annot=True)

# Observations read off the heatmap:
# Strong correlation between Canola and Spring Wheat, Durum, Oats, Peas, Barley, Flax
# Strong correlation between Spring Wheat and Barley, Canola, Durum, Oats, Flax
# Strong correlation between Durum and Spring Wheat, Barley, Flax
# Strong correlation between Sunflower and Tame Hay
# Strong correlation between Oats and Spring Wheat, Durum, Barley, Flax
# Strong correlation between Peas and Canola, Spring Wheat, Barley
# Strong correlation between Barley and Oats, Canola, Spring Wheat, Barley, Fall Rye, Flax
# Strong correlation between Fall Rye and Spring Wheat, Barley, Flax
# Strong correlation between Tame Hay and Sunflowers
# Strong correlation between Flax and Spring Wheat, Oats, Barley, Canola, Durum

In [None]:
df_csv.iloc[:,2:18].isna().sum().plot(kind='bar',ylabel='Count of missing values (1938-2021)',figsize=(6,6))

In [None]:
#df_csv[df_csv['Year']>=2011].iloc[:,2:18].isna().sum().plot(kind='bar',ylabel='Count of missing values (2011-2021)',figsize=(6,6))



# K-Means Clustering

In [None]:
from sklearn.cluster import KMeans

def call_ElbowMethod(df_meanstd, max_k=11, random_state=0):
    """Plot the K-Means elbow curve for the mean/std yield features.

    Parameters
    ----------
    df_meanstd : DataFrame
        Must contain the columns 'MeanYield' and 'StdYield'.
    max_k : int, default 11
        Largest number of clusters to try. It is capped at the number of
        rows, because KMeans cannot fit more clusters than samples.
    random_state : int or None, default 0
        Seed passed to KMeans so the curve is the same on every run.

    Returns
    -------
    DataFrame
        The two feature columns, ready to be passed to call_KMeans.
    """
    # Features used for clustering
    X = df_meanstd[['MeanYield', 'StdYield']]

    # Elbow method: within-cluster sum of squares (inertia) for each k
    k_values = range(1, min(max_k, len(X)) + 1)
    sse = []
    for k in k_values:
        # n_init is set explicitly because its default differs between
        # scikit-learn releases.
        kmeans = KMeans(n_clusters=k, n_init=10, random_state=random_state)
        kmeans.fit(X)
        sse.append(kmeans.inertia_)

    # Plot the elbow graph
    plt.plot(k_values, sse)
    plt.xlabel('Number of Clusters')
    plt.ylabel('Sum of Squared Distances')
    plt.show()

    return X

In [None]:
def call_KMeans(kcluster, X, random_state=0):
    """Cluster X with K-Means, plot the result and return the labels.

    Parameters
    ----------
    kcluster : int
        Number of clusters to fit.
    X : DataFrame
        Feature table containing the columns 'MeanYield' and 'StdYield'.
    random_state : int or None, default 0
        Seed passed to KMeans. With a fixed seed the cluster numbering is
        the same on every run; without one it is arbitrary.

    Returns
    -------
    ndarray
        Cluster label of each row of X, in row order.
    """
    # n_init is set explicitly because its default differs between
    # scikit-learn releases.
    kmeans = KMeans(n_clusters=kcluster, n_init=10, random_state=random_state)

    # Fit and label the training data in a single step
    labels = kmeans.fit_predict(X)

    # cluster_centers_ follows the column order of X, so look the two plotted
    # features up by name instead of assuming positions 0 and 1.
    centers = kmeans.cluster_centers_
    mean_pos = X.columns.get_loc('MeanYield')
    std_pos = X.columns.get_loc('StdYield')

    # Plot the data points and cluster centers
    plt.scatter(X['MeanYield'], X['StdYield'], c=labels)
    plt.scatter(centers[:, mean_pos], centers[:, std_pos],
                marker='x', s=200, linewidths=3, color='r')
    plt.xlabel('MeanYield')
    plt.ylabel('StdYield')
    plt.show()

    return labels

In [None]:
def showClusterMap(kcluster, croptype, data=None, tiles='Stamen Terrain'):
    """Save an interactive map of the cluster labels per municipality.

    The map is written to "Clustering_<croptype>.html" in the working
    directory. Nothing is returned.

    Parameters
    ----------
    kcluster : int
        Number of colour classes on the map (one per cluster).
    croptype : str
        Crop name, used only in the output file name.
    data : DataFrame, optional
        Per-RM table with 'MeanYield' and 'Labels', keyed by 'RM'. When
        omitted, the notebook-level `df_ms` is used, as before.
    tiles : str, default 'Stamen Terrain'
        Background tile set passed to GeoDataFrame.explore.
        NOTE(review): newer folium releases reportedly no longer bundle the
        Stamen tile sets. If this raises, pass another value, e.g.
        'OpenStreetMap'. Confirm against the installed version.
    """
    if data is None:
        # Fall back to the frame built by the current clustering section.
        data = df_ms

    shapes = gdf[['RM', 'Municipality', 'geometry']]
    cluster_map = pd.merge(shapes, data, on='RM').explore(
        column='Labels',  # column to visualize
        cmap='YlGn',  # colour ramp on the map
        k=kcluster,  # number of colour classes
        scheme='naturalbreaks',  # classification used for the legend
        tiles=tiles,  # map background type
        tooltip=['Municipality', 'MeanYield']  # shown on mouse hover
    )
    cluster_map.save("Clustering_"+croptype+".html")

## Clustering Spring Wheat 

In [None]:
new_df = df_csv[['Year','RM','SpringWheat']].copy()
new_df

In [None]:
new_df.isna().sum()

In [None]:
new_df[new_df['SpringWheat'].isna()].groupby('RM').count()

In [None]:
new_df[new_df['SpringWheat'].isna()].groupby('Year').count()

In [None]:
# Rows of RM 555 where the Spring Wheat yield is missing.
temp_df = new_df.loc[new_df['RM'] == 555]
temp_df.loc[temp_df['SpringWheat'].isna()]

In [None]:
gdf[gdf['RM']==555]

In [None]:
# Mean Spring Wheat yield per RM, as a one-column frame indexed by RM.
df_mean = new_df.groupby('RM')[['SpringWheat']].mean()

In [None]:
# Standard deviation of Spring Wheat yield per RM, as a one-column frame
# indexed by RM.
df_std = new_df.groupby('RM')[['SpringWheat']].std()

In [None]:
# Put the per-RM mean and standard deviation side by side under the generic
# column names the clustering helpers expect.
mean_yield = df_mean.rename(columns={'SpringWheat': 'MeanYield'})
std_yield = df_std.rename(columns={'SpringWheat': 'StdYield'})
df_ms = pd.merge(mean_yield, std_yield, on='RM')

In [None]:
cluster_X = call_ElbowMethod(df_ms)

In [None]:
df_ms['Labels']=call_KMeans(3,cluster_X)

In [None]:
df_ms.groupby('Labels').mean()\
    .sort_values('MeanYield')

In [None]:
# Renumber the clusters by ascending mean yield (0 = lowest-yield cluster) so
# the sequential colour ramp on the map reads low -> high.
# The order is derived from the data because K-Means numbers its clusters
# arbitrarily, and only the 'Labels' column is touched so that yield values
# equal to 0, 1 or 2 are never rewritten. Re-running this cell is a no-op.
yield_order = (
    df_ms.groupby('Labels')['MeanYield']
    .mean()
    .sort_values()
    .index
)
df_ms['Labels'] = df_ms['Labels'].map(
    {old_label: rank for rank, old_label in enumerate(yield_order)}
)

In [None]:
showClusterMap(3,'SpringWheat')

## Clustering Barley

In [None]:
new_df = df_csv[['Year','RM','Barley']].copy()
new_df

In [None]:
new_df.isna().sum()

In [None]:
new_df[new_df['Barley'].isna()].groupby('RM').count()

In [None]:
new_df[new_df['Barley'].isna()].groupby('Year').count()

In [None]:
# Mean Barley yield per RM, as a one-column frame indexed by RM.
df_mean = new_df.groupby('RM')[['Barley']].mean()

In [None]:
# Standard deviation of Barley yield per RM, as a one-column frame indexed
# by RM.
df_std = new_df.groupby('RM')[['Barley']].std()

In [None]:
# Put the per-RM mean and standard deviation side by side under the generic
# column names the clustering helpers expect.
mean_yield = df_mean.rename(columns={'Barley': 'MeanYield'})
std_yield = df_std.rename(columns={'Barley': 'StdYield'})
df_ms = pd.merge(mean_yield, std_yield, on='RM')

In [None]:
cluster_X = call_ElbowMethod(df_ms)

In [None]:
df_ms['Labels']=call_KMeans(3,cluster_X)

In [None]:
df_ms.groupby('Labels').mean()\
    .sort_values('MeanYield')

In [None]:
# Renumber the clusters by ascending mean yield (0 = lowest-yield cluster) so
# the sequential colour ramp on the map reads low -> high.
# The order is derived from the data because K-Means numbers its clusters
# arbitrarily, and only the 'Labels' column is touched so that yield values
# equal to 0, 1 or 2 are never rewritten. Re-running this cell is a no-op.
yield_order = (
    df_ms.groupby('Labels')['MeanYield']
    .mean()
    .sort_values()
    .index
)
df_ms['Labels'] = df_ms['Labels'].map(
    {old_label: rank for rank, old_label in enumerate(yield_order)}
)

In [None]:
showClusterMap(3,'Barley')

## Clustering Oats

In [None]:
new_df = df_csv[['Year','RM','Oats']].copy()
new_df

In [None]:
new_df.isna().sum()

In [None]:
new_df[new_df['Oats'].isna()].groupby('RM').count()

In [None]:
new_df[new_df['Oats'].isna()].groupby('Year').count()

In [None]:
# Mean Oats yield per RM, as a one-column frame indexed by RM.
df_mean = new_df.groupby('RM')[['Oats']].mean()

In [None]:
# Standard deviation of Oats yield per RM, as a one-column frame indexed
# by RM.
df_std = new_df.groupby('RM')[['Oats']].std()

In [None]:
# Put the per-RM mean and standard deviation side by side under the generic
# column names the clustering helpers expect.
mean_yield = df_mean.rename(columns={'Oats': 'MeanYield'})
std_yield = df_std.rename(columns={'Oats': 'StdYield'})
df_ms = pd.merge(mean_yield, std_yield, on='RM')

In [None]:
cluster_X = call_ElbowMethod(df_ms)

In [None]:
df_ms['Labels']=call_KMeans(3,cluster_X)

In [None]:
df_ms.groupby('Labels').mean()\
    .sort_values('MeanYield')

In [None]:
# Renumber the clusters by ascending mean yield (0 = lowest-yield cluster) so
# the sequential colour ramp on the map reads low -> high.
# The order is derived from the data because K-Means numbers its clusters
# arbitrarily, and only the 'Labels' column is touched so that yield values
# equal to 0, 1 or 2 are never rewritten. Re-running this cell is a no-op.
yield_order = (
    df_ms.groupby('Labels')['MeanYield']
    .mean()
    .sort_values()
    .index
)
df_ms['Labels'] = df_ms['Labels'].map(
    {old_label: rank for rank, old_label in enumerate(yield_order)}
)

In [None]:
showClusterMap(3,'Oats')