## Objective

In [None]:
# Are you looking to find the optimal location for a canola crush facility?
# Are you looking to support an investment decision?
# 

## ETL

In [None]:
# Reading Essential Libraries
import numpy as np
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt

# Directories
# The data folder defaults to the original author's local path; set the
# PALETTE_DATA_DIR environment variable to run the notebook on another machine
# without editing this cell.
import os

general_directory = os.environ.get(
    'PALETTE_DATA_DIR',
    '/Users/Roy/Desktop/PaletteSkills/Stream-3__AI_ML_Doc/Palette_Cohort_7-main/Final-project/data/',
)
precip_directory = 'july_2023_precip_sum.csv'
rm_gis_directory = 'rm_geo_json.geojson'
rm_yields_directory = 'rm_yields_2004_2023_weighted_avg.csv'

# Loading datasets
# os.path.join (rather than string concatenation) keeps the paths valid whether
# or not general_directory ends with a path separator.
df_precip = pd.read_csv(os.path.join(general_directory, precip_directory))
df_yield = pd.read_csv(os.path.join(general_directory, rm_yields_directory))
gdf_rm = gpd.read_file(os.path.join(general_directory, rm_gis_directory), engine="pyogrio")

# Preview the first rows only instead of dumping the whole frame into the output
df_yield.head()

In [None]:
# Historical mean and standard deviation per (Matched_RM, Crop); these two
# statistics are the features used for clustering further down.
df_yield_agg = (
    df_yield
    .drop(columns='Year')
    .groupby(['Matched_RM', 'Crop'])
    .agg(['mean', 'std'])
)

# Flatten the two-level column labels into '<column>_<statistic>'
df_yield_agg.columns = ['_'.join(parts).strip() for parts in df_yield_agg.columns]

# Drop FIELD PEAS, restore the (Matched_RM, Crop) index, and discard any row
# that still contains a null value.
df_yield_final = (
    df_yield_agg
    .reset_index()
    .loc[lambda frame: frame['Crop'] != 'FIELD PEAS']
    .set_index(['Matched_RM', 'Crop'])
    .dropna()
)

df_yield_final


## Exploratory Data Analysis

In [None]:
# Choropleth of the historical mean barley yield per RM.
# df_yield_final is indexed by (Matched_RM, Crop), so 'Crop' is an index level
# rather than a column; the index is reset before filtering on it (indexing the
# frame with ['Crop'] directly raises a KeyError).
barley_yield = df_yield_final.reset_index()
barley_yield = barley_yield[barley_yield['Crop'] == 'BARLEY']

ax = pd.merge(
    gdf_rm,
    barley_yield,
    on='Matched_RM').plot(
        'Weighted_Avg_Yield_mean',
        legend=True,
        cmap='RdYlGn',
        figsize=(12, 8)
    )
# Title so the figure stands on its own; trailing ';' hides the Text repr
ax.set_title('Historical mean weighted-average yield by RM: BARLEY');



## Feature Engineering

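The clustering features are the historical mean and standard deviation of the weighted-average yield for each RM and crop. They were already computed in the ETL section above.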

## Unsupervised ML models

### Parameter Tuning

#### Elbow Method

In [None]:
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Function to calculate WCSS (Within-Cluster-Sum-of-Squares) for the Elbow method
def calculate_wcss(data, max_clusters=10):
    """Return the KMeans inertia (WCSS) for every k from 1 to ``max_clusters``.

    Parameters
    ----------
    data : array-like of shape (n_samples, n_features)
        Feature matrix handed to ``KMeans.fit``. KMeans needs at least as many
        samples as clusters, so ``data`` must have >= ``max_clusters`` rows.
    max_clusters : int, default 10
        Largest number of clusters to evaluate (inclusive). The default keeps
        the original 1..10 sweep.

    Returns
    -------
    list of float
        ``inertia_`` of the fitted model for k = 1, 2, ..., ``max_clusters``.
    """
    wcss = []
    for n_clusters in range(1, max_clusters + 1):
        # n_init is pinned explicitly: its default changed across scikit-learn
        # releases (10 -> 'auto'), which emits a FutureWarning on some versions
        # and would otherwise make the curve depend on the installed version.
        kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
        kmeans.fit(data)
        wcss.append(kmeans.inertia_)
    return wcss

# Crops that each get their own elbow curve
crops = ['ARGENTINE CANOLA', 'BARLEY', 'OATS', 'RED SPRING WHEAT']

# 2x2 grid: one panel per crop, filled in row-major order
fig, axs = plt.subplots(2, 2, figsize=(10, 10))

for ax, crop in zip(axs.flat, crops):
    # Keep only the rows whose 'Crop' index level matches the current crop
    crop_data = df_yield_final.loc[df_yield_final.index.get_level_values('Crop') == crop]

    # Features: historical mean and standard deviation of the weighted-average yield
    X = crop_data[['Weighted_Avg_Yield_mean', 'Weighted_Avg_Yield_std']].values

    # Standardise both features to zero mean / unit variance before clustering
    scaled_X = StandardScaler().fit_transform(X)

    # WCSS for k = 1..10
    wcss = calculate_wcss(scaled_X)

    print(wcss)

    # Draw this crop's elbow curve on its own panel
    ax.plot(range(1, 11), wcss, marker='o')
    ax.set(
        title=f'Elbow Method for {crop}',
        xlabel='Number of clusters',
        ylabel='WCSS',
    )

plt.tight_layout()
plt.show()

#### Silhouette Score

In [None]:
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score

# Function to calculate silhouette scores for different numbers of clusters
def calculate_silhouette(data, max_clusters=10):
    """Return the silhouette score of a KMeans fit for every k from 2 to ``max_clusters``.

    Parameters
    ----------
    data : array-like of shape (n_samples, n_features)
        Feature matrix that is clustered and then scored. ``silhouette_score``
        needs fewer clusters than samples, so ``data`` must have more than
        ``max_clusters`` rows.
    max_clusters : int, default 10
        Largest number of clusters to evaluate (inclusive). The default keeps
        the original 2..10 sweep.

    Returns
    -------
    list of float
        Silhouette score for k = 2, 3, ..., ``max_clusters``.
    """
    silhouette_scores = []
    # The sweep starts at 2: a silhouette score is undefined for a single cluster
    for n_clusters in range(2, max_clusters + 1):
        # n_init is pinned explicitly: its default changed across scikit-learn
        # releases (10 -> 'auto'), which emits a FutureWarning on some versions
        # and would otherwise make the scores depend on the installed version.
        kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
        labels = kmeans.fit_predict(data)
        silhouette_scores.append(silhouette_score(data, labels))
    return silhouette_scores

# Crops that each get their own silhouette curve
crops = ['ARGENTINE CANOLA', 'BARLEY', 'OATS', 'RED SPRING WHEAT']

# 2x2 grid: one panel per crop, filled in row-major order
fig, axs = plt.subplots(2, 2, figsize=(10, 10))

for panel, crop in zip(axs.ravel(), crops):
    # Select the rows whose 'Crop' index level equals the current crop
    is_current_crop = df_yield_final.index.get_level_values('Crop') == crop
    crop_data = df_yield_final.loc[is_current_crop]

    # Features: historical mean and standard deviation of the weighted-average yield
    X = crop_data[['Weighted_Avg_Yield_mean', 'Weighted_Avg_Yield_std']].values

    # Standardise both features to zero mean / unit variance before clustering
    scaler = StandardScaler()
    scaled_X = scaler.fit_transform(X)

    # Silhouette score for k = 2..10
    silhouette_scores = calculate_silhouette(scaled_X)

    # Draw this crop's curve on its own panel
    panel.plot(range(2, 11), silhouette_scores, marker='o')
    panel.set(
        title=f'Silhouette Score for {crop}',
        xlabel='Number of clusters',
        ylabel='Silhouette Score',
    )

plt.tight_layout()
plt.show()

## Model Training

In [92]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# List of crops you are working with
crops = ['ARGENTINE CANOLA', 'BARLEY', 'OATS', 'RED SPRING WHEAT']

# Column for the 4-cluster label of every (Matched_RM, Crop) row.
# Rows belonging to a crop that is not in `crops` keep None.
df_yield_final['Cluster_4'] = None

# The index does not change inside the loop, so the 'Crop' level is read once
crop_level = df_yield_final.index.get_level_values('Crop')

# Each crop is scaled and clustered separately, so a cluster label is only
# meaningful within its own crop.
for crop in crops:
    # Rows of the current crop
    crop_mask = crop_level == crop
    crop_data = df_yield_final.loc[crop_mask]

    # Extract features for clustering
    X = crop_data[['Weighted_Avg_Yield_mean', 'Weighted_Avg_Yield_std']].values

    # Normalize the data using StandardScaler
    scaler = StandardScaler()
    scaled_X = scaler.fit_transform(X)

    # Apply KMeans clustering for 4 clusters.
    # n_init is pinned explicitly: its default changed across scikit-learn
    # releases (10 -> 'auto'), which emits a FutureWarning on some versions and
    # would otherwise make the labels depend on the installed version.
    kmeans_4 = KMeans(n_clusters=4, n_init=10, random_state=42)
    df_yield_final.loc[crop_mask, 'Cluster_4'] = kmeans_4.fit_predict(scaled_X)

# Persist the labelled table (written to the current working directory)
df_yield_final.to_csv('df_yield_final.csv')

  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)


In [93]:
# Interactive map of the 4-cluster labels for ARGENTINE CANOLA, joined to the
# RM geometries on 'Matched_RM'.
yield_flat = df_yield_final.reset_index()
canola_clusters = yield_flat[yield_flat['Crop'] == 'ARGENTINE CANOLA']

pd.merge(gdf_rm, canola_clusters, on='Matched_RM').explore(column='Cluster_4')

  uniques = Index(uniques)


In [68]:
# Worked example of the z-score that StandardScaler computes:
# z = (value - mean) / std, i.e. how many standard deviations a data point
# lies from the mean.
mean, std = 33, 8
value1, value2 = 35, 30

# z-score of value2 (value1 is defined but not used in this expression)
(value2 - mean) / std

In [95]:
# Average the yield statistics over all RMs that share a (Cluster_4, Crop) pair.
# NOTE(review): this rebinds df_yield_final from the per-RM table to the
# per-cluster summary, so this cell (and any cell that needs 'Matched_RM')
# cannot be re-run without first re-running the cells that build
# df_yield_final. Consider a separate name if nothing downstream depends on
# the rebinding.
df_yield_final = (
    df_yield_final
    .droplevel('Matched_RM')
    .reset_index()
    .groupby(['Cluster_4', 'Crop'])
    .mean()
)
df_yield_final

Unnamed: 0_level_0,Unnamed: 1_level_0,Weighted_Avg_Yield_mean,Weighted_Avg_Yield_std
Cluster_4,Crop,Unnamed: 2_level_1,Unnamed: 3_level_1
0,ARGENTINE CANOLA,39.158259,7.95253
0,BARLEY,45.227375,23.857143
0,OATS,92.959088,21.136298
0,RED SPRING WHEAT,39.661056,13.451865
1,ARGENTINE CANOLA,31.488107,9.109478
1,BARLEY,69.720061,16.074019
1,OATS,67.77663,24.035776
1,RED SPRING WHEAT,49.445923,10.684993
2,ARGENTINE CANOLA,14.5,1.555635
2,BARLEY,53.701668,13.54273
