In [None]:
import pandas as pd
import numpy as np
import pycaret.clustering as pc
#pip install -U --pre pycaret
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
plt.style.use('fivethirtyeight')
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the crop dataset from the repository's dataset folder.
data = pd.read_csv(filepath_or_buffer='../dataset/FinalData.csv')

In [None]:
# Preview the first five rows of the dataset.
data.head(n=5)

In [None]:
# Count the missing values in each column.
data.isnull().sum()

In [None]:
# Basic information about the dataset: non-null count and dtype of every column
# (shows which columns are numerical and which are categorical).
data.info()

In [None]:
#All columns are numerical except label which is categorical

In [None]:
# Remove duplicate rows, if any.
# drop_duplicates() returns a new frame rather than modifying `data`, so the
# result has to be assigned back. The index is reset so that later
# index-based joins against `data` keep lining up row for row.
data = data.drop_duplicates().reset_index(drop=True)

In [None]:
# Dimensions of the dataset as (rows, columns).
data.shape

In [None]:
# Number of samples per crop label; similar counts mean the dataset is balanced.
data.loc[:, 'label'].value_counts()

In [None]:
# Distinct classes present in the label (output) column.
pd.unique(data['label'])

In [None]:
# How many distinct classes the label (output) column contains.
data['label'].nunique(dropna=True)

In [None]:
# Summary statistics for every column; include="all" also covers the
# non-numeric label column (count / unique / top / freq).
data.describe(include="all")

In [None]:
#The table above shows summary statistics of the dataset. Data quality can be gauged by checking how close the mean and median of each column are.
#In this dataset temperature and ph seem to be the most evenly distributed; the other columns may contain outliers.

In [None]:
# Overall averages of every soil and climate measurement across all crops.

summary_templates = [
    ("Average Ratio of Nitrogen in the Soil : {0:.2f}", 'N'),
    ("Average Ratio of Phosphorous in the Soil : {0:.2f}", 'P'),
    ("Average Ratio of Potassium in the Soil : {0:.2f}", 'K'),
    ("Average Tempature in Celsius : {0:.2f}", 'temperature'),
    ("Average Relative Humidity in % : {0:.2f}", 'humidity'),
    ("Average PH Value of the soil : {0:.2f}", 'ph'),
    ("Average Rainfall in mm : {0:.2f}", 'rainfall'),
]
for template, column in summary_templates:
    print(template.format(data[column].mean()))

In [None]:
# Correlation heatmap of the numeric features.
# The categorical 'label' column is excluded explicitly: recent pandas versions
# raise on non-numeric columns in DataFrame.corr() instead of silently
# dropping them.
fig, ax = plt.subplots(1, 1, figsize=(15, 9))
numeric_corr = data.select_dtypes(include='number').corr()
# Draw on the axes created above rather than relying on the "current axes".
sns.heatmap(numeric_corr, annot=True, cmap='viridis', ax=ax)
ax.set(xlabel='features', ylabel='features')
ax.set_title('Correlation between different features', fontsize=15, color='black')
plt.show()

In [None]:
#Correlation between Potassium and Phosphorus is high.

In [None]:
# Per-crop mean of every feature, indexed by crop label.
crop_summary = pd.pivot_table(data=data, index=['label'], aggfunc='mean')
crop_summary.head(n=5)

In [None]:
### Data Visualizations
# One horizontal bar chart per feature, with one bar per crop label.
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them as
# positional arguments.

plt.rcParams['figure.figsize'] = (16,10)

# (column in `data`, x-axis caption) for each panel, in display order.
feature_panels = [
    ('N', 'Ratio of Nitrogen'),
    ('P', 'Ratio of Phosphorous'),
    ('K', 'Ratio of Potassium'),
    ('temperature', 'Temperature'),
    ('humidity', 'Humidity'),
    ('ph', 'pH of Soil'),
    ('rainfall', 'Rainfall'),
]

fig, axes = plt.subplots(2, 4)
panel_axes = axes.ravel()

for panel_ax, (feature, caption) in zip(panel_axes, feature_panels):
    sns.barplot(x=data[feature], y=data['label'], ax=panel_ax)
    panel_ax.set_ylabel(' ')
    panel_ax.set_xlabel(caption, fontsize=10)
    panel_ax.tick_params(axis='y', labelsize=10)

# The 2x4 grid has one more slot than there are features; hide the spare one.
for spare_ax in panel_axes[len(feature_panels):]:
    spare_ax.set_visible(False)

fig.suptitle('Visualizing the Impact of Different Conditions on Crops', fontsize=15)
plt.show()

Clustering Similar Crops

In [None]:
# Set up the clustering experiment.
# normalize=True rescales the features to a similar scale so that no single
# wide-ranged feature dominates the distance computation.
# The categorical target column ('label') is excluded from clustering.
# session_id fixes the random seed for reproducibility.

clust = pc.setup(data, normalize=True, ignore_features=['label'], session_id=123)

KMEANS 

In [None]:

# Fit a k-means model inside the experiment configured by pc.setup().
kmeans = pc.create_model(model="kmeans")

In [None]:
# Show the fitted estimator and its hyper-parameters.
print(kmeans)

In [None]:
# Available models in pycaret clustering
pc.models()

In [None]:
# Plot the fitted model using plot_model's default plot type
# (presumably the cluster plot — confirm against the PyCaret docs).
pc.plot_model(kmeans)

In [None]:
# Elbow plot: shows the optimal number of clusters to use.
pc.plot_model(kmeans, plot="elbow")

In [None]:
# Silhouette plot for the fitted k-means model.
pc.plot_model(kmeans, plot="silhouette")

In [None]:
# Cluster plot for the fitted k-means model.
pc.plot_model(kmeans, plot="cluster")

In [None]:
# Distribution plot for the fitted k-means model.
pc.plot_model(kmeans, plot="distribution")

In [None]:
# Attach each row's assigned cluster to the data used for training.
kmeans_result = pc.assign_model(model=kmeans)

In [None]:
# Attach the crop label (ignored during clustering) to every clustered row.
# The join is guarded so the cell can be re-run safely and is a no-op when
# the label column is already present; an unguarded second join would raise
# because of the overlapping column name.
if 'label' not in kmeans_result.columns:
    kmeans_result = kmeans_result.join(data['label'])

In [None]:
# Display the clustered rows together with their crop label.
kmeans_result

In [None]:
# Put the columns in a fixed order: features first, then label, then cluster.
kmeans_result = kmeans_result.reindex(
    ['N', 'P', 'K', "temperature", 'humidity', 'ph', 'rainfall', 'label', 'Cluster'],
    axis='columns',
)

In [None]:
# Display the frame after reordering its columns.
kmeans_result

In [None]:
# Split the clustered rows into one frame per cluster name.
cluster0, cluster1, cluster2, cluster3 = (
    kmeans_result.loc[kmeans_result['Cluster'] == cluster_name]
    for cluster_name in ('Cluster 0', 'Cluster 1', 'Cluster 2', 'Cluster 3')
)

In [None]:
# Reduce each cluster frame to the distinct crop labels it contains.
# sorted() gives a stable, alphabetical order; plain set iteration order for
# strings can differ from one kernel session to the next.
cluster0 = sorted(set(cluster0['label']))
cluster1 = sorted(set(cluster1['label']))
cluster2 = sorted(set(cluster2['label']))
cluster3 = sorted(set(cluster3['label']))

In [None]:
# Print the crops that fall into each of the four clusters.
for heading, members in (
    ("crops found in First_cluster are : ", cluster0),
    ("crops found in Second_cluster are : ", cluster1),
    ("crops found in Third_cluster are : ", cluster2),
    ("crops found in Forth_cluster are : ", cluster3),
):
    print(heading + ",".join(members) + "\n")

In [None]:
# Work on an independent copy so the cluster names can later be converted to
# plain numbers without touching kmeans_result.
kmeans_result2 = kmeans_result.copy(deep=True)

In [None]:
# Persist the clustered data. index=False keeps the row index out of the
# file; otherwise reading the CSV back yields a spurious 'Unnamed: 0' column.
kmeans_result2.to_csv('../dataset/kmeans_result2.csv', index=False)

In [None]:
# Reload the clustered data from disk.
kmeans_result2 = pd.read_csv(filepath_or_buffer='../dataset/kmeans_result2.csv')

In [None]:
# Turn the cluster names into integers by stripping the "Cluster" prefix.
cluster_numbers = kmeans_result2['Cluster'].str.replace("Cluster", '')
kmeans_result2['Cluster'] = cluster_numbers.map(int)

In [None]:
# Display the frame with numeric cluster ids.
kmeans_result2

In [None]:
# Humidity vs. temperature, coloured by numeric cluster id.
# x, y and the colours are all taken from kmeans_result2 so that every point
# and its colour are guaranteed to come from the same row.
plt.scatter(
    kmeans_result2['humidity'],
    kmeans_result2['temperature'],
    c=kmeans_result2['Cluster'],
    cmap='rainbow',
)
plt.xlabel('humidity')
plt.ylabel('temperature')
plt.title('Clusters by humidity and temperature')
plt.show()

In [None]:
# Persist the fitted k-means model under the model directory.
pc.save_model(kmeans, model_name="../model/crop-model")

In [None]:
# Load the persisted model back from disk.
saved_model = pc.load_model(model_name="../model/crop-model")

In [None]:
# A single unseen sample (one row) to get a crop recommendation for.
unknown_data = pd.DataFrame({
    'N': [70],
    'P': [59],
    'K': [43],
    'rainfall': [202],
    'humidity': [82],
    'ph': [7],
    'temperature': [20],
})

In [None]:
# Predict the cluster of the unseen sample with the reloaded model.
pred = pc.predict_model(saved_model, data=unknown_data)

In [None]:
# Display the prediction returned for the unseen sample.
pred

In [None]:
# Extract the numeric cluster id from the predicted name (e.g. 'Cluster 3' -> 3).
# The whole trailing token is parsed rather than just the last character, so
# ids of 10 or more are handled; .iloc[0] picks the first row by position
# regardless of how the index is labelled.
pred = int(str(pred['Cluster'].iloc[0]).split()[-1])

In [None]:
# Keep only the rows that belong to the predicted cluster.
in_predicted_cluster = kmeans_result2['Cluster'] == pred
kmeans_result = kmeans_result2.loc[in_predicted_cluster]

In [None]:
# Display the rows of the predicted cluster.
kmeans_result

In [None]:
# Distinct crops in the predicted cluster. sorted() keeps the recommendation
# order stable; plain set iteration order for strings can differ from one
# kernel session to the next.
crops = sorted(set(kmeans_result['label']))

In [None]:
# Report the recommended crops as a comma-separated list, followed by a blank line.
recommendation = "crops recommended for particular variables are :\n " + ",".join(crops)
print(recommendation, end="\n\n")