In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.mixture import GaussianMixture
import shutil
import os
from tqdm import tqdm
import datetime

In [7]:
# Run-date stamp (YYYY_MM_DD) used in the name of the results workbook.
date = "{:%Y_%m_%d}".format(datetime.datetime.today())

In [8]:
# Working folders for the batch run:
#   todo_path - CSV files waiting to be processed
#   done_path - where a CSV is moved once it has been processed
#   out_path  - where the clustered CSV outputs are written
todo_path, done_path, out_path = (
    'D:\\Crop_Classification\\to_do_path',
    'D:\\Crop_Classification\\done_path',
    'D:\\Crop_Classification\\Out_path',
)

In [9]:
# Excel workbook listing, per district, the acreage of each crop.
cluster_list_file_path = (
    'D:\\Crop_Classification\\Maize districts with all crops.xlsx'
)

In [10]:
# Load the district/crop sheet, lower-case the district names so they can be
# compared with the lower-cased file names later, and replace every missing
# cell with the sentinel -9999.
clus_list = (
    pd.read_excel(cluster_list_file_path, sheet_name='Maize districts with crops')
    .assign(District=lambda sheet: sheet['District'].map(str.lower))
    .fillna(-9999)
)

In [11]:
# Keep only the district name and the per-crop columns.
temp = clus_list.drop(columns=['State', 'above_1000', 'Grand Total'])

In [12]:
# Every column of `temp` except the district name.
cols = temp.columns.drop(['District'])

In [13]:
# Map each district to the names of its crops whose value exceeds 1000,
# ordered from the largest value to the smallest.
#
# Implementation notes:
#   * DataFrame.append was removed in pandas 2.0, so no frame is grown
#     row by row; each district row is handled as a single Series.
#   * The previous version re-filtered the whole table for every
#     district/crop pair and raised an "ambiguous truth value" error when a
#     district appeared on more than one row. Here each row is processed
#     once; if a district is repeated, the last row wins.
#   * Cells that cannot be read as numbers are treated as "not dominant".
#     The -9999 fill value used for missing cells is below the threshold
#     and is therefore excluded as well.
dis_dominant_crop = {}
for _, district_row in temp.iterrows():
    acreage = pd.to_numeric(district_row[cols], errors='coerce')
    dominant = acreage[acreage > 1000].sort_values(ascending=False, kind='stable')
    # Array of crop (column) names, largest acreage first; empty when no
    # crop passes the threshold.
    dis_dominant_crop[district_row['District']] = dominant.index.to_numpy()

In [14]:
# Empty summary table; one row per processed district is added later.
results = pd.DataFrame(
    columns=['District', '1SD', '2SD', '3SD', 'Cultivable Area'],
)

In [15]:
def _sd_retention_percentages(features):
    """Return the share of rows lying inside mean +/- k*std on every column.

    Parameters
    ----------
    features : pandas.DataFrame
        Numeric columns only, with at least one row and no missing values.

    Returns
    -------
    dict
        Keys '1SD', '2SD' and '3SD'; each value is the percentage (0-100) of
        rows whose value in *every* column lies within that many standard
        deviations of the column mean (bounds inclusive).
    """
    centre = features.mean()
    # Population standard deviation (ddof=0): this is what np.std returns,
    # whereas the pandas default would be the sample value (ddof=1).
    spread = features.std(ddof=0)
    total = len(features)
    retained = {}
    for k in (1, 2, 3):
        inside = (
            (features >= centre - k * spread) & (features <= centre + k * spread)
        ).all(axis=1)
        retained['{}SD'.format(k)] = float(inside.sum()) / total * 100
    return retained


# Process every file waiting in `todo_path`:
#   1. read the CSV and drop incomplete rows,
#   2. compute how many rows fall within 1/2/3 standard deviations,
#   3. cluster the rows with a Gaussian mixture,
#   4. write the clustered rows to `out_path` and move the input to `done_path`,
#   5. record one summary row per district.
# A failure on one file is reported and the loop moves on to the next file.
records = []
for file in tqdm(os.listdir(todo_path)):
    # District key = lower-cased file name without its extension (any other
    # dots in the name are removed as well). Computed before the `try` so the
    # "Skipped" message below always refers to the current file.
    dist = ''.join(file.lower().split('.')[0:-1])
    try:
        print(file)
        print('District: ', dist)
        if dist == 'sas nagar':
            dist = 's.a.s nagar'

        df = (
            pd.read_csv(os.path.join(todo_path, file))
            .dropna()
            .reset_index(drop=True)
            # The Y column header sometimes carries a leading space.
            .rename(columns={'Geo_X': 'X', 'Geo_Y': 'Y', ' Geo_Y': 'Y'})
        )
        if df.empty:
            raise ValueError('no complete rows found in {}'.format(file))

        # Everything except the coordinates is used as a feature.
        X = df.drop(['X', 'Y'], axis=1)
        sd_perc = _sd_retention_percentages(X)

        ###  Clustering
        above_1000 = clus_list.loc[clus_list.District == dist, 'above_1000']
        if above_1000.empty:
            raise ValueError('district "{}" not found in the cluster list'.format(dist))
        # Number of clusters is 3 times the district's `above_1000` value.
        # Cast to int because the column may have been read as float.
        num_clusters = 3 * int(above_1000.iloc[0])
        if num_clusters < 1:
            raise ValueError(
                'invalid number of clusters ({}) for district "{}"'.format(num_clusters, dist)
            )

        # Fixed seed so that re-running the notebook reproduces the clusters.
        gmix = GaussianMixture(n_components=num_clusters, random_state=0)
        gmix.fit(X)
        gmix_preds = gmix.predict(X)

        df_gmix = df.copy()
        df_gmix['Class'] = gmix_preds
        df_gmix.to_csv(os.path.join(out_path, '{}_Clustered.csv'.format(dist)), index=False)

        try:
            shutil.move(os.path.join(todo_path, file), done_path)
        except OSError as move_err:
            # shutil.Error (e.g. destination already exists) is an OSError.
            # A failed move does not invalidate the result computed above.
            print('Cannot move {} to Done folder: {}'.format(file, move_err))

        records.append({
            'District': dist,
            '1SD': sd_perc['1SD'],
            '2SD': sd_perc['2SD'],
            '3SD': sd_perc['3SD'],
            # Row count / 100 - presumably converts a pixel count to an
            # area unit; TODO confirm the pixel size this assumes.
            'Cultivable Area': len(df_gmix) / 100,
        })
    except Exception as e:
        print(e)
        print(dist, " Skipped")
        continue

# Build the summary once (DataFrame.append was removed in pandas 2.0). The
# table is rebuilt from this run only, so re-running the cell does not
# accumulate duplicate rows.
results = pd.DataFrame(records, columns=['District', '1SD', '2SD', '3SD', 'Cultivable Area'])
results.to_excel('Results_{}.xlsx'.format(date))

  0%|                                                                                            | 0/2 [00:00<?, ?it/s]

Baramulla.csv
District:  baramulla


 50%|█████████████████████████████████████████▌                                         | 1/2 [03:09<03:09, 189.30s/it]

Ramban.csv
District:  ramban


100%|███████████████████████████████████████████████████████████████████████████████████| 2/2 [06:29<00:00, 194.75s/it]
