# All Imports

In [None]:
from shapely.geometry import Point
from sklearn.model_selection import train_test_split
import ast
import ast  # To convert string representation of list to actual list
import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import re
import seaborn as sns

# Remove Duplicates

In [3]:
# Path of the CSV that is de-duplicated in place (edit to point at your own file).
file_path = "Karnataka_Datasets/Across/Begalavi/Begalavi.csv"

# Read the file and keep only the first occurrence of every fully identical row.
df = pd.read_csv(file_path).drop_duplicates()

# Overwrite the source file with the de-duplicated rows; the index is not written.
df.to_csv(file_path, index=False)

print("Duplicate rows removed and file updated successfully.")

Duplicate rows removed and file updated successfully.


# Mapping Points to Districts

In [6]:
# Step 1: Load the district boundary shapefile into a GeoDataFrame.
# `districts` (and its `.crs`) is reused by the spatial-join cell below.
districts = gpd.read_file("Karnataka_Datasets/Indian_districts_boundary.shp")

In [8]:
# Step 2: Load the CSV file (latitude and longitude points)
points_df = pd.read_csv("Karnataka_Datasets/Crops_Karnataka.csv")
districts_output_path = "Karnataka_Datasets/Crops_Karnataka_With_Districts.csv"

# Ensure the CSV contains 'latitude' and 'longitude' columns
if not {'latitude', 'longitude'}.issubset(points_df.columns):
    raise ValueError("CSV must contain 'latitude' and 'longitude' columns.")

# Step 3: Convert points to a GeoDataFrame
# NOTE(review): assumes the CSV coordinates are WGS84 decimal degrees
# (EPSG:4326) — confirm against the data source.
# The points are tagged with their own CRS and then reprojected to the
# districts' CRS. Simply labelling them with `districts.crs` would misplace
# every point whenever the shapefile is not stored in EPSG:4326.
geometry = gpd.points_from_xy(points_df['longitude'], points_df['latitude'])
if districts.crs is not None:
    points_gdf = gpd.GeoDataFrame(points_df, geometry=geometry, crs="EPSG:4326")
    points_gdf = points_gdf.to_crs(districts.crs)
else:
    # The shapefile carries no CRS metadata, so no reprojection is possible;
    # fall back to comparing raw coordinates, as the original cell did.
    points_gdf = gpd.GeoDataFrame(points_df, geometry=geometry, crs=None)

# Step 4: Perform spatial join to find districts for each point.
# how="left" keeps points that fall in no district (district columns are NaN).
result = gpd.sjoin(points_gdf, districts, how="left", predicate="intersects")

# Step 5: Save the result to a new CSV
result.to_csv(districts_output_path, index=False)

# Report the path that was actually written (the old message named a different file).
print(f"Spatial join completed. Output saved to '{districts_output_path}'.")

Spatial join completed. Output saved to 'output_with_districts.csv'.


# Removing Unnecessary Columns

In [11]:

# CSV that is trimmed and then overwritten in place (edit to point at your own file).
file_path = "Karnataka_Datasets/Crops_Karnataka_With_Districts.csv"

# Names of the columns that should not survive in the saved file.
columns_to_remove = [
    "CropSurveyDate", "Crop_Extent", "District_Name", "District_code",
    "Hobli_Name", "Hobli_code", "Image_url", "Month", "Season", "Season_code",
    "Survey_id", "Taluk_Name", "Taluk_code", "Village_Name", "Village_code",
    "Weekname", "Year_code", "Years", "index_right", "snippet", "visibility",
    "extrude", "descriptio", "tessellate", "drawOrder", "icon", "end", "begin",
    "altitudeMo", "timestamp",
]

df = pd.read_csv(file_path)

# errors="ignore" skips any listed column that is absent from this file,
# so only the columns that actually exist are dropped.
df = df.drop(columns=columns_to_remove, errors="ignore")

# Write the reduced table back over the original file, without the index.
df.to_csv(file_path, index=False)

print("Unnecessary columns removed and file updated successfully.")

Unnecessary columns removed and file updated successfully.


# Print Count Of Each Height Class

In [27]:
# Dataset whose class balance is inspected (edit to point at your own file).
file_path = "Karnataka_Datasets/Across/Train_Test_Datasets/Combined_Train_Balanced_Structure.csv"
df = pd.read_csv(file_path)

# Tally how many rows carry each distinct value of the "Height" column.
height_column = df["Height"]
name_counts = height_column.value_counts()

# Show the tally (value_counts orders it from most to least frequent).
print(name_counts)

Height
Tall      7647
Medium    6724
Short     6119
Name: count, dtype: int64


# Keep Only Bangalore Rows

In [14]:

# Source CSV and the separate destination file for the Bangalore-only subset.
file_path = "Karnataka_Datasets/Crops_Karnataka_With_Districts.csv"
output_file = "Karnataka_Datasets/Crops_Karnataka_Bangalore.csv"

df = pd.read_csv(file_path)

# Keep only the rows whose "Name" column is exactly "Bangalore";
# every other row is discarded.
is_bangalore = df["Name"].eq("Bangalore")
df = df.loc[is_bangalore]

# The subset goes to its own file, so the source CSV is left untouched.
df.to_csv(output_file, index=False)

print(f"Rows with Name = 'Bangalore' have saved to '{output_file}'.")

Rows with Name = 'Bangalore' have saved to 'Karnataka_Datasets/Crops_Karnataka_Bangalore.csv'.


# Keep A Count of Each Height Class

In [5]:
# Dataset whose "Height" distribution is inspected (edit to point at your own file).
file_path = "Karnataka_Datasets/Across/Kharif/Bellari/Bellari_SAR_NDVI_Interpolated.csv"
df = pd.read_csv(file_path)

# Lift pandas' row-display limit so the full tally is printed.
# This is a global option and stays in effect for later cells.
pd.options.display.max_rows = None

# Tally how many rows carry each distinct value of the "Height" column.
height_column = df["Height"]
name_counts = height_column.value_counts()

print(name_counts)

Height
Short     5300
Medium    1275
Tall        63
Name: count, dtype: int64


# Get A Count of Each Duration Type

In [13]:
# Dataset whose duration-class balance is inspected (edit to point at your own file).
file_path = "Karnataka_Datasets/District_Data_NDVI_5Day/Merged/Karnataka_Scaled_Normalized_Mapped.csv"
df = pd.read_csv(file_path)

# Tally how many rows carry each distinct value of the "duration_numeric" column.
duration_column = df["duration_numeric"]
name_counts = duration_column.value_counts()

# Show the tally (value_counts orders it from most to least frequent).
print(name_counts)

duration_numeric
1    9259
3    5944
2    5228
Name: count, dtype: int64


# Sample Random Points

In [39]:
# Down-sample the "Low" duration class to at most `target_low_samples` rows,
# while keeping every "Medium" and "Long" row, and save the combined table.
input_file = "Karnataka_Datasets/Bangalore/Crops_Karnataka_Bangalore_Filtered.csv"  # Use the previously generated file
output_file = "Karnataka_Datasets/Bangalore/Crops_Karnataka_Bangalore_Sampled_On_Duration.csv"  # New file to save the sampled data

df = pd.read_csv(input_file)

# Separate by Duration
df_low = df[df["Duration"] == "Low"]
df_medium = df[df["Duration"] == "Medium"]
df_long = df[df["Duration"] == "Long"]

# Row count per crop within the Low-duration class
low_crop_counts = df_low["Cropname"].value_counts()

# Upper bound on the number of Low-duration rows to keep
target_low_samples = 11000

# Step 1: Take all samples from small "Low" crops first (excluding Ragi).
# These are kept in full, so `remaining_samples` can go negative if they
# alone exceed the target; in that case nothing further is added.
small_low_crops = ["Bajra", "Potato", "Redgram"]  # Prioritized small crops
selected_low = []
remaining_samples = target_low_samples

for crop in small_low_crops:
    if crop in low_crop_counts:  # membership test against the crop names in the index
        crop_data = df_low[df_low["Cropname"] == crop]
        selected_low.append(crop_data)
        remaining_samples -= len(crop_data)

# Step 2: If space remains, draw a uniform random sample of rows from the
# other Low crops (except Ragi), capped at the rows available.
other_low_crops = df_low[~df_low["Cropname"].isin(["Ragi"] + small_low_crops)]
if remaining_samples > 0 and not other_low_crops.empty:
    sampled_other = other_low_crops.sample(n=min(remaining_samples, len(other_low_crops)), random_state=42)
    selected_low.append(sampled_other)
    remaining_samples -= len(sampled_other)

# Step 3: If space is still left, top up with Ragi.
# The sample size is capped at the number of Ragi rows: asking `.sample` for
# more rows than exist (or for any rows from an empty frame) raises ValueError.
ragi_rows = df_low[df_low["Cropname"] == "Ragi"]
if remaining_samples > 0 and not ragi_rows.empty:
    ragi_sample = ragi_rows.sample(n=min(remaining_samples, len(ragi_rows)), random_state=42)
    selected_low.append(ragi_sample)
    remaining_samples -= len(ragi_sample)

# Merge selected low-duration crops. pd.concat raises on an empty list, so
# fall back to an empty slice of df_low when there were no Low rows at all.
if selected_low:
    df_low_sampled = pd.concat(selected_low, ignore_index=True)
else:
    df_low_sampled = df_low.iloc[0:0]

# Combine all durations together
df_final = pd.concat([df_low_sampled, df_medium, df_long], ignore_index=True)

# Save the final balanced dataset
df_final.to_csv(output_file, index=False)

print(f"Balanced dataset saved to '{output_file}' with improved distribution.")

Balanced dataset saved to 'Karnataka_Datasets/Bangalore/Crops_Karnataka_Bangalore_Sampled_On_Duration.csv' with improved distribution.


# Height and Duration Mapping

In [14]:

# Annotate every row with the crop's height class and duration class
# (as labels and as numeric codes), then overwrite the CSV in place.
katihar_path = 'Katihar/Katihar_5Day.csv'
df = pd.read_csv(katihar_path)

# Crop name -> (height class, duration class).
# Each crop appears exactly once; the original literal repeated Wheat, Maize,
# Sugarcane, Gram, Potato and Coriander (with identical values), where a later
# entry silently overrides an earlier one.
crop_mapping = {
    'Bajra': ('Short', 'Low'),
    'Banana': ('Tall', 'Long'),
    'Coconut': ('Tall', 'Long'),
    'Jowar': ('Medium', 'Low'),
    'Maize': ('Medium', 'Medium'),
    'Paddy': ('Short', 'Medium'),
    'Potato': ('Short', 'Medium'),
    'Ragi': ('Short', 'Low'),
    'Redgram': ('Medium', 'Medium'),
    'Wheat': ('Short', 'Low'),
    'Eucalyptus': ('Tall', 'Long'),
    'Rose': ('Short', 'Medium'),
    'HariMirch': ('Short', 'Low'),
    'Eknayaka': ('Short', 'Medium'),
    'Chow Chow': ('Medium', 'Medium'),
    'Beans': ('Short', 'Low'),
    'Ridgegourd': ('Medium', 'Medium'),
    'Bottlegourd': ('Medium', 'Medium'),
    'Marigold': ('Short', 'Medium'),
    'Gerbera': ('Short', 'Medium'),
    'Chrysanthemum': ('Short', 'Medium'),
    'Tomato': ('Short', 'Medium'),
    'Horsegram': ('Medium', 'Medium'),
    'Avare': ('Medium', 'Medium'),
    'Coriander': ('Short', 'Low'),
    'Silver Oak': ('Tall', 'Long'),
    'Neem': ('Tall', 'Long'),
    'Guava': ('Medium', 'Long'),
    'Dahlia': ('Short', 'Medium'),
    'Trees': ('Tall', 'Long'),
    'Elephantfoot_Yam': ('Medium', 'Long'),
    'Malabar Neem': ('Tall', 'Long'),
    'Sweetcorn': ('Medium', 'Medium'),
    'Cucumber': ('Medium', 'Medium'),
    'Carrot': ('Short', 'Medium'),
    'Mangoes': ('Tall', 'Long'),
    'Makkachari': ('Short', 'Low'),
    'Flower': ('Short', 'Medium'),
    'Cabbage': ('Short', 'Medium'),
    'Asparagus': ('Short', 'Medium'),
    'Teak': ('Tall', 'Long'),
    'Drumstick': ('Medium', 'Long'),
    'Vegetables': ('Short', 'Medium'),
    'Beetroot': ('Short', 'Medium'),
    'Cauliflower': ('Short', 'Medium'),
    'Mint': ('Short', 'Low'),
    'Methi Leaves': ('Short', 'Low'),
    'Dill': ('Short', 'Low'),
    'Noni': ('Medium', 'Long'),
    'Ivygourd': ('Medium', 'Medium'),
    'Brinjal': ('Short', 'Medium'),
    'Green Fodder': ('Medium', 'Medium'),
    'Acacia': ('Tall', 'Long'),
    'Arecanut': ('Tall', 'Long'),
    'Papaya': ('Medium', 'Long'),
    'Chiku': ('Tall', 'Long'),
    'Fruits': ('Medium', 'Long'),
    'Spinach': ('Short', 'Low'),
    'Grapes': ('Medium', 'Long'),
    'Fennel': ('Short', 'Low'),
    'Capsicum': ('Short', 'Medium'),
    'Bittergourd': ('Medium', 'Medium'),
    'Mulberry': ('Medium', 'Long'),
    'Knolkhol': ('Short', 'Medium'),
    'Crossandra': ('Short', 'Medium'),
    'Agaves': ('Tall', 'Long'),
    'Gladiolus': ('Short', 'Medium'),
    'BlackPepper': ('Tall', 'Long'),
    'Groundnut': ('Short', 'Long'),
    'Barley': ('Short', 'Low'),
    'Tuberose': ('Short', 'Medium'),
    'Lemon': ('Medium', 'Long'),
    'Sugarcane': ('Tall', 'Long'),
    'Peas': ('Short', 'Medium'),
    'Indigo': ('Medium', 'Medium'),
    'China Aster': ('Short', 'Medium'),
    'LadyFinger': ('Short', 'Medium'),
    'Vegetable Cowpea': ('Short', 'Low'),
    'Harimirch': ('Short', 'Low'),
    'Dhavana': ('Short', 'Low'),
    'Tamarind': ('Tall', 'Long'),
    'Turmeric': ('Short', 'Long'),
    'Cowpea': ('Short', 'Low'),
    'Dolichuous_Bean': ('Medium', 'Medium'),
    'Pumpkin': ('Medium', 'Medium'),
    'Broccoli': ('Short', 'Medium'),
    'Sunflower': ('Medium', 'Medium'),
    'Sweet Corn': ('Medium', 'Medium'),
    'Rose Wood': ('Tall', 'Long'),
    'Baby Corn': ('Medium', 'Low'),
    'Ginger': ('Short', 'Long'),
    'Sweet Potato': ('Short', 'Long'),
    'Shimp Nut': ('Medium', 'Medium'),
    'Snakegourd': ('Medium', 'Medium'),
    'Pomegranate': ('Medium', 'Long'),
    'Jamun': ('Tall', 'Long'),
    'Greengram': ('Short', 'Low'),
    'Cashewnuts': ('Tall', 'Long'),
    'Saffron': ('Short', 'Low'),
    'Chive': ('Short', 'Low'),
    'Mahagani Tree': ('Tall', 'Long'),
    'Casuarina Tree': ('Tall', 'Long'),
    'Bluegrapes': ('Medium', 'Long'),
    'Jasmine Pubescens': ('Short', 'Medium'),
    'Gram': ('Short', 'Low'),
    'Grapes Seedless': ('Medium', 'Long'),
    'Ashgourd': ('Medium', 'Medium'),
    'Nigerseed': ('Short', 'Low'),
    'Jackfruit': ('Tall', 'Long'),
    'Sandalwood': ('Tall', 'Long'),
    'Linseed': ('Short', 'Low'),
    'Curry Leaves': ('Short', 'Low'),
    'Mustard': ('Short', 'Low'),
    'Lentil': ('Short', 'Medium'),
    'Rice': ('Short', 'Medium'),
    'Garlic': ('Short', 'Medium'),
    'Green pea': ('Short', 'Medium'),
    'Bersem': ('Short', 'Medium'),
}

# Numeric codes for the two class labels
height_mapping = {'Short': 1, 'Medium': 2, 'Tall': 3}
duration_mapping = {'Low': 1, 'Medium': 2, 'Long': 3}

# Split the tuple mapping into one lookup per output column. Mapping each
# column directly avoids `.apply(pd.Series)`, which is slow and fails to
# produce two columns when no crop is mapped or the frame is empty.
height_by_crop = {crop: height for crop, (height, _) in crop_mapping.items()}
duration_by_crop = {crop: duration for crop, (_, duration) in crop_mapping.items()}

# Apply the mapping to the DataFrame; crops missing from crop_mapping get NaN.
df['Height'] = df['Crop_Name'].map(height_by_crop)
df['Duration'] = df['Crop_Name'].map(duration_by_crop)
df['height_numeric'] = df['Height'].map(height_mapping)
df['duration_numeric'] = df['Duration'].map(duration_mapping)

# Surface crops that have no mapping instead of leaving silent NaNs in the file.
unmapped_crops = sorted(
    str(crop) for crop in df.loc[df['Height'].isna(), 'Crop_Name'].dropna().unique()
)
if unmapped_crops:
    print(f"Warning: no Height/Duration mapping for crops: {unmapped_crops}")

# Save the updated DataFrame back over the input file
df.to_csv(katihar_path, index=False)

print("File updated successfully.")

File updated successfully.
