In [1]:
# importing libraries
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split

In [5]:
# 1. Load the dataset
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# only run where this exact file exists.
DATASET_CSV = r"C:\Users\gpshi\OneDrive\Desktop\py_dataset33.csv"
data = pd.read_csv(DATASET_CSV)

In [7]:
# 2. Initial Data Exploration (EDA)
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
print("Dataset Info:")
data.info()

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 15 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Crop                        3000 non-null   object 
 1   Temperature (°C)            3000 non-null   float64
 2   Humidity (%)                3000 non-null   float64
 3   Rainfall (mm)               3000 non-null   float64
 4   Soil Type                   3000 non-null   object 
 5   Soil pH                     3000 non-null   float64
 6   Soil Moisture (%)           3000 non-null   float64
 7   Wind Speed (km/h)           3000 non-null   float64
 8   Solar Radiation (W/m²)      3000 non-null   float64
 9   Weather Condition           3000 non-null   object 
 10  Precipitation (mm)          3000 non-null   float64
 11  Evapotranspiration (ET)     3000 non-null   float64
 12  Growth Stage                3000 non-null   object 
 13  Kc                 

In [9]:
# Descriptive statistics as returned by DataFrame.describe(), printed under a
# heading.
print("\nStatistical Summary:", data.describe(), sep="\n")


Statistical Summary:
       Temperature (°C)  Humidity (%)  Rainfall (mm)      Soil pH  \
count       3000.000000   3000.000000    3000.000000  3000.000000   
mean          25.860367     63.588233      64.104267     6.472967   
std            4.485384     11.208561      30.422029     0.487276   
min           15.000000     40.000000      20.000000     5.500000   
25%           22.975000     55.100000      40.100000     6.100000   
50%           26.400000     63.500000      56.500000     6.400000   
75%           29.100000     71.900000      82.925000     6.800000   
max           34.900000     89.900000     149.900000     7.500000   

       Soil Moisture (%)  Wind Speed (km/h)  Solar Radiation (W/m²)  \
count        3000.000000        3000.000000             3000.000000   
mean           27.760867           8.218533              246.171167   
std             5.066757           2.959062               31.054844   
min            18.000000           2.100000              180.500000   
2

In [11]:
# Count of missing entries in each column.
missing_per_column = data.isna().sum()
print("\nMissing Values:")
print(missing_per_column)


Missing Values:
Crop                          0
Temperature (°C)              0
Humidity (%)                  0
Rainfall (mm)                 0
Soil Type                     0
Soil pH                       0
Soil Moisture (%)             0
Wind Speed (km/h)             0
Solar Radiation (W/m²)        0
Weather Condition             0
Precipitation (mm)            0
Evapotranspiration (ET)       0
Growth Stage                  0
Kc                            0
Water Requirement (mm/day)    0
dtype: int64


In [13]:
# (rows, columns) of the loaded frame.
print(f"Shape of the dataset: {data.shape}")

Shape of the dataset: (3000, 15)


In [15]:
# 3. Data Cleaning & Handling Missing Values
# The listed numeric columns are filled with their column mean and the listed
# categorical columns with their most frequent value; other columns are left
# untouched.
mean_filled_cols = ['Temperature (°C)', 'Humidity (%)', 'Rainfall (mm)']
mode_filled_cols = ['Soil Type', 'Weather Condition']

fill_values = {name: data[name].mean() for name in mean_filled_cols}
fill_values.update({name: data[name].mode()[0] for name in mode_filled_cols})

data = data.fillna(fill_values)

In [17]:
# 4. Handle Inconsistencies in Categorical Columns (if any)
# Trim surrounding whitespace and lower-case the text of these two columns so
# that differently spelled variants of the same value become identical.
for text_col in ('Soil Type', 'Weather Condition'):
    data[text_col] = data[text_col].str.strip().str.lower()

In [19]:
# 5. Encode Categorical Variables
# Every object-dtype column is replaced by integer codes from its own
# LabelEncoder; each fitted encoder is stored in label_encoders under the
# column name.
label_encoders = {}
categorical_cols = data.select_dtypes(include=['object']).columns
for column_name in categorical_cols:
    encoder = LabelEncoder()
    label_encoders[column_name] = encoder
    data[column_name] = encoder.fit_transform(data[column_name])

In [21]:
# 6. Scale Numerical Features
# Fit the scaler on the training rows only, so the mean and variance of the
# held-out rows do not leak into preprocessing. The split arguments here must
# stay identical to the ones used in the train-test split cell (test_size=0.2,
# random_state=42): with the same arguments and the same number of rows,
# train_test_split selects the same rows for training.
numerical_cols = ['Temperature (°C)', 'Humidity (%)', 'Rainfall (mm)']
scaler_fit_rows = train_test_split(
    data[numerical_cols], test_size=0.2, random_state=42
)[0]
scaler = StandardScaler().fit(scaler_fit_rows)
# All rows are transformed with the training-set statistics.
data[numerical_cols] = scaler.transform(data[numerical_cols])

In [23]:
# 7. Prepare Features and Target
# y is the water-requirement column; X is every other column of the frame.
target_col = 'Water Requirement (mm/day)'
y = data[target_col]
X = data.drop(columns=[target_col])

In [25]:
# 8. Train-Test Split
# Hold out 20% of the rows for testing; the fixed random_state makes the
# partition repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [29]:
# To verify: show the (rows, columns) shape of each feature partition.
print(f"\nTraining set size: {X_train.shape}")
print(f"Testing set size: {X_test.shape}")


Training set size: (2400, 14)
Testing set size: (600, 14)
