# Data Science in Agriculture – Practical Notebook
**Student:** Rajat Mishra  
**Course:** M.Sc. Data Science with Big Data Analytics  

Yeh Colab notebook aapke **saare 6 practicals** ke liye sample code contain karta hai.
Aap exam se pehle isko run karke samajh sakte ho, modify bhi kar sakte ho.


In [None]:
# Common imports for multiple practicals
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Notebook-wide plotting defaults shared by every practical below.
# Default (width, height) for figures created without an explicit figsize.
plt.rcParams['figure.figsize'] = (8, 4)
# Apply seaborn's 'whitegrid' style to subsequent matplotlib figures.
# NOTE(review): depending on the seaborn version this call may also reset
# other rcParams; confirm the figsize above survives if plots look off-size.
sns.set(style='whitegrid')


## Practical 1 – Web Scraping and Exploratory Data Analysis (EDA)

**Aim:**
- Kisi website se data scrape karna (HTML table).
- Data ko pandas DataFrame mein convert karna.
- Basic EDA karna (head(), info(), describe(), plots).

⚠️ *Note:* Colab mein run karne ke liye internet required hoga. Exam lab mein agar internet na ho,
to aap explanation + offline CSV loading dikhakar EDA part run kar sakte ho.


In [None]:
import requests
from bs4 import BeautifulSoup

# Example URL: Wikipedia table (you can change as per requirement)
url = 'https://en.wikipedia.org/wiki/List_of_countries_by_population_(United_Nations)'

# Send a descriptive User-Agent: many sites throttle or reject the default
# python-requests one. The timeout stops the cell from hanging forever when
# the lab network is slow or offline.
request_headers = {
    'User-Agent': 'Mozilla/5.0 (compatible; AgriDataScienceNotebook/1.0; educational use)'
}
response = requests.get(url, headers=request_headers, timeout=30)
print('Status code:', response.status_code)
# Fail here with a clear HTTP error instead of parsing an error page below.
response.raise_for_status()

soup = BeautifulSoup(response.content, 'html.parser')

# Find the first table on the page that carries the 'wikitable' CSS class
table = soup.find('table', {'class': 'wikitable'})
if table is None:
    raise ValueError("No table with class 'wikitable' found at " + url)

rows = table.find_all('tr')
if not rows:
    raise ValueError('The selected table contains no rows: ' + url)

# Header names come from the first row (trim to first 5 columns for clarity)
columns = [col.get_text(strip=True) for col in rows[0].find_all(['th', 'td'])][:5]

data = []
for row in rows[1:]:  # skip header
    cols = [col.get_text(strip=True) for col in row.find_all(['th', 'td'])]
    if cols:  # ignore rows that contain no cells at all
        data.append(cols)

# Convert to DataFrame. Every record is padded/truncated to the header width,
# because pandas raises when a row is wider than the supplied column list.
n_cols = len(columns)
records = [(r + [None] * n_cols)[:n_cols] for r in data]
df_scraped = pd.DataFrame(records, columns=columns)

print('Scraped DataFrame shape:', df_scraped.shape)
df_scraped.head()

In [None]:
# Basic EDA on scraped data
print('\n--- HEAD ---')
print(df_scraped.head())

print('\n--- INFO ---')
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line.
df_scraped.info()

print('\n--- DESCRIBE (numeric only after conversion) ---')
df_numeric = df_scraped.copy()
for col in df_numeric.columns[2:]:  # try convert some columns to numeric
    # Remove thousands separators; whatever is still non-numeric becomes NaN.
    df_numeric[col] = pd.to_numeric(
        df_numeric[col].str.replace(',', '', regex=False), errors='coerce'
    )

print(df_numeric.describe())

# Example bar plot (Top 10 by the first column that actually holds numbers).
# A blanket dropna() would discard every row as soon as one converted column
# is entirely NaN (e.g. a text or percentage column), leaving an empty plot,
# so only the plotted column is required to be non-missing.
numeric_cols = [c for c in df_numeric.columns[2:] if df_numeric[c].notna().any()]
if numeric_cols:
    label_col = df_numeric.columns[0]
    value_col = numeric_cols[0]
    # nlargest makes the selection a real "Top 10" regardless of table order.
    df_plot = df_numeric.dropna(subset=[value_col]).nlargest(10, value_col)
    plt.figure()
    plt.bar(df_plot[label_col], df_plot[value_col])
    plt.xticks(rotation=45, ha='right')
    plt.title('Top 10 – Example EDA from Scraped Data')
    plt.ylabel('Value')
    plt.tight_layout()
    plt.show()
else:
    print('No numeric column found to plot.')

## Practical 2 – Climate Prediction Using Time Series (ARIMA)

**Aim:** Daily temperature jaisa time series data generate karke ARIMA model se
future values predict karna, aur actual vs predicted ko compare karna.


In [None]:
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from sklearn.metrics import mean_squared_error

# 1. Create synthetic daily temperature data (2019–2023)
# Fixed seed (same convention as the other practicals) so the series, the
# fitted ARIMA model and the reported RMSE are identical on every run.
np.random.seed(42)

date_index = pd.date_range(start='2019-01-01', end='2023-12-31', freq='D')
n_points = len(date_index)

# Yearly sine wave (+/-10), slow upward drift and unit-variance Gaussian noise
season_pattern = 10 * np.sin(2 * np.pi * date_index.dayofyear / 365)
linear_trend = 0.005 * np.arange(n_points)
random_component = np.random.normal(loc=0, scale=1, size=n_points)

# Baseline of 20 plus the three components above
temperature_values = 20 + season_pattern + linear_trend + random_component

weather_df = pd.DataFrame({'Date': date_index, 'Temperature': temperature_values}).set_index('Date')
weather_df.head()

In [None]:
# Line chart of the full daily temperature series
fig, ax = plt.subplots()
ax.plot(weather_df.index, weather_df['Temperature'])
ax.set_title('Daily Temperature Series (2019–2023)')
ax.set_xlabel('Date')
ax.set_ylabel('Temperature (°C)')
fig.tight_layout()
plt.show()

In [None]:
# Train-Test split (Train: 2019–2022, Test: 2023)
# Label slicing with .loc on a DatetimeIndex is inclusive at BOTH ends, so
# reusing the same boundary for both slices would put 2022-12-31 into train
# and test alike (a 366-row test set, shifted one day against the forecast).
# The test window therefore starts on the day after the cutoff.
train_cutoff = '2022-12-31'
test_start = pd.Timestamp(train_cutoff) + pd.Timedelta(days=1)

train_data = weather_df.loc[:train_cutoff]
test_data = weather_df.loc[test_start:]

print('Train size:', len(train_data))
print('Test size:', len(test_data))

In [None]:
# First-order differencing of the training temperatures (removes the trend
# so that the correlograms below are readable)
train_temperature = train_data['Temperature']
diff_series = train_temperature.diff().dropna()

fig, ax = plt.subplots()
ax.plot(diff_series)
ax.set_title('Differenced Temperature Series (Train)')
ax.set_xlabel('Date')
ax.set_ylabel('Differenced Temp')
fig.tight_layout()
plt.show()

# ACF first, then PACF (for p,q selection); both use 40 lags and the PACF
# additionally selects the 'ywm' estimation method
correlogram_specs = (
    (plot_acf, {}),
    (plot_pacf, {'method': 'ywm'}),
)
for draw_correlogram, extra_options in correlogram_specs:
    draw_correlogram(diff_series, lags=40, **extra_options)
    plt.tight_layout()
    plt.show()

In [None]:
# Fit an ARIMA model on the training temperatures; (2, 1, 1) is an example
# (p, d, q) order
arima_order = (2, 1, 1)
p, d, q = arima_order

model = ARIMA(train_data['Temperature'], order=arima_order)
model_fit = model.fit()

fit_report = model_fit.summary()
print(fit_report)

In [None]:
# Forecast one value per row of the test set
steps = len(test_data)
forecast_obj = model_fit.get_forecast(steps=steps)
forecast = forecast_obj.predicted_mean
forecast_bounds = forecast_obj.conf_int(alpha=0.05)  # alpha=0.05 -> 95% interval

# Root-mean-squared error between actual and forecast temperatures
actual_temperature = test_data['Temperature']
rmse = np.sqrt(mean_squared_error(actual_temperature, forecast))
print('RMSE:', rmse)

# Training history, actual test values and forecast on one set of axes
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(train_data.index, train_data['Temperature'], label='Training Data')
ax.plot(test_data.index, actual_temperature, label='Actual Test Data')
ax.plot(test_data.index, forecast, label='Forecast', linestyle='--')

# Shaded band between the lower and upper confidence bounds
lower_bound = forecast_bounds['lower Temperature']
upper_bound = forecast_bounds['upper Temperature']
ax.fill_between(
    test_data.index,
    lower_bound,
    upper_bound,
    alpha=0.3,
    label='95% Confidence Interval',
)

ax.set_title('ARIMA Temperature Forecast vs Actual')
ax.set_xlabel('Date')
ax.set_ylabel('Temperature (°C)')
ax.legend()
fig.tight_layout()
plt.show()

## Practical 3 – Crop Yield Prediction Using Machine Learning

**Aim:** Synthetic crop dataset bana kar regression model (e.g., RandomForestRegressor)
use karke yield predict karna.


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

# Build a reproducible synthetic crop dataset (seeded; 300 samples)
np.random.seed(42)
n_samples = 300

# Feature draws: the order of these calls fixes the generated values
soil_types = np.random.choice(['Loam', 'Sandy', 'Clay'], size=n_samples)
rainfall = np.random.randint(300, 1200, size=n_samples)
fertilizer = np.random.randint(50, 250, size=n_samples)
area = np.random.uniform(0.5, 5.0, size=n_samples)

# Rule-based yield (synthetic only): linear in the numeric features ...
base_yield = 2 + 0.003 * rainfall + 0.01 * fertilizer + 0.5 * area
# ... plus a per-soil offset: Loam +1.0, Clay +0.4, anything else -0.5 ...
soil_effect = np.select(
    [soil_types == 'Loam', soil_types == 'Clay'],
    [1.0, 0.4],
    default=-0.5,
)
# ... plus Gaussian noise with standard deviation 0.7
noise = np.random.normal(0, 0.7, size=n_samples)

yield_tph = base_yield + soil_effect + noise

crop_columns = {
    'Soil_Type': soil_types,
    'Rainfall_mm': rainfall,
    'Fertilizer_kg': fertilizer,
    'Area_hectare': area,
    'Yield_tph': yield_tph,
}
df_crop = pd.DataFrame(crop_columns)

df_crop.head()

In [None]:
# Separate the target column from the features and hold out 20% for testing
y = df_crop['Yield_tph']
X = df_crop.drop(columns=['Yield_tph'])

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Preprocessing: one-hot encode the soil type, pass numeric columns through
categorical_features = ['Soil_Type']
numeric_features = ['Rainfall_mm', 'Fertilizer_kg', 'Area_hectare']

categorical_transformer = OneHotEncoder(handle_unknown='ignore')
column_steps = [
    ('cat', categorical_transformer, categorical_features),
    ('num', 'passthrough', numeric_features),
]
preprocessor = ColumnTransformer(transformers=column_steps)

# 200-tree random forest chained behind the preprocessing step
model = RandomForestRegressor(n_estimators=200, random_state=42)
pipeline_steps = [
    ('preprocessor', preprocessor),
    ('model', model),
]
clf = Pipeline(steps=pipeline_steps)

clf.fit(X_train, y_train)

# Score the held-out rows
y_pred = clf.predict(X_test)
test_r2 = r2_score(y_test, y_pred)
print('R2-score:', test_r2)

In [None]:
# Scatter of predicted against actual yield for the test rows
fig, ax = plt.subplots()
ax.scatter(y_test, y_pred)
ax.set_xlabel('Actual Yield')
ax.set_ylabel('Predicted Yield')
ax.set_title('Crop Yield – Actual vs Predicted')
fig.tight_layout()
plt.show()

## Practical 4 – Plant Disease Detection Using CNN (Image Classification)

**Aim:**
- CNN architecture define karna (Keras/TensorFlow use karke).
- Train/validation generators set karna (folder based images).
- Model compile & train.

⚠️ *Note:* Yahan hum example code de rahe hain. Real run ke liye
aapko `train/` aur `val/` folders me (har class ka alag sub-folder bana kar) images arrange karni hongi.


In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout

IMG_SIZE = (128, 128)
BATCH_SIZE = 32

# Data generators (Assuming folder structure: train/healthy, train/diseased, etc.)
# Training images are rescaled by 1/255 and randomly augmented; validation
# images are only rescaled.
augmentation_options = dict(
    rotation_range=20,
    width_shift_range=0.1,
    height_shift_range=0.1,
    zoom_range=0.1,
    horizontal_flip=True,
)
train_datagen = ImageDataGenerator(rescale=1./255, **augmentation_options)
val_datagen = ImageDataGenerator(rescale=1./255)

# Update these paths as per your dataset in Colab or local
train_dir = '/content/plant_data/train'
val_dir = '/content/plant_data/val'

# Both directory iterators share image size, batch size and label encoding
flow_options = dict(
    target_size=IMG_SIZE,
    batch_size=BATCH_SIZE,
    class_mode='categorical',
)
train_gen = train_datagen.flow_from_directory(train_dir, **flow_options)
val_gen = val_datagen.flow_from_directory(val_dir, **flow_options)

In [None]:
# Simple CNN: three Conv2D + MaxPooling2D stages followed by a dense head
num_classes = len(train_gen.class_indices)

conv_stack = []
for n_filters in (32, 64, 128):
    # Only the very first layer declares the input shape (height, width, 3)
    first_layer_options = {} if conv_stack else {'input_shape': IMG_SIZE + (3,)}
    conv_stack.append(
        Conv2D(n_filters, (3, 3), activation='relu', **first_layer_options)
    )
    conv_stack.append(MaxPooling2D(2, 2))

classifier_head = [
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(num_classes, activation='softmax'),  # one output unit per class
]

model_cnn = Sequential(conv_stack + classifier_head)

model_cnn.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

model_cnn.summary()

In [None]:
# Fit the CNN on the training generator and evaluate on the validation
# generator after every epoch (adjust epochs as needed)
fit_options = {'epochs': 10, 'validation_data': val_gen}
history = model_cnn.fit(train_gen, **fit_options)

## Practical 5 – Agricultural Product Price Prediction

**Aim:** Synthetic mandi price time series bana kar regression ya simple ML model
use karke future price ya next-day price predict karna.


In [None]:
# Synthetic daily price series (e.g., for wheat)
# Fixed seed (same convention as the other practicals) so the generated
# prices, and therefore the reported RMSE, are identical on every run.
np.random.seed(42)

date_price = pd.date_range(start='2022-01-01', end='2023-12-31', freq='D')
n_points = len(date_price)

# Slow upward drift, yearly sine wave (+/-20) and Gaussian noise (std 5)
trend = 0.05 * np.arange(n_points)
season = 20 * np.sin(2 * np.pi * date_price.dayofyear / 365)
noise = np.random.normal(0, 5, size=n_points)

base_price = 1500  # base MSP like value
prices = base_price + trend + season + noise

df_price = pd.DataFrame({'Date': date_price, 'Price': prices}).set_index('Date')
df_price.head()

In [None]:
# Imported here so this cell also runs when the earlier practicals' cells
# (which first imported the two helpers below) have not been executed.
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# Create supervised dataset: use previous 7 days to predict next day price
window = 7
price_values = df_price['Price'].values
n_windows = len(price_values) - window

# Row i holds days i .. i+window-1; its target is the price on day i+window
X_list = [price_values[i:i + window] for i in range(n_windows)]
y_list = [price_values[i + window] for i in range(n_windows)]

X_price = np.array(X_list)
y_price = np.array(y_list)

# Chronological hold-out: the last 20% of the windows form the test set.
# Neighbouring windows share 6 of their 7 days, so a shuffled split would
# place near-copies of every test row into training and make the RMSE look
# far better than it is for genuinely future prices.
X_train_p, X_test_p, y_train_p, y_test_p = train_test_split(
    X_price, y_price, test_size=0.2, shuffle=False
)

price_model = GradientBoostingRegressor(random_state=42)
price_model.fit(X_train_p, y_train_p)

y_pred_p = price_model.predict(X_test_p)
print('RMSE (Price Prediction):', np.sqrt(mean_squared_error(y_test_p, y_pred_p)))

In [None]:
# Compare the first 50 test targets with their predictions
n_shown = 50
fig, ax = plt.subplots()
ax.plot(y_test_p[:n_shown], label='Actual')
ax.plot(y_pred_p[:n_shown], label='Predicted', linestyle='--')
ax.set_title('Agricultural Price Prediction – Sample')
ax.set_xlabel('Sample Index')
ax.set_ylabel('Price')
ax.legend()
fig.tight_layout()
plt.show()

## Practical 6 – Optimizing Irrigation Using Data Analytics

**Aim:** Synthetic field-level data (soil moisture, recent rainfall, crop stage) ka use karke
clustering ya rule-based approach se irrigation suggestion dena.


In [None]:
from sklearn.cluster import KMeans

# Reproducible synthetic field records (seeded; 200 fields)
np.random.seed(10)
n_fields = 200

# The order of the three draws fixes the generated values
soil_moisture = np.random.uniform(low=10, high=60, size=n_fields)    # percentage
recent_rainfall = np.random.uniform(low=0, high=50, size=n_fields)   # mm last 3 days
crop_stage = np.random.choice([1, 2, 3], size=n_fields)              # 1=early, 2=mid, 3=late

field_columns = {
    'Soil_Moisture': soil_moisture,
    'Recent_Rainfall': recent_rainfall,
    'Crop_Stage': crop_stage,
}
df_irrig = pd.DataFrame(field_columns)

df_irrig.head()

In [None]:
# KMeans clustering to group fields
# Select the feature columns by name: df_irrig later gains 'Cluster' and
# 'Irrigation_Advice' columns, and taking df_irrig.values on a re-run of this
# cell would feed those (including text) into the clustering.
feature_cols = ['Soil_Moisture', 'Recent_Rainfall', 'Crop_Stage']
X_irrig = df_irrig[feature_cols].values

# n_init is pinned explicitly because scikit-learn changed its default value
# between releases; fixing it keeps the clusters the same across versions.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=10)
clusters = kmeans.fit_predict(X_irrig)

df_irrig['Cluster'] = clusters
df_irrig.head()

In [None]:
# Tabulate the fitted cluster centroids, one row per cluster, for interpretation
centers = kmeans.cluster_centers_
centroid_columns = ['Soil_Moisture', 'Recent_Rainfall', 'Crop_Stage']
centers_df = pd.DataFrame(centers, columns=centroid_columns)
print('Cluster centers:', centers_df, sep='\n')

def irrigation_recommendation(row, *, dry_moisture=25, dry_rainfall=10,
                              moist_moisture=40):
    """Return a rule-based irrigation advice string for one field record.

    Args:
        row: Record indexable by 'Soil_Moisture' and 'Recent_Rainfall'
            (for example a DataFrame row or a plain dict).
        dry_moisture: Soil moisture strictly below this value counts as dry.
        dry_rainfall: Recent rainfall strictly below this value counts as
            low.
        moist_moisture: Soil moisture at or above this value needs little or
            no irrigation.

    Returns:
        'HIGH irrigation required' when the soil is dry and rainfall is low,
        'MEDIUM irrigation required' when soil moisture is below
        ``moist_moisture`` otherwise, else 'LOW or NO irrigation'.

    The thresholds are keyword-only with the previously hard-coded values as
    defaults, so ``df.apply(irrigation_recommendation, axis=1)`` is unchanged.
    """
    moisture = row['Soil_Moisture']
    # Rainfall is only looked up when the soil is already dry.
    if moisture < dry_moisture and row['Recent_Rainfall'] < dry_rainfall:
        return 'HIGH irrigation required'
    if moisture < moist_moisture:
        return 'MEDIUM irrigation required'
    return 'LOW or NO irrigation'

# Evaluate the rule once per field (row-wise) and store the advice text
field_advice = df_irrig.apply(irrigation_recommendation, axis=1)
df_irrig['Irrigation_Advice'] = field_advice
df_irrig.head(10)

In [None]:
# Soil moisture vs recent rainfall, one colour per cluster label
fig, ax = plt.subplots()
scatter = ax.scatter(
    df_irrig['Soil_Moisture'],
    df_irrig['Recent_Rainfall'],
    c=df_irrig['Cluster'],
)
ax.set_xlabel('Soil Moisture (%)')
ax.set_ylabel('Recent Rainfall (mm)')
ax.set_title('Field Clusters for Irrigation Planning')

# Legend entries are derived from the colours used in the scatter
legend_handles, legend_labels = scatter.legend_elements()
ax.legend(legend_handles, legend_labels, title='Cluster')
fig.tight_layout()
plt.show()