In [6]:
# Install the packages this notebook depends on into the kernel's environment.
# NOTE: `%pip` is an IPython/Jupyter line magic, so this statement only works
# when the cell runs inside a notebook kernel (it is not valid plain Python).
# The install command runs every time the cell is executed; a kernel restart
# may be needed before freshly installed/updated packages become importable.
%pip install pandas seaborn matplotlib numpy ipywidgets

import os
import io
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from IPython.display import display
import importlib

# Make the name `widgets` available for the upload fallback used further down.
# Three attempts, in order:
#   1. import ipywidgets directly;
#   2. install it with the `%pip` magic and import again;
#   3. build a tiny stand-in object exposing only `FileUpload`.
# Doing this once here avoids repeating the import later in the cell.
try:
    import ipywidgets as widgets
except Exception:
    try:
        print("ipywidgets import failed; attempting to install ipywidgets...")
        # `%pip` is an IPython line magic: this branch only works inside a
        # notebook kernel. (The install line at the top of the cell already
        # lists ipywidgets, so this is a second attempt.)
        %pip install ipywidgets
        # Invalidate caches to help the runtime pick up the newly installed package
        importlib.invalidate_caches()
        import ipywidgets as widgets
    except Exception:
        print("ipywidgets could not be installed or imported. Using a minimal fallback for widgets.FileUpload.")
        # Stand-in for ipywidgets.FileUpload: accepts and ignores any
        # constructor arguments and exposes an always-empty `value`.
        class _DummyUpload:
            def __init__(self, *args, **kwargs):
                # Mimic the one attribute the loading code reads; an empty
                # dict is falsy, so "has anything been uploaded?" checks fail.
                self.value = {}
        # Build an anonymous class with a `FileUpload` attribute and instantiate
        # it, so that `widgets.FileUpload(...)` resolves just like the real module.
        widgets = type("widgets", (), {"FileUpload": _DummyUpload})()

# 1. Load the Dataset
# Resolution order:
#   a) CSV named `file_name` in the working directory,
#   b) a native file-open dialog (tkinter),
#   c) an ipywidgets upload widget (upload, then re-run the cell),
#   d) a small built-in example dataset so the rest of the notebook can run.
# Make sure the csv name matches your file
file_name = 'yield_df.csv'
df = None

if os.path.exists(file_name):
    df = pd.read_csv(file_name)
else:
    print(f"File '{file_name}' not found in the notebook working directory: {os.getcwd()}")
    # Try a local file dialog (works on local machines with GUI)
    try:
        import tkinter as tk
        from tkinter import filedialog
        root = tk.Tk()
        try:
            # Hide the empty root window so only the dialog itself is shown.
            root.withdraw()
            path = filedialog.askopenfilename(
                title="Select CSV file",
                filetypes=[("CSV files", "*.csv"), ("All files", "*.*")],
            )
        finally:
            # Always tear the Tk root down; otherwise every run of this cell
            # leaves another hidden Tk interpreter alive in the kernel.
            root.destroy()
        if path:
            df = pd.read_csv(path)
            print(f"Loaded file from: {path}")
        else:
            print("No file selected from file dialog.")
    except Exception as dialog_error:
        # Report why the dialog path failed instead of swallowing it silently.
        print(f"File dialog unavailable ({dialog_error!r}); trying the upload widget instead.")
        # Fallback to ipywidgets FileUpload for notebook environments
        try:
            # Reuse the widget left behind by a previous run of this cell.
            # Creating a fresh widget on every run would always start with an
            # empty `value`, so "upload, then re-run" could never load anything.
            upload = globals().get('upload')
            if not isinstance(upload, widgets.FileUpload):
                upload = widgets.FileUpload(accept='.csv', multiple=False)

            if upload.value:
                if isinstance(upload.value, dict):
                    # ipywidgets 7.x: {filename: {'metadata': ..., 'content': bytes}}
                    uploaded_filename, uploaded_file = next(iter(upload.value.items()))
                else:
                    # ipywidgets 8.x: tuple of dicts with 'name' and 'content' keys
                    uploaded_file = upload.value[0]
                    uploaded_filename = uploaded_file['name']
                # BytesIO accepts both bytes (7.x) and memoryview (8.x) content.
                df = pd.read_csv(io.BytesIO(uploaded_file['content']))
                print(f"Loaded uploaded file: {uploaded_filename}")
            else:
                print("Please upload the CSV file using the widget below, then re-run this cell.")
                display(upload)
        except Exception as upload_error:
            print("Could not open a file dialog or upload widget. Proceeding with a small example dataset so the rest of the notebook can run.")
            print(f"Reason: {upload_error!r}")
            # Create a small example DataFrame that matches expected columns so the notebook can continue
            df = pd.DataFrame({
                'Area': ['CountryA', 'CountryB', 'CountryC', 'CountryA', 'CountryB'],
                'Item': ['Wheat', 'Corn', 'Rice', 'Wheat', 'Corn'],
                'average_rain_fall_mm_per_year': [500, 700, 1200, 550, 650],
                'pesticides_tonnes': [1.2, 0.8, 2.0, 1.1, 0.9],
                'avg_temp': [22.5, 24.0, 26.1, 21.9, 23.5],
                'hg/ha_yield': [350, 420, 300, 360, 410]
            })
            print("A small example DataFrame was created and assigned to 'df'. Replace it with your real CSV when available.")

# Ensure df is never None so downstream cells do not error out
# (reached when the dialog was cancelled or the widget is still waiting for a file).
if df is None:
    df = pd.DataFrame()
    print(f"Warning: '{file_name}' was not loaded and no fallback data was created. 'df' is an empty DataFrame.")

# 2. Initial Inspection
# Show a preview of the data followed by the column dtypes / non-null counts.
print("--- First 5 Rows ---")
print(df.head())
print("\n--- Data Info ---")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- that would emit a stray "None" line.
df.info()  # Checks for data types and nulls

# 3. Data Cleaning
# The raw CSV ships with a leftover index column ('Unnamed: 0') that carries
# no information; remove it when present and leave the frame alone otherwise.
df = df.drop(columns='Unnamed: 0', errors='ignore')

# Report how many values are missing in each column
print("\n--- Missing Values ---")
missing_per_column = df.isna().sum()
print(missing_per_column)

# Keep only the first occurrence of any fully duplicated row
df = df.drop_duplicates()

# 4. Give the columns shorter, code-friendly names (optional but convenient).
# Columns that are not present in the frame are simply ignored by rename().
df = df.rename(
    columns={
        'Area': 'Country',
        'Item': 'Crop',
        'average_rain_fall_mm_per_year': 'Rainfall_mm',
        'pesticides_tonnes': 'Pesticides_tonnes',
        'avg_temp': 'Avg_Temp_C',
        'hg/ha_yield': 'Yield_hg_ha',
    }
)

print("\n--- Cleaned Column Names ---")
print(df.columns)

# 5. Exploratory Data Analysis (EDA) - Crucial for your Report!
# Plot the pairwise correlation of the numeric columns to see which
# features move together with the Yield.
numeric_df = df.select_dtypes(include=np.number)  # heatmap needs numbers only

# `.empty` is True when either axis has length zero, so it covers both
# "no rows" and "no numeric columns"; plotting such data would fail.
if not numeric_df.empty:
    try:
        corr = numeric_df.corr()
        if not corr.empty:
            plt.figure(figsize=(10, 8))
            sns.heatmap(corr, annot=True, cmap='coolwarm')
            plt.title("Correlation Heatmap: What affects Yield?")
            plt.show()
        else:
            print("Correlation matrix is empty. Skipping heatmap.")
    except ValueError as err:
        # Reduction/plotting problems surface as ValueError; report and move on
        print(f"Could not create heatmap due to: {err}. Skipping heatmap.")
else:
    print("No numeric columns available for correlation heatmap. Skipping heatmap.")

# Persist the cleaned frame (without the index) so Day 2 can pick it up.
# This runs unconditionally, including when df ended up empty.
df.to_csv(path_or_buf='cleaned_yield_data.csv', index=False)
print("Cleaned dataset saved successfully!")

Note: you may need to restart the kernel to use updated packages.
File 'yield_df.csv' not found in the notebook working directory: c:\Users\user\Desktop\JULY 2025 COHORT\PLP_AI\Assignments\Assignment-2\AI_Machine_Learning_Assignment
Please upload the CSV file using the widget below, then re-run this cell.



[notice] A new release of pip is available: 25.1.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


FileUpload(value=(), accept='.csv', description='Upload')

--- First 5 Rows ---
Empty DataFrame
Columns: []
Index: []

--- Data Info ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 0 entries
Empty DataFrame
None

--- Missing Values ---
Series([], dtype: float64)

--- Cleaned Column Names ---
Index([], dtype='object')
No numeric columns available for correlation heatmap. Skipping heatmap.
Cleaned dataset saved successfully!
