In [2]:
# import necessary libraries
import numpy as np
import pandas as pd
import random
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer



In [3]:
import pandas as pd
import random

# Function to generate synthetic dataset with realistic fuel consumption calculations
def generate_synthetic_data(num_samples=1500):
    # Available implements and their standard HP requirements
    implements = ["Plow", "Harrow", "Rotavator", "Seeder", "Sprayer"]
    soil_types = ["Loamy", "Clay", "Sandy", "Silty"]
    terrain_conditions = ["Flat", "Slightly Hilly", "Hilly", "Very Hilly"]
    
    # Standard HP requirement for each implement
    standard_hp = {
        "Plow": 50,
        "Harrow": 40,
        "Rotavator": 45,
        "Seeder": 30,
        "Sprayer": 20
    }
    
    # Terrain and soil efficiency factors
    terrain_efficiency = {"Flat": 0.9, "Slightly Hilly": 0.8, "Hilly": 0.7, "Very Hilly": 0.6}
    soil_resistance = {"Loamy": 1.0, "Clay": 1.2, "Sandy": 0.8, "Silty": 1.1}
    
    data = []

    for _ in range(num_samples):
        # Randomly select an implement and generate random values
        implement = random.choice(implements)
        load_percentage = random.uniform(50, 100)  # Load percentage between 50% and 100%
        speed = random.uniform(2, 10)  # Speed in km/h
        area_covered = random.uniform(0.5, 5)  # Area in hectares
        HP_of_tractor = random.uniform(30, 100)  # HP of the tractor
        
        # Generate working depth and width based on the implement type
        if implement in ["Plow", "Harrow", "Rotavator"]:
            working_depth = random.uniform(5, 30)  # Depth in cm
            working_width = None
        else:
            working_depth = None
            working_width = random.uniform(1, 4)  # Width in meters

        # Select random soil type and terrain condition
        soil_type = random.choice(soil_types)
        terrain_condition = random.choice(terrain_conditions)

        # Calculate the field efficiency factor based on soil and terrain
        efficiency_factor = terrain_efficiency[terrain_condition] * soil_resistance[soil_type]

        # Calculate the fuel consumption factor (CF) using load, speed, HP, and implement properties
        if implement in ["Plow", "Harrow", "Rotavator"]:
            cf = (load_percentage / 100) * speed * (working_depth / 10) * (HP_of_tractor / standard_hp[implement])
        else:
            cf = (load_percentage / 100) * speed * working_width * (HP_of_tractor / standard_hp[implement])
        
        # Prevent division by zero or negative CF
        cf = max(cf, 0.1)

        # Calculate fuel consumption with some noise and efficiency factor
        fuel_consumed = area_covered * cf * efficiency_factor * random.uniform(0.8, 1.2)
        fuel_efficiency = fuel_consumed / area_covered  # Liters per hectare

        # Store the generated data in a list
        data.append([
            implement, load_percentage, speed, area_covered, HP_of_tractor,
            working_depth, working_width, round(fuel_consumed, 2), 
            round(fuel_efficiency, 2), soil_type, terrain_condition
        ])

    # Define column names for the DataFrame
    columns = [
        "Implement", "Load_Percentage", "Speed", "Area_Covered", "HP_of_Tractor",
        "Working_Depth", "Working_Width", "Fuel_Consumed", "Fuel_Efficiency",
        "Soil_Type", "Terrain_Condition"
    ]
    
    # Return the generated data as a pandas DataFrame
    return pd.DataFrame(data, columns=columns)



In [4]:
# Generate synthetic dataset with 1500 samples
df = generate_synthetic_data(num_samples=1500)



In [5]:
df.head(20)

Unnamed: 0,Implement,Load_Percentage,Speed,Area_Covered,HP_of_Tractor,Working_Depth,Working_Width,Fuel_Consumed,Fuel_Efficiency,Soil_Type,Terrain_Condition
0,Sprayer,77.697038,8.382046,4.156897,89.218436,,3.705381,230.27,55.4,Sandy,Hilly
1,Harrow,70.315759,7.217708,1.985688,74.262975,15.927367,,21.92,11.04,Silty,Slightly Hilly
2,Seeder,78.35607,5.990942,1.3571,42.659111,,2.289578,16.3,12.01,Silty,Slightly Hilly
3,Seeder,96.368562,2.783136,1.777399,38.748544,,3.347624,14.05,7.9,Clay,Hilly
4,Seeder,99.960055,4.831449,2.106862,90.190487,,1.863376,43.28,20.54,Clay,Hilly
5,Sprayer,89.464749,3.255823,3.156366,90.438543,,2.78464,85.28,27.02,Clay,Very Hilly
6,Harrow,83.801401,3.699356,4.689264,80.029552,20.791613,,53.46,11.4,Silty,Hilly
7,Harrow,60.554939,8.093816,3.784626,36.484805,28.15824,,24.89,6.58,Sandy,Very Hilly
8,Rotavator,62.034286,6.301096,1.071101,84.292419,8.267809,,4.35,4.06,Loamy,Hilly
9,Plow,97.665438,7.644479,2.578079,38.694042,27.377545,,27.71,10.75,Silty,Very Hilly


In [6]:
df.describe()

Unnamed: 0,Load_Percentage,Speed,Area_Covered,HP_of_Tractor,Working_Depth,Working_Width,Fuel_Consumed,Fuel_Efficiency
count,1500.0,1500.0,1500.0,1500.0,937.0,563.0,1500.0,1500.0
mean,74.950943,6.049679,2.745458,64.421702,17.026364,2.476127,39.03566,14.220167
std,14.556429,2.328599,1.295516,19.95595,7.00098,0.894541,50.576448,14.861673
min,50.107658,2.010676,0.500656,30.096324,5.056763,1.004518,0.47,0.67
25%,62.294131,3.9891,1.597492,46.894749,10.920086,1.69339,10.585,4.935
50%,74.915902,6.071643,2.770816,64.070141,16.990964,2.511317,21.98,9.755
75%,87.945296,8.096965,3.84208,80.750867,22.639631,3.234979,48.5825,17.9225
max,99.994824,9.999241,4.998816,99.965201,29.977556,3.993823,612.36,151.48


In [7]:

# Function to detect and remove outliers using IQR
def remove_outliers_iqr(df, column):
    Q1 = df[column].quantile(0.25)  # 1st Quartile (25th percentile)
    Q3 = df[column].quantile(0.75)  # 3rd Quartile (75th percentile)
    IQR = Q3 - Q1  # Interquartile Range

    # Define the upper and lower bounds for outliers
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    # Filter out the outliers
    cleaned_df = df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]
    
    print(f"Removed {len(df) - len(cleaned_df)} outliers from {column}.")
    return cleaned_df

print("Original Data Shape:", df.shape)

# Remove outliers from the 'Fuel_Efficiency' column and reassign to df
df = remove_outliers_iqr(df, 'Fuel_Efficiency')
print("Cleaned Data Shape:", df.shape)

df.head()


Original Data Shape: (1500, 11)
Removed 98 outliers from Fuel_Efficiency.
Cleaned Data Shape: (1402, 11)


Unnamed: 0,Implement,Load_Percentage,Speed,Area_Covered,HP_of_Tractor,Working_Depth,Working_Width,Fuel_Consumed,Fuel_Efficiency,Soil_Type,Terrain_Condition
1,Harrow,70.315759,7.217708,1.985688,74.262975,15.927367,,21.92,11.04,Silty,Slightly Hilly
2,Seeder,78.35607,5.990942,1.3571,42.659111,,2.289578,16.3,12.01,Silty,Slightly Hilly
3,Seeder,96.368562,2.783136,1.777399,38.748544,,3.347624,14.05,7.9,Clay,Hilly
4,Seeder,99.960055,4.831449,2.106862,90.190487,,1.863376,43.28,20.54,Clay,Hilly
5,Sprayer,89.464749,3.255823,3.156366,90.438543,,2.78464,85.28,27.02,Clay,Very Hilly


In [8]:
df.to_csv('dataset.csv')

In [9]:
df.shape

(1402, 11)

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1402 entries, 1 to 1499
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Implement          1402 non-null   object 
 1   Load_Percentage    1402 non-null   float64
 2   Speed              1402 non-null   float64
 3   Area_Covered       1402 non-null   float64
 4   HP_of_Tractor      1402 non-null   float64
 5   Working_Depth      936 non-null    float64
 6   Working_Width      466 non-null    float64
 7   Fuel_Consumed      1402 non-null   float64
 8   Fuel_Efficiency    1402 non-null   float64
 9   Soil_Type          1402 non-null   object 
 10  Terrain_Condition  1402 non-null   object 
dtypes: float64(8), object(3)
memory usage: 131.4+ KB


In [11]:
# Prepare the features and target variables
features = df[["Load_Percentage", "Speed", "Area_Covered", "HP_of_Tractor", "Working_Depth", "Working_Width", "Soil_Type", "Terrain_Condition"]]
target_fuel_consumed = df["Fuel_Consumed"]
target_fuel_efficiency = df["Fuel_Efficiency"]



In [12]:
# Fill NaNs for Working_Width with 0 for Sprayer and Seeder
features.fillna(0, inplace=True)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  features.fillna(0, inplace=True)


In [13]:
# Split the data into training and testing sets
X_train, X_test, y_train_fuel, y_test_fuel = train_test_split(features, target_fuel_consumed, test_size=0.2, random_state=42)
_, _, y_train_efficiency, y_test_efficiency = train_test_split(features, target_fuel_efficiency, test_size=0.2, random_state=42)


In [14]:
# Define categorical and numerical columns
categorical_cols = ["Soil_Type", "Terrain_Condition"]
numerical_cols = ["Load_Percentage", "Speed", "Area_Covered", "HP_of_Tractor", "Working_Depth", "Working_Width"]



In [15]:
# Create a preprocessor that applies OneHotEncoder to categorical features
preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', numerical_cols),
        ('cat', OneHotEncoder(drop='first'), categorical_cols)
    ]
)




In [16]:
# Define models and hyperparameters for GridSearchCV
models = {
    'Random Forest': RandomForestRegressor(),
    'Linear Regression': LinearRegression(),
    'Support Vector Regressor': SVR()
}

params = {
    'Random Forest': {
        'model__n_estimators': [50, 100, 200],
        'model__max_depth': [None, 10, 20, 30]
    },
    'Linear Regression': {
        # No hyperparameters to tune
    },
    'Support Vector Regressor': {
        'model__C': [0.1, 1, 10],
        'model__kernel': ['linear', 'rbf']
    }
}


In [17]:
best_models = {}
results = {}

In [18]:
# Perform GridSearchCV for each model
for model_name, model in models.items():
    # Create a pipeline that combines preprocessing and model training
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', model)
    ])
    
    param_grid = params[model_name]
    
    grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='neg_mean_absolute_error', n_jobs=-1)
    grid_search.fit(X_train, y_train_fuel)
    
    best_models[model_name] = grid_search.best_estimator_
    results[model_name] = grid_search.best_params_



In [19]:
# Predictions and evaluation for best fuel consumption model
predictions_fuel = best_models['Random Forest'].predict(X_test) 
mae_fuel = mean_absolute_error(y_test_fuel, predictions_fuel)
r2_fuel = r2_score(y_test_fuel, predictions_fuel)



In [20]:
# Display results
print("Best Models and Parameters for Fuel Consumption Prediction:")
for model_name, params in results.items():
    print(f"{model_name}: {params}")

print(f"\nFuel Consumption Model - MAE: {mae_fuel:.2f}, R^2: {r2_fuel:.2f}")


Best Models and Parameters for Fuel Consumption Prediction:
Random Forest: {'model__max_depth': None, 'model__n_estimators': 100}
Linear Regression: {}
Support Vector Regressor: {'model__C': 10, 'model__kernel': 'linear'}

Fuel Consumption Model - MAE: 7.45, R^2: 0.83


In [21]:
import pickle
import joblib
import h5py

# Save the trained pipeline as a .pkl file
with open('fuel_efficiency_pipeline.pkl', 'wb') as f:
    pickle.dump(best_models['Random Forest'], f)

# Save the RandomForestRegressor model in .h5 format 
model = best_models['Random Forest'].named_steps['model']

# Convert to h5 format using joblib (model must be compatible)
joblib.dump(model, 'fuel_efficiency_model.pkl')

# Confirmation
print("Models and pipeline have been successfully exported.")


Models and pipeline have been successfully exported.


In [22]:
import numpy as np
import pandas as pd
import sklearn

# Print versions
print("NumPy version:", np.__version__)
print("Pandas version:", pd.__version__)
print("scikit-learn version:", sklearn.__version__)


NumPy version: 1.26.4
Pandas version: 2.1.4
scikit-learn version: 1.2.2
