In [1]:
import pandas as pd
import numpy as np

# Fix the legacy global NumPy seed so every run draws the same numbers.
np.random.seed(42)

# Number of rows in the synthetic dataset.
num_records = 10000

# Feature draws. The order of these calls matters: every call consumes the
# shared global random stream, so reordering them would change the data.
temperature = np.random.uniform(low=10, high=40, size=num_records)          # °C
rainfall = np.random.uniform(low=100, high=1000, size=num_records)          # mm
soil_quality = np.random.randint(low=1, high=11, size=num_records)          # integer index, 1..10 (high is exclusive)
fertilizer_use = np.random.uniform(low=50, high=300, size=num_records)      # kg/hectare
humidity = np.random.uniform(low=30, high=90, size=num_records)             # %
pesticide_use = np.random.uniform(low=0, high=50, size=num_records)         # kg/hectare
sunlight_hours = np.random.uniform(low=1000, high=3000, size=num_records)   # hours
plant_density = np.random.uniform(low=5000, high=30000, size=num_records)   # plants/hectare
irrigation = np.random.uniform(low=0, high=500, size=num_records)           # mm of irrigation water
crop_type = np.random.choice(['Wheat', 'Corn', 'Soybean', 'Rice'], size=num_records)  # uniformly sampled label
farm_area = np.random.uniform(low=1, high=100, size=num_records)            # hectares

# Target: a fixed linear combination of the numeric features plus Gaussian
# noise (mean 0, standard deviation 10). crop_type does not enter the formula.
# The terms are listed, and therefore summed, in a fixed left-to-right order.
weighted_terms = [
    (0.25, temperature),
    (0.15, rainfall),
    (1.2, soil_quality),
    (0.35, fertilizer_use),
    (0.2, humidity),
    (0.1, pesticide_use),
    (0.25, sunlight_hours),
    (0.3, plant_density / 10000),  # density is rescaled before weighting
    (0.2, irrigation),
    (0.05, farm_area),
]
noise = np.random.normal(loc=0, scale=10, size=num_records)
crop_yield = sum(weight * values for weight, values in weighted_terms) + noise

# Assemble the features and the target into a single frame
# (keyword order fixes the column order).
data = pd.DataFrame(dict(
    temperature=temperature,
    rainfall=rainfall,
    soil_quality=soil_quality,
    fertilizer_use=fertilizer_use,
    humidity=humidity,
    pesticide_use=pesticide_use,
    sunlight_hours=sunlight_hours,
    plant_density=plant_density,
    irrigation=irrigation,
    crop_type=crop_type,
    farm_area=farm_area,
    crop_yield=crop_yield,
))

# Write the dataset to disk without the row index.
data.to_csv('synthetic_crop_yield_data_with_farm_area.csv', index=False)

In [2]:
# Bind a shorter working name to the generated frame. This is an alias, not a
# copy: `df` and `data` refer to the same DataFrame object.
df = data
# Bare last expression so the notebook renders the frame.
df

Unnamed: 0,temperature,rainfall,soil_quality,fertilizer_use,humidity,pesticide_use,sunlight_hours,plant_density,irrigation,crop_type,farm_area,crop_yield
0,21.236204,436.276737,6,177.685143,52.032843,39.042416,1833.475971,26877.056778,441.685442,Wheat,29.430887,694.905745
1,38.521429,399.620887,2,126.260303,89.850900,24.781501,1592.618609,14578.437798,393.287081,Soybean,23.893473,622.557071
2,31.959818,258.538521,4,177.801207,68.109075,15.893629,1230.163861,13929.584174,420.933349,Soybean,92.380189,513.571902
3,27.959755,646.540003,7,269.746380,71.728352,11.822656,1344.350809,18341.806110,418.670916,Wheat,22.448011,646.856935
4,14.680559,528.961744,6,222.851587,44.300829,5.851395,1699.811973,16285.311371,37.389654,Soybean,10.055742,615.551133
...,...,...,...,...,...,...,...,...,...,...,...,...
9995,35.729680,889.334766,7,298.909099,36.112842,48.455382,2881.600079,18285.570166,386.179757,Soybean,95.674909,1069.008679
9996,36.925265,142.132562,8,122.463617,77.386463,22.347092,2937.526445,17413.910911,254.042679,Corn,44.368234,883.853253
9997,38.401237,373.328622,7,296.162172,32.809687,9.469092,2945.345034,15873.577117,69.253301,Wheat,3.816522,953.298367
9998,21.924640,498.988006,3,130.099137,31.028194,1.294311,1314.762604,12265.410820,448.135961,Corn,40.709600,533.569742


In [3]:
# Preview the first rows of the frame as a quick sanity check.
df.head()

Unnamed: 0,temperature,rainfall,soil_quality,fertilizer_use,humidity,pesticide_use,sunlight_hours,plant_density,irrigation,crop_type,farm_area,crop_yield
0,21.236204,436.276737,6,177.685143,52.032843,39.042416,1833.475971,26877.056778,441.685442,Wheat,29.430887,694.905745
1,38.521429,399.620887,2,126.260303,89.8509,24.781501,1592.618609,14578.437798,393.287081,Soybean,23.893473,622.557071
2,31.959818,258.538521,4,177.801207,68.109075,15.893629,1230.163861,13929.584174,420.933349,Soybean,92.380189,513.571902
3,27.959755,646.540003,7,269.74638,71.728352,11.822656,1344.350809,18341.80611,418.670916,Wheat,22.448011,646.856935
4,14.680559,528.961744,6,222.851587,44.300829,5.851395,1699.811973,16285.311371,37.389654,Soybean,10.055742,615.551133


In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,temperature,rainfall,soil_quality,fertilizer_use,humidity,pesticide_use,sunlight_hours,plant_density,irrigation,farm_area,crop_yield
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,24.824787,554.076889,5.5637,174.325343,60.184569,25.246392,2005.784425,17453.736326,250.710635,50.543996,726.227414
std,8.628904,260.365096,2.848465,72.058655,17.319765,14.423698,580.207369,7259.499436,143.491473,28.625634,154.387123
min,10.000349,100.14197,1.0,50.025593,30.005596,0.006448,1000.095078,5003.648485,0.139808,1.022661,333.555348
25%,17.389866,328.551221,3.0,112.837597,45.292827,12.809923,1507.673478,11191.225044,127.163092,25.546658,601.384279
50%,24.775859,555.307099,6.0,174.258859,60.165623,25.229455,1998.607515,17352.255952,250.237516,50.84339,726.229666
75%,32.20019,780.831299,8.0,236.664835,75.195904,37.893993,2516.389592,23817.173319,374.404476,75.069806,851.34729
max,39.99153,999.932344,10.0,299.993467,89.987827,49.993077,2999.790083,29997.322914,499.85173,99.991286,1095.876088


In [5]:
# Percentage of missing values per column (mean of the boolean null mask, scaled to 0-100).
df.isna().mean() * 100

temperature       0.0
rainfall          0.0
soil_quality      0.0
fertilizer_use    0.0
humidity          0.0
pesticide_use     0.0
sunlight_hours    0.0
plant_density     0.0
irrigation        0.0
crop_type         0.0
farm_area         0.0
crop_yield        0.0
dtype: float64

In [6]:
# Skewness of each numeric column; numeric_only=True leaves out the
# non-numeric crop_type column.
df.skew(numeric_only = True) # Checking skewness of the columns

temperature       0.024880
rainfall         -0.018445
soil_quality     -0.018707
fertilizer_use    0.009307
humidity         -0.004843
pesticide_use    -0.022265
sunlight_hours    0.000247
plant_density     0.012273
irrigation        0.003098
farm_area        -0.008353
crop_yield       -0.012323
dtype: float64

In [7]:
# (number of rows, number of columns) of the frame.
df.shape

(10000, 12)

In [8]:
# Column labels, in frame order.
df.columns

Index(['temperature', 'rainfall', 'soil_quality', 'fertilizer_use', 'humidity',
       'pesticide_use', 'sunlight_hours', 'plant_density', 'irrigation',
       'crop_type', 'farm_area', 'crop_yield'],
      dtype='object')

In [9]:
# Per-column dtype and non-null count, plus the frame's memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   temperature     10000 non-null  float64
 1   rainfall        10000 non-null  float64
 2   soil_quality    10000 non-null  int64  
 3   fertilizer_use  10000 non-null  float64
 4   humidity        10000 non-null  float64
 5   pesticide_use   10000 non-null  float64
 6   sunlight_hours  10000 non-null  float64
 7   plant_density   10000 non-null  float64
 8   irrigation      10000 non-null  float64
 9   crop_type       10000 non-null  object 
 10  farm_area       10000 non-null  float64
 11  crop_yield      10000 non-null  float64
dtypes: float64(10), int64(1), object(1)
memory usage: 937.6+ KB


In [10]:
# Summary statistics for every column; include="all" adds unique/top/freq
# rows for the non-numeric crop_type column alongside the numeric statistics.
df.describe(include = "all")

Unnamed: 0,temperature,rainfall,soil_quality,fertilizer_use,humidity,pesticide_use,sunlight_hours,plant_density,irrigation,crop_type,farm_area,crop_yield
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000,10000.0,10000.0
unique,,,,,,,,,,4,,
top,,,,,,,,,,Rice,,
freq,,,,,,,,,,2520,,
mean,24.824787,554.076889,5.5637,174.325343,60.184569,25.246392,2005.784425,17453.736326,250.710635,,50.543996,726.227414
std,8.628904,260.365096,2.848465,72.058655,17.319765,14.423698,580.207369,7259.499436,143.491473,,28.625634,154.387123
min,10.000349,100.14197,1.0,50.025593,30.005596,0.006448,1000.095078,5003.648485,0.139808,,1.022661,333.555348
25%,17.389866,328.551221,3.0,112.837597,45.292827,12.809923,1507.673478,11191.225044,127.163092,,25.546658,601.384279
50%,24.775859,555.307099,6.0,174.258859,60.165623,25.229455,1998.607515,17352.255952,250.237516,,50.84339,726.229666
75%,32.20019,780.831299,8.0,236.664835,75.195904,37.893993,2516.389592,23817.173319,374.404476,,75.069806,851.34729


In [11]:
# Number of distinct values in each column.
df.nunique()

temperature       10000
rainfall          10000
soil_quality         10
fertilizer_use    10000
humidity          10000
pesticide_use     10000
sunlight_hours    10000
plant_density     10000
irrigation        10000
crop_type             4
farm_area         10000
crop_yield        10000
dtype: int64

In [12]:
# Row count per crop type, to check how evenly the categories are represented.
df["crop_type"].value_counts()

crop_type
Rice       2520
Corn       2512
Soybean    2489
Wheat      2479
Name: count, dtype: int64

In [14]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
#from sklearn.inspection import partial_dependence, plot_partial_dependence
import matplotlib.pyplot as plt
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

In [17]:
# Separate the feature matrix (every column except the target) from the
# regression target.
target_column = 'crop_yield'
X = df.drop(target_column, axis=1)
y = df[target_column]

# Plain-text dump of the features followed by the target.
print(X, y, sep='\n')

      temperature    rainfall  soil_quality  fertilizer_use   humidity  \
0       21.236204  436.276737             6      177.685143  52.032843   
1       38.521429  399.620887             2      126.260303  89.850900   
2       31.959818  258.538521             4      177.801207  68.109075   
3       27.959755  646.540003             7      269.746380  71.728352   
4       14.680559  528.961744             6      222.851587  44.300829   
...           ...         ...           ...             ...        ...   
9995    35.729680  889.334766             7      298.909099  36.112842   
9996    36.925265  142.132562             8      122.463617  77.386463   
9997    38.401237  373.328622             7      296.162172  32.809687   
9998    21.924640  498.988006             3      130.099137  31.028194   
9999    16.514212  255.038333             6       52.930779  73.949921   

      pesticide_use  sunlight_hours  plant_density  irrigation crop_type  \
0         39.042416     1833.475971

In [18]:
# Hold out 20% of the rows as a test set and train on the remaining 80%;
# random_state pins the shuffle so the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [20]:
# Columns that need encoding; every other feature column is treated as numeric.
categorical_features = ['crop_type']
numerical_features = list(
    filter(lambda name: name not in categorical_features, X.columns)
)
print(categorical_features, numerical_features)

['crop_type'] ['temperature', 'rainfall', 'soil_quality', 'fertilizer_use', 'humidity', 'pesticide_use', 'sunlight_hours', 'plant_density', 'irrigation', 'farm_area']


In [22]:
# Column-wise preprocessing used by the model pipelines:
#   - numeric columns are forwarded unchanged ('passthrough');
#   - categorical columns are one-hot encoded, with drop='first' so one
#     indicator column per feature is omitted.
numeric_step = ('num', 'passthrough', numerical_features)
categorical_step = ('cat', OneHotEncoder(drop='first'), categorical_features)
preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

In [27]:
import pickle

from sklearn.base import clone

# One pipeline per candidate model. Every pipeline gets its OWN clone of the
# preprocessor: a Pipeline fits its steps in place, so passing the same
# ColumnTransformer instance to all four would mean that fitting any one
# pipeline silently refits the preprocessor inside the others. clone() returns
# an unfitted copy with identical parameters, keeping the pipelines
# independent (also important once one of them is pickled on its own).
model_lr = Pipeline(steps=[
    ('preprocessor', clone(preprocessor)),
    ('regressor', LinearRegression())
])

model_ridge = Pipeline(steps=[
    ('preprocessor', clone(preprocessor)),
    ('ridge', Ridge(alpha=0.5))
])

# random_state is fixed for the tree ensembles so their results are repeatable.
model_rf = Pipeline(steps=[
    ('preprocessor', clone(preprocessor)),
    ('rf', RandomForestRegressor(n_estimators=100, random_state=42))
])

model_gb = Pipeline(steps=[
    ('preprocessor', clone(preprocessor)),
    ('gb', GradientBoostingRegressor(n_estimators=100, random_state=42))
])

# The pipelines are deliberately left unfitted here; they are trained in the
# evaluation loop that compares them.

In [31]:
from sklearn.metrics import mean_squared_error, r2_score

# Candidate pipelines and their display names (kept as parallel lists).
models = [model_lr, model_ridge, model_rf, model_gb]
model_names = ["Linear Regression", "Ridge Regression", "Random Forest", "Gradient Boosting"]

# Running winner: the pipeline with the lowest test-set MSE seen so far.
best_model, best_score = None, float("inf")

for model, name in zip(models, model_names):
    # fit() returns the pipeline itself, so training and prediction can be chained.
    y_pred = model.fit(X_train, y_train).predict(X_test)

    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print(f"\n{name}:")
    print("  Mean Squared Error:", mse)
    print("  R-squared:", r2)

    # Strict comparison: on a tie the earlier model in the list is kept.
    if mse < best_score:
        best_model, best_score = model, mse

print("\nBest Model:")
print(best_model)


Linear Regression:
  Mean Squared Error: 103.91161754343516
  R-squared: 0.9954803039430735

Ridge Regression:
  Mean Squared Error: 103.91151775733182
  R-squared: 0.9954803082833279

Random Forest:
  Mean Squared Error: 232.53689056578943
  R-squared: 0.9898856731111819

Gradient Boosting:
  Mean Squared Error: 166.80942967472936
  R-squared: 0.9927445271338993

Best Model:
Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num', 'passthrough',
                                                  ['temperature', 'rainfall',
                                                   'soil_quality',
                                                   'fertilizer_use', 'humidity',
                                                   'pesticide_use',
                                                   'sunlight_hours',
                                                   'plant_density',
                                                   'irrigation', 'farm_area']),
    

In [32]:
# Serialize the selected pipeline to disk with pickle and report where it went.
# NOTE: only load pickle files from a trusted source; unpickling can run arbitrary code.
model_filename = 'best_model.pkl'
with open(model_filename, mode='wb') as model_file:
    pickle.dump(best_model, model_file)
print(f"\nBest model saved as {model_filename} with Mean Squared Error: {best_score}")


Best model saved as best_model.pkl with Mean Squared Error: 103.91151775733182
