In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, BaggingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import Lasso, Ridge, LinearRegression
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score
import warnings
warnings.filterwarnings("ignore")

In [3]:
data = pd.read_csv('IIRS-Punjab4 - Sheet1.csv')
data

Unnamed: 0,Year,Name of District,Acreage (in thousand hectares),Average NDVI,Production (in per 1000 tonnes),Average Temperature(in degree celcius),Rainfall (in mm),Humidity (in %)
0,2023,Amritsar,186.72,0.3780,,18.52,,73.43
1,2023,Barnala,111.91,0.3980,,18.86,,77.86
2,2023,Bathinda,256.00,0.3900,,20.00,,48.71
3,2023,Faridkot,116.00,0.3937,,28.00,,75.30
4,2023,Fatehgarh Sahib,84.00,0.3696,,21.45,,67.74
...,...,...,...,...,...,...,...,...
225,2014,Rupnagar,189.00,,858.0,,159.5,
226,2014,Sahibzada Ajit Singh Nagar,50.00,,215.0,,191.6,
227,2014,Sangrur,284.00,,1567.0,18.57,87.2,75.14
228,2014,Shahid Bhagat Singh Nagar,73.80,,371.0,18.57,191.4,75.14


In [4]:
data.columns

Index(['Year', 'Name of District', 'Acreage (in thousand hectares)',
       'Average NDVI', 'Production (in per 1000 tonnes)',
       'Average Temperature(in degree celcius)', 'Rainfall (in mm)',
       'Humidity (in %)'],
      dtype='object')

In [5]:
data.drop(['Year','Name of District'],axis=1, inplace=True)

In [6]:
data

Unnamed: 0,Acreage (in thousand hectares),Average NDVI,Production (in per 1000 tonnes),Average Temperature(in degree celcius),Rainfall (in mm),Humidity (in %)
0,186.72,0.3780,,18.52,,73.43
1,111.91,0.3980,,18.86,,77.86
2,256.00,0.3900,,20.00,,48.71
3,116.00,0.3937,,28.00,,75.30
4,84.00,0.3696,,21.45,,67.74
...,...,...,...,...,...,...
225,189.00,,858.0,,159.5,
226,50.00,,215.0,,191.6,
227,284.00,,1567.0,18.57,87.2,75.14
228,73.80,,371.0,18.57,191.4,75.14


In [7]:
data.info

<bound method DataFrame.info of      Acreage (in thousand hectares)  Average NDVI  \
0                            186.72        0.3780   
1                            111.91        0.3980   
2                            256.00        0.3900   
3                            116.00        0.3937   
4                             84.00        0.3696   
..                              ...           ...   
225                          189.00           NaN   
226                           50.00           NaN   
227                          284.00           NaN   
228                           73.80           NaN   
229                          183.00           NaN   

     Production (in per 1000 tonnes)  Average Temperature(in degree celcius)  \
0                                NaN                                   18.52   
1                                NaN                                   18.86   
2                                NaN                                   20.00   
3          

In [8]:
data.isnull().sum()

Acreage (in thousand hectares)             8
Average NDVI                              57
Production (in per 1000 tonnes)           25
Average Temperature(in degree celcius)    19
Rainfall (in mm)                          66
Humidity (in %)                           31
dtype: int64

In [9]:
X = data.drop('Production (in per 1000 tonnes)', axis=1)
Y = data['Production (in per 1000 tonnes)']

In [10]:
X = X.fillna(X.median())

if Y.isnull().any():
    Y = Y.fillna(Y.median())

In [11]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [12]:
models = [
    ('Random Forest', RandomForestRegressor(), {
        'n_estimators': [50, 100, 200],
        'max_depth': [None, 10, 20, 30],
    }),
    ('Gradient Boosting', GradientBoostingRegressor(), {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.1, 0.2],
    }),
    ('AdaBoost', AdaBoostRegressor(base_estimator=DecisionTreeRegressor()), {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.1, 0.2],
    }),
    ('Bagging', BaggingRegressor(base_estimator=DecisionTreeRegressor()), {
        'n_estimators': [10, 20, 30],
        'max_samples': [0.5, 0.7, 0.9],
        'max_features': [0.5, 0.7, 0.9],
    }),
    ('Support Vector Machine', SVR(), {
        'C': [0.1, 1, 10],
        'kernel': ['linear', 'poly', 'rbf'],
    }),
    ('K-Nearest Neighbors', KNeighborsRegressor(), {
        'n_neighbors': [3, 5, 7],
        'weights': ['uniform', 'distance'],
    }),
    ('Lasso Regression', Lasso(), {
        'alpha': [0.001, 0.01, 0.1, 1],
    }),
    ('Ridge Regression', Ridge(), {
        'alpha': [0.001, 0.01, 0.1, 1],
    }),
    ('Linear Regression', LinearRegression(), {}),
    ('XGBoost Regressor', XGBRegressor(), {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.1, 0.2],
    }),
]

In [13]:
results = {}

for name, model, param_grid in models:
    grid_search = GridSearchCV(model, param_grid, cv=5, scoring='neg_mean_squared_error')
    grid_search.fit(X_train, Y_train)
    
    best_model = grid_search.best_estimator_
    Y_pred = best_model.predict(X_test)
    
    mse = mean_squared_error(Y_test, Y_pred)
    r2 = r2_score(Y_test, Y_pred)
    
    results[name] = {
        'Best Model': best_model,
        'Mean Squared Error': mse,
        'R-squared': r2,
    }

for name, metrics in results.items():
    print(f"Model: {name}")
    print(f"Best Model: {metrics['Best Model']}")
    print(f"Mean Squared Error: {metrics['Mean Squared Error']}")
    print(f"R-squared: {metrics['R-squared']}")
    print("\n")

best_model_name = min(results, key=lambda model: results[model]['Mean Squared Error'])
best_model = results[best_model_name]['Best Model']
print(f"The best model is {best_model_name}")

Model: Random Forest
Best Model: RandomForestRegressor(max_depth=20)
Mean Squared Error: 17761.187788158302
R-squared: 0.8801377976502252


Model: Gradient Boosting
Best Model: GradientBoostingRegressor(learning_rate=0.01, n_estimators=200)
Mean Squared Error: 21971.89082340116
R-squared: 0.8517216722612697


Model: AdaBoost
Best Model: AdaBoostRegressor(base_estimator=DecisionTreeRegressor(), learning_rate=0.01,
                  n_estimators=200)
Mean Squared Error: 16504.261095652175
R-squared: 0.8886202259288366


Model: Bagging
Best Model: BaggingRegressor(base_estimator=DecisionTreeRegressor(), max_features=0.9,
                 max_samples=0.5, n_estimators=30)
Mean Squared Error: 19137.953521549913
R-squared: 0.8708466300271879


Model: Support Vector Machine
Best Model: SVR(C=1, kernel='linear')
Mean Squared Error: 21598.255295749343
R-squared: 0.8542431690076937


Model: K-Nearest Neighbors
Best Model: KNeighborsRegressor(n_neighbors=3)
Mean Squared Error: 24444.87299178744
R

In [16]:
def predict_production(best_model, input_features):
    production_predictions = best_model.predict(input_features)

    return production_predictions

input_data = {}
for feature in X.columns:
    value = float(input(f"Enter the value for {feature}: "))
    input_data[feature] = [value]

input_features = pd.DataFrame(input_data)

predicted_production = predict_production(best_model, input_features)
print(f"Predicted Production: {predicted_production[0]:.2f} per 1000 tonnes")

Enter the value for Acreage (in thousand hectares): 174
Enter the value for Average NDVI: 0.3143
Enter the value for Average Temperature(in degree celcius): 28.14
Enter the value for Rainfall (in mm): 166.8
Enter the value for Humidity (in %): 44.42
Predicted Production: 864.44 per 1000 tonnes
