In [None]:
# pandas is not imported anywhere else in the visible notebook, so bring it
# into scope here; on a fresh kernel this cell would otherwise raise NameError.
import pandas as pd

# Read yield.csv from the working directory and preview the first rows.
df = pd.read_csv("yield.csv")
df.head()

In [None]:
# Discard every row that contains at least one missing value.
df = df.dropna(axis=0, how="any")

In [None]:
# Create the label encoders and narrow the frame to the modelling columns.
from sklearn.preprocessing import StandardScaler, LabelEncoder

# One encoder per categorical column. They are fitted in a later cell, once
# the names have been whitespace-stripped; fitting them here was dead work
# because the column selection below discarded the resulting code columns.
label_enc_district = LabelEncoder()
label_enc_crop = LabelEncoder()

# Select relevant columns (a KeyError is raised if any of them is missing).
# .copy() gives later cells an independent frame to add columns to, which
# avoids SettingWithCopyWarning on pandas versions without copy-on-write.
df = df[['District_Name', 'Crop', 'Area', 'Production']].copy()

In [None]:
# Yield is production per unit of area. Rows whose area is zero (or negative)
# would produce inf/NaN targets, which scikit-learn estimators reject at fit
# time, so they are removed before dividing. .copy() makes the filtered frame
# independent so the column assignment below does not write into a slice.
df = df[df['Area'] > 0].copy()
df['Yield'] = df['Production'] / df['Area']

In [None]:
# Normalise both categorical columns to stripped strings so that names which
# differ only by surrounding whitespace map to the same label.
for text_column in ('District_Name', 'Crop'):
    df[text_column] = df[text_column].astype(str).str.strip()

# Fit each encoder on its cleaned column and store the integer codes.
district_codes = label_enc_district.fit_transform(df['District_Name'])
df['District_Code'] = district_codes
crop_codes = label_enc_crop.fit_transform(df['Crop'])
df['Crop_Code'] = crop_codes

# Sanity check: the column list and a preview should include
# 'District_Code' and 'Crop_Code'.
print(df.columns)
print(df.head())

In [None]:

# Feature matrix: the two encoded categorical columns.
X = df.loc[:, ['District_Code', 'Crop_Code']]
# Target vector: yield (production divided by area).
y = df.loc[:, 'Yield']


In [None]:

# train_test_split is not imported anywhere else in the visible notebook;
# import it here so this cell runs on a fresh kernel.
from sklearn.model_selection import train_test_split

# Split into train and test sets: 20% held out, fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Preview how the names map to their integer codes.
print(df[['District_Name', 'District_Code', 'Crop', 'Crop_Code']].head())


In [None]:

# Standardise the features. The scaler learns its statistics from the
# training split only and is then applied unchanged to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import ExtraTreeClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score,confusion_matrix
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_absolute_error, mean_squared_error
# Candidate regressors to compare on the held-out split. The two ensemble
# models are seeded (same seed as the train/test split) so that the MAE
# ranking is repeatable from run to run.
models = {
    "Random Forest": RandomForestRegressor(random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(random_state=42),
    "Linear Regression": LinearRegression()
}

# Track the candidate with the lowest mean absolute error seen so far.
best_model = None
best_mae = float("inf")

for name, model in models.items():
    # Fit on the scaled training data and score on the scaled test data.
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)
    mae = mean_absolute_error(y_test, y_pred)
    print(f"{name} - Mean Absolute Error: {mae}")
    if mae < best_mae:
        best_mae = mae
        best_model = model

# NOTE: after the loop, `model`, `y_pred` and `mae` still refer to the last
# candidate evaluated, not to the best one; use `best_model` for the winner.
print(f"Best Model: {best_model}")

In [None]:


# Train Random Forest model
# Seeded so that the fitted model, its metrics and the pickled artifact are
# reproducible across runs (same seed as the train/test split).
model = RandomForestRegressor(random_state=42)
model.fit(X_train_scaled, y_train)


In [None]:

# Score the fitted model on the held-out split.
y_pred = model.predict(X_test_scaled)

# Report each error metric under its label, in a fixed order.
for metric_label, metric_fn in (
    ("Mean Absolute Error:", mean_absolute_error),
    ("Mean Squared Error:", mean_squared_error),
):
    print(metric_label, metric_fn(y_test, y_pred))

In [None]:

# pandas is needed to build a named feature row; imported here so the cell
# does not depend on an import made elsewhere.
import pandas as pd


# Function for prediction
def predict_yield(district_name, crop_name, area):
    """Predict total production for a district/crop pair over a given area.

    Uses the notebook-level ``label_enc_district``, ``label_enc_crop``,
    ``scaler`` and ``model`` objects, which must already be fitted.

    Parameters
    ----------
    district_name : str
        District name; surrounding whitespace is ignored.
    crop_name : str
        Crop name; surrounding whitespace is ignored.
    area : number
        Area to scale the predicted per-unit-area yield by.

    Returns
    -------
    The model's predicted yield multiplied by ``area``.

    Raises
    ------
    ValueError
        If the district or the crop is not among the encoder's known classes.
    """
    # The encoders are fitted on whitespace-stripped strings, so normalise
    # the lookup keys the same way before checking membership.
    district_key = str(district_name).strip()
    crop_key = str(crop_name).strip()

    # Check if the district and crop exist in the trained encoders
    if district_key not in label_enc_district.classes_:
        raise ValueError(f"District '{district_name}' not found in training data.")
    if crop_key not in label_enc_crop.classes_:
        raise ValueError(f"Crop '{crop_name}' not found in training data.")

    # Convert district and crop names to numerical labels
    district_num = label_enc_district.transform([district_key])[0]
    crop_num = label_enc_crop.transform([crop_key])[0]

    # Prepare input and scale it. The scaler is fitted on a DataFrame with
    # these column names, so pass a matching frame rather than a bare array;
    # this avoids scikit-learn's feature-name mismatch warning.
    input_features = pd.DataFrame(
        [[district_num, crop_num]],
        columns=['District_Code', 'Crop_Code'],
    )
    input_features_scaled = scaler.transform(input_features)

    # Predict yield per unit area, then scale by the requested area
    predicted_yield = model.predict(input_features_scaled)
    return (predicted_yield[0]) * area


In [None]:
# Save the model and scaler
import pickle

# Context managers guarantee each file is flushed and closed even if
# pickling fails; the original left both handles open.
with open("yieldmodel.pkl", "wb") as model_file:
    pickle.dump(model, model_file)
with open("scaler.pkl", "wb") as scaler_file:
    pickle.dump(scaler, scaler_file)

# NOTE(review): the label encoders are not persisted here, so a consumer of
# these two files cannot reproduce the district/crop codes on its own.
# Consider saving them as well.

In [None]:
# Smoke test: predicted production for one unit of area of Rice in ARIYALUR.
sample_prediction = predict_yield("ARIYALUR", "Rice", 1)
print(sample_prediction)