In [1]:
# Import libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

  from pandas.core.computation.check import NUMEXPR_INSTALLED


In [2]:
# Load files
data_path = ''
train = pd.read_csv(data_path + 'Train.csv')
test = pd.read_csv(data_path + 'Test.csv')
sample_submission = pd.read_csv(data_path + 'SampleSubmission.csv')
var_desc = pd.read_csv(data_path + 'VariableDescription.csv')

In [None]:
# Preview files
train.head()

In [None]:
test.head()

In [None]:
sample_submission.head()

### Univariate Disturbutions

In [None]:
for column in train.columns:
    try:
        sns.histplot(data=train, x=column, kde=True)
        plt.title(f'Univariate Distribution of {column}')
        plt.show()
    except Exception as e:
        print(f"An error occurred for column {column}: {e}")
        continue  # 

In [None]:
sns.pairplot(train)
plt.show()

In [None]:
train.info()

In [None]:
train.isnull().sum()

In [None]:
train.shape

### HANDLING MISSING VALUES

In [3]:
numerical_cols = train.select_dtypes(include=['number'])
train[numerical_cols.columns] = train[numerical_cols.columns].fillna(train[numerical_cols.columns].median())
train.isnull().sum()

ID                                       0
District                                 0
Block                                    0
CultLand                                 0
CropCultLand                             0
LandPreparationMethod                    0
CropTillageDate                          0
CropTillageDepth                         0
CropEstMethod                            0
RcNursEstDate                           83
SeedingSowingTransplanting               0
SeedlingsPerPit                          0
NursDetFactor                          289
TransDetFactor                         289
TransplantingIrrigationHours             0
TransplantingIrrigationSource          115
TransplantingIrrigationPowerSource     503
TransIrriCost                            0
StandingWater                            0
OrgFertilizers                        1335
Ganaura                                  0
CropOrgFYM                               0
PCropSolidOrgFertAppMethod            1337
NoFertilize

In [4]:
train.isnull().sum()
train['MineralFertAppMethod2'] = train['MineralFertAppMethod.1']
train = train.drop(columns='MineralFertAppMethod.1') 

mode_value = train['MineralFertAppMethod2'].mode()[0]
train['MineralFertAppMethod2'].fillna(mode_value, inplace=True)

mode_value = train['PCropSolidOrgFertAppMethod'].mode()[0]
train['PCropSolidOrgFertAppMethod'].fillna(mode_value, inplace=True)

mode_value = train['TransplantingIrrigationSource'].mode()[0]
train['TransplantingIrrigationSource'].fillna(mode_value, inplace=True)

mode_value = train['TransplantingIrrigationPowerSource'].mode()[0]
train['TransplantingIrrigationPowerSource'].fillna(mode_value, inplace=True)

train.isnull().sum()

ID                                       0
District                                 0
Block                                    0
CultLand                                 0
CropCultLand                             0
LandPreparationMethod                    0
CropTillageDate                          0
CropTillageDepth                         0
CropEstMethod                            0
RcNursEstDate                           83
SeedingSowingTransplanting               0
SeedlingsPerPit                          0
NursDetFactor                          289
TransDetFactor                         289
TransplantingIrrigationHours             0
TransplantingIrrigationSource            0
TransplantingIrrigationPowerSource       0
TransIrriCost                            0
StandingWater                            0
OrgFertilizers                        1335
Ganaura                                  0
CropOrgFYM                               0
PCropSolidOrgFertAppMethod               0
NoFertilize

In [6]:
columns_with_missing = ['NursDetFactor', 'TransDetFactor', 'OrgFertilizers', 'CropbasalFerts', 'FirstTopDressFert']

# Iterate through each column and impute missing values with the mode
for column in columns_with_missing:
    mode_value = train[column].mode()[0]  # Calculate the mode
    train[column].fillna(mode_value, inplace=True)
    
train.isnull().sum()

ID                                     0
District                               0
Block                                  0
CultLand                               0
CropCultLand                           0
LandPreparationMethod                  0
CropTillageDate                        0
CropTillageDepth                       0
CropEstMethod                          0
RcNursEstDate                         83
SeedingSowingTransplanting             0
SeedlingsPerPit                        0
NursDetFactor                          0
TransDetFactor                         0
TransplantingIrrigationHours           0
TransplantingIrrigationSource          0
TransplantingIrrigationPowerSource     0
TransIrriCost                          0
StandingWater                          0
OrgFertilizers                         0
Ganaura                                0
CropOrgFYM                             0
PCropSolidOrgFertAppMethod             0
NoFertilizerAppln                      0
CropbasalFerts  

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Assuming you have a DataFrame named 'df' with five date columns: 'Date1', 'Date2', 'Date3', 'Date4', 'Date5'

# Convert the date columns to datetime format (if not already done)
date_columns = ['CropTillageDate', 'RcNursEstDate', 'SeedingSowingTransplanting', 'Harv_date', 'Threshing_date']
for col in date_columns:
    train[col] = pd.to_datetime(train[col])

# Calculate the time differences between the date columns
for i in range(len(date_columns)):
    for j in range(i + 1, len(date_columns)):
        time_diff = (train[date_columns[j]] - train[date_columns[i]]).dt.days
        print(f"Time difference between {date_columns[i]} and {date_columns[j]}:")
        print(time_diff.describe())
        print("\n")

# Visualize the relationships between the date columns
plt.figure(figsize=(12, 8))
for i in range(len(date_columns)):
    for j in range(i + 1, len(date_columns)):
        plt.scatter(train[date_columns[i]], train[date_columns[j]], label=f"{date_columns[i]} vs. {date_columns[j]}")

plt.xlabel("Date")
plt.ylabel("Date")
plt.title("Relationships between Date Columns")
plt.legend()
plt.grid(True)
plt.show()

In [None]:
date_columns = ['CropTillageDate', 'RcNursEstDate', 'Harv_date', 'Threshing_date']
for col in date_columns:
    train[col] = pd.to_datetime(train[col])

# Create line plots for the four date columns
plt.figure(figsize=(12, 6))

for col in date_columns:
    plt.plot(train[col], label=col)

plt.xlabel("Date")
plt.ylabel("Value")
plt.title("Date Columns Over Time")
plt.legend()
plt.grid(True)
plt.show()

In [7]:
train['CropTillageDate'] = pd.to_datetime(train['CropTillageDate'])

# Create a mask to identify missing values in 'RcNursEstDate'
missing_mask = pd.isnull(train['RcNursEstDate'])

# Subtract 15 days from 'CropTillageDate' only for missing values in 'RcNursEstDate'
train.loc[missing_mask, 'RcNursEstDate'] = train.loc[missing_mask, 'CropTillageDate'] - pd.to_timedelta(15, unit='D')

train.isnull().sum()

ID                                    0
District                              0
Block                                 0
CultLand                              0
CropCultLand                          0
LandPreparationMethod                 0
CropTillageDate                       0
CropTillageDepth                      0
CropEstMethod                         0
RcNursEstDate                         0
SeedingSowingTransplanting            0
SeedlingsPerPit                       0
NursDetFactor                         0
TransDetFactor                        0
TransplantingIrrigationHours          0
TransplantingIrrigationSource         0
TransplantingIrrigationPowerSource    0
TransIrriCost                         0
StandingWater                         0
OrgFertilizers                        0
Ganaura                               0
CropOrgFYM                            0
PCropSolidOrgFertAppMethod            0
NoFertilizerAppln                     0
CropbasalFerts                        0


In [8]:
train = pd.get_dummies(train, columns=["District", "CropEstMethod", "TransplantingIrrigationSource"], prefix=["District", "CropEstMethod", "TransplantingIrrigationSource"])


additional_categorical_columns = ["TransplantingIrrigationPowerSource", "PCropSolidOrgFertAppMethod", "MineralFertAppMethod", "MineralFertAppMethod2", "Harv_method", "Threshing_method", "Stubble_use"]

train = pd.get_dummies(train, columns=additional_categorical_columns, prefix=additional_categorical_columns)
train.isnull().sum()


ID                            0
Block                         0
CultLand                      0
CropCultLand                  0
LandPreparationMethod         0
                             ..
Harv_method_machine           0
Threshing_method_hand         0
Threshing_method_machine      0
Stubble_use_burned            0
Stubble_use_plowed_in_soil    0
Length: 67, dtype: int64

In [13]:
train = pd.get_dummies(train, columns=["Block"], prefix=["Block"])


In [19]:
import numpy as np
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.preprocessing import StandardScaler

# Split the data into training and testing sets
X = train.drop(['ID', 'Yield'], axis = 1)
X =X.select_dtypes(include=np.number)
y = train.Yield
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features (recommended for KNN)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Define the hyperparameters to tune
param_grid = {
    'n_neighbors': [3, 5, 7, 9],  # Adjust the number of neighbors
    'weights': ['uniform', 'distance'],  # Adjust the weight function
    'p': [1, 2]  # 1 for Manhattan distance, 2 for Euclidean distance
}

# Create the KNN regressor model
knn = KNeighborsRegressor()

# Define the scoring metric (e.g., mean squared error)
scoring = make_scorer(mean_squared_error, greater_is_better=False)

# Perform grid search to find the best hyperparameters
grid_search = GridSearchCV(knn, param_grid, cv=5, scoring=scoring, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Get the best hyperparameters
best_params = grid_search.best_params_

# Create a KNN model with the best hyperparameters
best_knn = KNeighborsRegressor(**best_params)

# Fit the model on the training data
best_knn.fit(X_train, y_train)

# Make predictions on the test data
y_pred = best_knn.predict(X_test)

# Evaluate the model
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"Best KNN model's RMSE: {rmse}")




AttributeError: 'NoneType' object has no attribute 'split'

In [11]:



X = train.drop(['ID', 'Yield'], axis = 1)
#X =X.select_dtypes(include=np.number)
y = train.Yield

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 1234)

# Instantiate model
model = RandomForestRegressor(random_state = 1234)

# Fit model
model.fit(X_train.fillna(0), y_train)

# Make predictions
preds = model.predict(X_test.fillna(0))

# Measure model performance
mean_squared_error(y_test, preds, squared=False)

372.8991448298269

In [None]:
# Make predictions on the Zindi test set
test_df = test[X.columns]
preds = model.predict(test_df.fillna(0))

# Create submisiion file to be uploaded to Zindi for scoring
sub = pd.DataFrame({'ID': test.ID, 'Yield': preds})
sub.to_csv('BenchmarkSubmission.csv', index = False)

sub.head()

### FEATURE ENGINEERING

In [None]:
train["NoFertilizerAppln_per_Acre"] = train["NoFertilizerAppln"] / train["Acre"]
train["StandingWater_per_Acre"] = train["StandingWater"] / train["Acre"]
train['CropCultLand_Percentage'] = (train['CropCultLand'] / train['CultLand']) * 100


In [None]:

train["RcNursEstDate"] = pd.to_datetime(train["RcNursEstDate"])
train["CropTillageDate"] = pd.to_datetime(train["CropTillageDate"])
train["Harv_date"] = pd.to_datetime(train["Harv_date"])
train["SeedingSowingTransplanting"] = pd.to_datetime(train["SeedingSowingTransplanting"])

train["CropTillage_to_RcNursEstDuration"] = (train["RcNursEstDate"] - train["CropTillageDate"]).dt.days
train["RcNursEst_to_HarvestDuration"] = (train["Harv_date"] - train["RcNursEstDate"]).dt.days
train["SeedlingTransplanting_to_CropTillageDate"] = (train["SeedingSowingTransplanting"] - train["CropTillageDate"]).dt.days

In [None]:
# For test dataset
test["NoFertilizerAppln_per_Acre"] = test["NoFertilizerAppln"] / test["Acre"]
test["StandingWater_per_Acre"] = test["StandingWater"] / test["Acre"]
#test["Acre_Yield"] = test["Acre"] * test["Yield"]
test['CropCultLand_Percentage'] = (test['CropCultLand'] / test['CultLand']) * 100

# Convert date columns to datetime format
test["RcNursEstDate"] = pd.to_datetime(test["RcNursEstDate"])
test["CropTillageDate"] = pd.to_datetime(test["CropTillageDate"])
test["Harv_date"] = pd.to_datetime(test["Harv_date"])

# Calculate date differences
test["CropTillage_to_RcNursEstDuration"] = (test["RcNursEstDate"] - test["CropTillageDate"]).dt.days
test["RcNursEst_to_HarvestDuration"] = (test["Harv_date"] - test["RcNursEstDate"]).dt.days

In [None]:
train = pd.get_dummies(train, columns=["District", "CropEstMethod", "TransplantingIrrigationSource"], prefix=["District", "CropEstMethod", "TransplantingIrrigationSource"])

In [None]:

additional_categorical_columns = ["TransplantingIrrigationPowerSource", "PCropSolidOrgFertAppMethod", "MineralFertAppMethod", "MineralFertAppMethod2", "Harv_method", "Threshing_method", "Stubble_use"]

train = pd.get_dummies(train, columns=additional_categorical_columns, prefix=additional_categorical_columns)


In [None]:
train.isnull().sum()

In [None]:
# Split data for training and local testing

X = train.drop(['ID', 'Yield'], axis = 1)
X =X.select_dtypes(include=np.number)
y = train.Yield

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 1234)

# Instantiate model
model = RandomForestRegressor(random_state = 1234)

# Fit model
model.fit(X_train.fillna(0), y_train)

# Make predictions
preds = model.predict(X_test.fillna(0))

# Measure model performance
mean_squared_error(y_test, preds, squared=False)

In [None]:
# Make predictions on the Zindi test set
test_df = test[X.columns]
preds = model.predict(test_df.fillna(0))

# Create submisiion file to be uploaded to Zindi for scoring
sub = pd.DataFrame({'ID': test.ID, 'Yield': preds})
sub.to_csv('BenchmarkSubmission.csv', index = False)

sub.head()

In [None]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
#from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

In [None]:

X = train.drop(['ID', 'Yield'], axis = 1)
X =X.select_dtypes(include=np.number)
y = train.Yield
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
decision_tree_model = DecisionTreeClassifier()
decision_tree_model.fit(X_train, y_train)

In [None]:
adaboost_model = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=1), n_estimators=50)
adaboost_model.fit(X_train, y_train)


In [None]:
xgboost_model = XGBClassifier()
xgboost_model.fit(X_train, y_train)


In [None]:
lightgbm_model = LGBMClassifier()
lightgbm_model.fit(X_train, y_train)


In [None]:
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(X_train, y_train)


In [None]:
models = [decision_tree_model, adaboost_model, xgboost_model, lightgbm_model, knn_model]
model_names = ['Decision Tree', 'AdaBoost', 'XGBoost', 'LightGBM', 'KNN']

In [None]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
#from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1234)

# Define a list of models to train and evaluate
models = [
    ("Random Forest", RandomForestRegressor(random_state=1234)),
    ("Decision Tree", DecisionTreeRegressor(random_state=1234)),
    ("Gradient Boosting", GradientBoostingRegressor(random_state=1234)),
   # ("XGBoost", XGBRegressor(random_state=1234)),
    ("AdaBoost", AdaBoostRegressor(random_state=1234)),
    ("LightGBM", LGBMRegressor(random_state=1234)),
    ("K-Nearest Neighbors", KNeighborsRegressor(n_neighbors=5)),

   
]

# Train and evaluate each model
for model_name, model in models:
    # Fit the model
    model.fit(X_train.fillna(0), y_train)

    # Make predictions
    preds = model.predict(X_test.fillna(0))

    # Measure model performance
    rmse = mean_squared_error(y_test, preds, squared=False)
    print(f"{model_name} - Root Mean Squared Error: {rmse:.2f}")

In [None]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
#from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, GridSearchCV

# Assuming you have your data in X and y

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1234)

# Define a list of models to train and evaluate
models = [
    ("Random Forest", RandomForestRegressor(random_state=1234)),
    ("Decision Tree", DecisionTreeRegressor(random_state=1234)),
    ("Gradient Boosting", GradientBoostingRegressor(random_state=1234)),
    #("XGBoost", XGBRegressor(random_state=1234)),
    ("AdaBoost", AdaBoostRegressor(random_state=1234)),
    ("LightGBM", LGBMRegressor(random_state=1234)),
   # ("K-Nearest Neighbors", KNeighborsRegressor(n_neighbors=5)),
]

# Define hyperparameters for tuning
param_grid = {
   "Random Forest": {
        "n_estimators": [200, 300, 400, 500],
        "max_depth": [None, 10, 20, 30],
        "max_features": ['auto', 'sqrt', 'log2', None, 0.5, 0.7],  
        "max_samples": [0.5, 0.7, 0.9, None],  
        "min_samples_split": [2, 5, 10],  
    },
    "Decision Tree": {
        "max_depth": [None, 10, 20, 30],
        "min_samples_split": [2, 5, 10],
        "min_samples_leaf": [1, 2, 4],
        "max_features": ['auto', 'sqrt', 'log2', None],
        "criterion": ["gini", "entropy"],
    },
    "Gradient Boosting": {
        "n_estimators": [100, 200, 300],
        "learning_rate": [0.01, 0.1, 0.2],
         "max_depth": [3, 4, 5, 6],  
        "max_bin": [100, 200, 255, 300] 
    },
    "XGBoost": {
        "n_estimators": [100, 200, 300],
        "learning_rate": [0.01, 0.1, 0.2],
        "max_depth": [3, 4, 5, 6],  
        "max_bin": [100, 200, 255, 300]  
    },
    "AdaBoost": {
        "n_estimators": [50, 100, 200],
        "learning_rate": [0.01, 0.1, 0.2],
         "max_depth": [3, 4, 5, 6],  
        "max_bin": [100, 200, 255, 300],
    },
    "LightGBM": {
        "n_estimators": [100, 200, 300],
        "learning_rate": [0.01, 0.1, 0.2],
        "max_depth": [3, 4, 5, 6],  
        "max_bin": [100, 200, 255, 300], 
    },
    "K-Nearest Neighbors": {
        "n_neighbors": [3, 5, 7],
    },
}

# Train and evaluate each model with hyperparameter tuning
for model_name, model in models:
    param_grid[model_name]
    grid_search = GridSearchCV(model, param_grid[model_name], cv=5, scoring="neg_mean_squared_error")
    grid_search.fit(X_train.fillna(0), y_train)

    # Get the best estimator from the grid search
    best_model = grid_search.best_estimator_

    # Make predictions
    preds = best_model.predict(X_test.fillna(0))

    # Measure model performance
    rmse = mean_squared_error(y_test, preds, squared=False)
    print(f"{model_name} - Root Mean Squared Error: {rmse:.2f}")


In [None]:
columns_to_convert = ['LandPreparationMethod', 'NursDetFactor', 'TransDetFactor', 
 'OrgFertilizers' ,'CropbasalFerts','FirstTopDressFert' ]

# Create dummy variables for the specified columns
for column in columns_to_convert:
    categories = train[column].str.split(expand=True)
    categories = pd.get_dummies(categories, prefix=f'{column}', prefix_sep='_')
    train = pd.concat([train, categories], axis=1)
    #train.drop(column, axis=1, inplace=True)

In [None]:
non_numeric_columns = train.drop(columns=['ID']).select_dtypes(exclude=['number']).columns

# Convert non-numerical columns to categorical data type
train[non_numeric_columns] = train[non_numeric_columns].astype('category')

In [None]:
train = pd.get_dummies(train, columns=non_numeric_columns, prefix=non_numeric_columns)

In [None]:
train.to_excel('output.xlsx', index=False, sheet_name='Sheet2')

In [None]:
X = train.drop(['ID', 'Yield'], axis=1)

# Extract the target variable 'Yield'
y = train['Yield']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1234)

# Instantiate the model
model = RandomForestRegressor(random_state=1234)

# Fit the model
model.fit(X_train.fillna(0), y_train)

# Make predictions
preds = model.predict(X_test.fillna(0))

# Measure model performance
rmse = mean_squared_error(y_test, preds, squared=False)
print("Root Mean Squared Error:", rmse)

In [None]:
columns_to_convert = ['NursDetFactor']

# Create dummy variables for the specified columns
for column in columns_to_convert:
    # Split the values in the column by spaces
    categories = train[column].str.split(expand=True)

    # Create dummy variables with NaNs (missing values) preserved
    categories = pd.get_dummies(categories, prefix=f'{column}', prefix_sep='_', dummy_na=True)

    # Concatenate the dummy variables with the original DataFrame
    train = pd.concat([train, categories], axis=1)

    # Drop the original column
    train.drop(column, axis=1, inplace=True)


In [None]:
train.isnull().sum()

In [None]:
train.to_excel('output.xlsx', index=False, sheet_name='Sheet1')

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

sns.histplot(data=train, x='TransplantingIrrigationSource', bins=10)  # You can adjust the number of bins as needed
plt.xlabel('Transplanting Irrigation Source')
plt.ylabel('Frequency')
plt.title('Histogram of Transplanting Irrigation Source')
plt.show()

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer

imputed_loans_df = imputed_loans_df.copy()

grade_mapping = {'A1': 1, 'A2': 2, 'A3': 3, 'B1': 4, 'B2': 5, 'B3': 6, 'C1': 7, 'C2': 8, 'C3': 9, 'C4': 10, 'C5': 11, 'C6': 12}
imputed_loans_df['display_grade'] = imputed_loans_df['display_grade'].map(grade_mapping)

valid_rows = imputed_loans_df.dropna(subset=['display_grade', 'debt to income ratio'])

# Create X (feature) and y (target) for training the model
X_train = valid_rows[['debt to income ratio']]
y_train = valid_rows['display_grade']

# Initialize and fit a linear regression model
regression_model = LinearRegression()
regression_model.fit(X_train, y_train)

# Extract rows with missing 'display_grade' values
missing_grade_rows = imputed_loans_df[imputed_loans_df['display_grade'].isnull()]

# Impute missing 'display_grade' values based on the regression model
if not missing_grade_rows.empty:
   missing_grade_predictions = regression_model.predict(missing_grade_rows[['debt to income ratio']])
   missing_grade_rows['display_grade'] = missing_grade_predictions.round().astype(int)
# Merge the imputed data back with the original dataset

imputed_loans_df.update(missing_grade_rows)
# Print the sum of null values to veri

imputed_loans_df['display_grade'].isnull().sum()


In [None]:
train['MineralFertAppMethod2'] = train['MineralFertAppMethod.1']
train = train.drop(columns='MineralFertAppMethod.1')

In [None]:
columns_to_convert = ['LandPreparationMethod', 'NursDetFactor', 'TransDetFactor', 
, 'OrgFertilizers' ,'CropbasalFerts','FirstTopDressFert' ]

# Create dummy variables for the specified columns
for column in columns_to_convert:
    categories = train[column].str.split(expand=True)
    categories = pd.get_dummies(categories, prefix=f'{column}', prefix_sep='_')
    train = pd.concat([train, categories], axis=1)
    train.drop(column, axis=1, inplace=True)

In [None]:
train.isnull().sum()

In [None]:
column_names = train.columns.tolist()

# Print the list of all column names
print("List of all column names:")
print(column_names)

In [None]:
columns_with_missing_values = train.columns[train.isnull().all()]

# Calculate the number of missing values in each of these columns
missing_value_counts = train[columns_with_missing_values].isnull().sum()

# Combine the column names and their missing value counts into a DataFrame
missing_info = pd.DataFrame({'Column Name': columns_with_missing_values, 'Missing Values Count': missing_value_counts})

# Print the columns and their missing value counts
print(missing_info)

In [None]:
train.head()

In [None]:
df_first_10_columns = train.iloc[:, :25]
df_first_10_columns

In [None]:
# Split data for training and local testing

X = train.drop(['ID', 'Yield'], axis = 1)
X =X.select_dtypes(include=np.number)
y = train.Yield

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 1234)

# Instantiate model
model = RandomForestRegressor(random_state = 1234)

# Fit model
model.fit(X_train.fillna(0), y_train)

# Make predictions
preds = model.predict(X_test.fillna(0))

# Measure model performance
mean_squared_error(y_test, preds, squared=False)

In [None]:
# Make predictions on the Zindi test set
test_df = test[X.columns]
preds = model.predict(test_df.fillna(0))

# Create submisiion file to be uploaded to Zindi for scoring
sub = pd.DataFrame({'ID': test.ID, 'Yield': preds})
sub.to_csv('BenchmarkSubmission.csv', index = False)

sub.head()