In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

In [None]:

# Read the training data from disk into a DataFrame.
train_df = pd.read_csv('train.csv')

# Preview the top five rows as the cell output.
train_df.head(5)

In [None]:


# Preprocessing: encode the target labels, group the columns by dtype, and fit
# a single ColumnTransformer on the training frame.

# Map the categorical 'Target' column to numerical values
target_mapping = {'low': 0, 'medium': 1, 'high': 2}

# Only map while the column still holds the raw labels. Mapping a second time
# (e.g. when this cell is re-run) would turn every already-encoded value into NaN.
if not pd.api.types.is_numeric_dtype(train_df['Target']):
    unknown_labels = set(train_df['Target'].dropna().unique()) - set(target_mapping)
    if unknown_labels:
        raise ValueError(
            f"Unexpected Target labels: {sorted(map(str, unknown_labels))}"
        )
    train_df['Target'] = train_df['Target'].map(target_mapping)

# Column groups. 'Target' is pinned to the categorical group (and kept out of the
# numerical one) regardless of its current dtype, so the grouping is the same on
# every run of this cell. It stays in the transformer on purpose: the later cells
# expect the encoder to emit 'Target_0' / 'Target_1' / 'Target_2' and drop those
# columns by name before modelling.
numerical_features = (
    train_df.select_dtypes(include=['int64', 'float64'])
    .columns.drop('Target', errors='ignore')
)
object_columns = set(train_df.select_dtypes(include=['object']).columns)
categorical_features = pd.Index(
    [col for col in train_df.columns if col in object_columns or col == 'Target']
)

# Numerical features: fill gaps with the column mean, then standardise.
numerical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Categorical features: fill gaps with the most frequent value, then one-hot
# encode. Categories unseen during fit are encoded as all zeros.
categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine the numerical and categorical pipelines into a single ColumnTransformer.
# sparse_threshold=0 forces a dense array: the following cells wrap the result in
# pd.DataFrame(..., columns=...), which does not accept a scipy sparse matrix.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_pipeline, numerical_features),
        ('cat', categorical_pipeline, categorical_features)
    ],
    sparse_threshold=0,
)

# Fit and transform the data
train_df_preprocessed = preprocessor.fit_transform(train_df)


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Column labels for the preprocessor output: the numerical columns first, then
# the one-hot columns reported by the fitted encoder.
onehot_encoder = preprocessor.named_transformers_['cat'].named_steps['onehot']
transformed_columns = list(numerical_features) + list(
    onehot_encoder.get_feature_names_out(categorical_features)
)

# Wrap the transformed array in a DataFrame, attach the encoded target as a
# single 'Target' column, and discard its one-hot copies.
train_df_transformed = (
    pd.DataFrame(train_df_preprocessed, columns=transformed_columns)
    .assign(Target=train_df['Target'])
    .drop(columns=["Target_0", "Target_1", "Target_2"])
)

# The column count is reported before and after computing the correlations.
print(train_df_transformed.columns.shape)
correlation_matrix = train_df_transformed.corr()
print(train_df_transformed.columns.shape)

# Annotated heatmap of the full correlation matrix.
fig, ax = plt.subplots(figsize=(20, 15))
sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap='coolwarm', ax=ax)
ax.set_title('Correlation Matrix Heatmap')
plt.show()

In [None]:
# Column count before pruning.
print(train_df_transformed.columns.shape)

# Columns with at most one distinct (non-null) value carry no information;
# remember their names in columns_to_drop and remove them from the frame.
distinct_counts = train_df_transformed.nunique()
columns_to_drop = distinct_counts[distinct_counts <= 1].index.tolist()
train_df_transformed = train_df_transformed.drop(columns=columns_to_drop)

# Column count after pruning.
print(train_df_transformed.columns.shape)

# Heatmap of every column's correlation with 'Target', strongest first. It uses
# the correlation_matrix computed earlier, i.e. before the pruning above.
target_correlations = correlation_matrix[["Target"]].sort_values(by="Target", ascending=False)
fig, ax = plt.subplots(figsize=(20, 15))
sns.heatmap(target_correlations, annot=True, fmt=".2f", cmap='coolwarm', ax=ax)
ax.set_title('Correlation Matrix Heatmap')
plt.show()

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Separate the label column from the feature columns.
y = train_df_transformed['Target']
X = train_df_transformed.drop(columns=['Target'])
print(X.shape)

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
It looks like you have a comprehensive dataset related to agricultural zones, farming, and related attributes. You have multiple dataframes and arrays, including preprocessed data, transformed data, and predictions. Here's a summary of what you have:

1. **DataFrames:**
    - `df_majority`: Contains the majority class data.
    - `df_minority_0`: Contains the minority class data for class 0.
    - `df_minority_2`: Contains the minority class data for class 2.
    - `df_minority_upsampled_0`: Contains the upsampled minority class data for class 0.
    - `df_minority_upsampled_2`: Contains the upsampled minority class data for class 2.
    - `df_upsampled`: Contains the combined upsampled data.
    - `results`: Contains the results of predictions with columns `UID` and `Target`.
    - `test_df`: Contains the test dataset.
    - `test_df_transformed`: Contains the transformed test dataset.
    - `train_df`: Contains the training dataset.
    - `train_df_transformed`: Contains the transformed training dataset.

2. **Arrays:**
    - `test_df_preprocessed`: Preprocessed test dataset as a numpy array.
    - `test_predictions`: Predictions for the test dataset.
    - `train_df_preprocessed`: Preprocessed training dataset as a numpy array.
    - `y`: Target values for the combined upsampled data.
    - `y_pred`: Predictions for the combined upsampled data.
    - `y_test`: Target values for the test dataset.
    - `y_train`: Target values for the training dataset.

3. **Other Objects:**
    - `inverse_target_mapping`: Dictionary mapping numerical target values to their respective classes.
    - `numerical_features`: Index object containing the names of numerical features.
    - `numerical_pipeline`: Pipeline object for preprocessing numerical features.
    - `preprocessor`: ColumnTransformer object for preprocessing both numerical and categorical features.
    - `report`: String containing the classification report.

Given this information, you can perform various analyses and operations, such as:

- **Model Evaluation:** Use the classification report (`report`) to evaluate the performance of your model.
- **Data Analysis:** Explore the different dataframes to understand the distribution of features and target classes.
- **Preprocessing:** Use the `preprocessor` to preprocess new data.
- **Predictions:** Use the `test_predictions` to analyze the model's performance on the test dataset.

If you have any specific tasks or analyses you would like to perform, please let me know!

In [None]:
from sklearn.utils import resample
from sklearn.ensemble import GradientBoostingClassifier

# Gradient-boosted classifier; it is fitted further down on the class-balanced
# training rows.
clf = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)

# Hold out the evaluation rows BEFORE any resampling. Upsampling with replacement
# duplicates minority rows; if the split happened afterwards, copies of the same
# row would end up in both the training and the test set and inflate the scores.
train_rows, test_rows = train_test_split(train_df_transformed, test_size=0.2, random_state=42)

# Separate majority and minority classes (training rows only).
# NOTE(review): class 1 is treated as the majority class - confirm against the
# class counts of the data.
df_majority = train_rows[train_rows.Target == 1]
df_minority_0 = train_rows[train_rows.Target == 0]
df_minority_2 = train_rows[train_rows.Target == 2]

# Upsample minority class 0
df_minority_upsampled_0 = resample(df_minority_0,
                                   replace=True,     # sample with replacement
                                   n_samples=len(df_majority),    # to match majority class
                                   random_state=42) # reproducible results

# Upsample minority class 2
df_minority_upsampled_2 = resample(df_minority_2,
                                   replace=True,     # sample with replacement
                                   n_samples=len(df_majority),    # to match majority class
                                   random_state=42) # reproducible results

# Combine majority class with upsampled minority classes
df_upsampled = pd.concat([df_majority, df_minority_upsampled_0, df_minority_upsampled_2])

# Display new class counts
print(df_upsampled.Target.value_counts())

# Features and target of the balanced training data
X = df_upsampled.drop(columns=['Target'])
y = df_upsampled['Target']
X_train, y_train = X, y

# The held-out rows keep their original, un-resampled class distribution
X_test = test_rows.drop(columns=['Target'])
y_test = test_rows['Target']

# Train on the balanced rows, predict the held-out rows
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Evaluate the classifier
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print(report)

In [None]:

# Define the inverse target mapping (numeric class -> original label)
inverse_target_mapping = {v: k for k, v in target_mapping.items()}

test_df = pd.read_csv('test.csv')

# The preprocessor was fitted on a frame that contained a 'Target' column, so
# transform() needs that column to be present. A placeholder value of 1 is
# supplied on a copy; the columns derived from it are discarded below and never
# reach the classifier.
test_df_preprocessed = preprocessor.transform(test_df.assign(Target=1))

# Convert the transformed data back to a DataFrame
onehot_encoder = preprocessor.named_transformers_['cat'].named_steps['onehot']
test_df_transformed = pd.DataFrame(
    test_df_preprocessed,
    columns=numerical_features.tolist()
    + list(onehot_encoder.get_feature_names_out(categorical_features)),
)

# Keep exactly the columns the classifier was trained on, in the same order.
# Reading them from the fitted model avoids depending on the notebook variable
# `columns_to_drop`, which no longer lists the pruned columns if the pruning
# cell has been run more than once.
trained_features = getattr(clf, 'feature_names_in_', None)
if trained_features is not None:
    test_df_transformed = test_df_transformed.loc[:, list(trained_features)]
else:
    # Fallback when the estimator does not expose feature names: apply the same
    # drops that were applied to the training frame.
    obsolete_columns = ["Target_0", "Target_1", "Target_2"] + [
        col for col in columns_to_drop if col in test_df_transformed.columns
    ]
    test_df_transformed = test_df_transformed.drop(columns=obsolete_columns)

# Make predictions on the test data
test_predictions = clf.predict(test_df_transformed)

# Create a results DataFrame: one row per test UID with the predicted label
results = pd.DataFrame({
    "UID": test_df["UID"],
    "Target": pd.Series(test_predictions, index=test_df.index).map(inverse_target_mapping),
})

# Save the results to a new CSV file
results.to_csv('test_results.csv', index=False)

print("Predictions saved to test_results.csv")