In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt


In [None]:

# Read the training data; the path is resolved relative to the working directory.
TRAIN_CSV_PATH = 'train.csv'
train_df = pd.read_csv(TRAIN_CSV_PATH)


In [None]:
# Share of missing values per column, expressed as a percentage.
null_percentage = train_df.isnull().mean() * 100

# Columns with the most missing data come first.
sorted_null_percentage = null_percentage.sort_values(ascending=False)

# One "<column>: <percentage>%" line per column, rounded to two decimals.
for column, pct in sorted_null_percentage.items():
    print(f"{column}: {pct:.2f}%")

In [None]:
# Columns to discard because of their high share of null values.
columns_with_high_nulls = [
    'FarmClassification',
    'PerimeterGuardPlantsArea',
    'UndergroundStorageSqft',
    'FieldZoneLevel',
    'HarvestStorageSqft',
    'HasGreenHouse',
    'CropFieldConfiguration',
    'FieldConstructionType',
    'CultivatedAndWildArea',
    'FieldShadeCover',
    'ReservoirType',
    'TotalReservoirSize',
    'ReservoirWithFilter',
    'HasPestControl',
    'TaxOverdueYear',
    'TaxOverdueStatus',
    'FarmShedAreaSqft',
    'TotalAreaSqft',
    'PrimaryCropAreaSqft2',
    'PrimaryCropAreaSqft',
    'NumberGreenHouses',
    'PartialIrrigationSystemCount',
    'NaturalLakePresence',
]

# How many columns are about to go, and how many the frame has right now.
print(len(columns_with_high_nulls))
print(train_df.shape[1])

# Rebind train_df to a copy without the sparse columns, then show what is left.
train_df = train_df.drop(columns=columns_with_high_nulls)
print(train_df.columns)
print(train_df.shape[1])


In [None]:

# Report the row count, then preview the first rows of the frame.
print(f"Total number of rows: {train_df.shape[0]}")
train_df.head()

In [None]:
# Show the columns currently present in the frame.
print(train_df.columns)

# Candidate columns for median imputation.
# NOTE(review): this list and `mean_fill_cols` below are both reassigned in the
# next cell before any imputation runs, so these definitions are never applied
# as written here.
median_fill_cols = [
    'AgriculturalPostalZone', 'AgricultureZoningCode', 'CropSpeciesVariety',
    'FarmVehicleCount', 'DistrictId', 'FarmingCommunityId', 'FarmingUnitCount',
    'FieldEstablishedYear', 'HarvestProcessingType', 'LandUsageType',
    'SoilFertilityType', 'StorageAndFacilityCount',
    'NumberOfFarmingZones', 'OtherZoningCode', 'RawLocationId', 'NationalRegionCode',
    'TownId', 'TypeOfIrrigationSystem', 'ValuationYear', 'MainIrrigationSystemCount',
    'WaterAccessPoints', 'WaterAccessPointsCalc', 'WaterReservoirCount',
]

# Candidate columns for mean imputation (see the note above).
mean_fill_cols = [
    'CultivatedAreaSqft1', 'FarmEquipmentArea',
    'FieldSizeSqft', 'Latitude', 'Longitude',
    'TaxAgrarianValue', 'TaxLandValue', 'TotalCultivatedAreaSqft',
    'TotalTaxAssessed', 'TotalValue',
]

# Encode the string target labels as integers.
target_mapping = {'low': 0, 'medium': 1, 'high': 2}

# Only map while the column still holds raw labels. Mapping an already-encoded
# column would turn every value into NaN (0/1/2 are not keys of the mapping),
# which made re-running this cell silently destroy the target.
target_already_encoded = (
    train_df['Target'].dropna().isin(list(target_mapping.values())).all()
)
if not target_already_encoded:
    train_df["Target"] = train_df['Target'].map(target_mapping)
	

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Reference list of column names; no statement in this cell reads it.
dropped_columns = ['ReservoirType', 'HasPestControl', 'TotalReservoirSize', 'WaterReservoirCount',
                   'FarmingCommunityId', 'PrimaryCropAreaSqft', 'FieldConstructionType',
                   'HarvestStorageSqft', 'NumberGreenHouses', 'PartialIrrigationSystemCount',
                   'FarmShedAreaSqft', 'PrimaryCropAreaSqft2', 'UndergroundStorageSqft',
                   'HarvestProcessingType', 'ReservoirWithFilter', 'FarmingUnitCount',
                   'FarmEquipmentArea', 'SoilFertilityType', 'PerimeterGuardPlantsArea',
                   'TaxOverdueYear', 'CultivatedAndWildArea', 'NaturalLakePresence',
                   'FarmVehicleCount', 'FarmClassification', 'TaxOverdueStatus', 'FieldShadeCover',
                   'FieldZoneLevel', 'OtherZoningCode', 'NumberOfFarmingZones', 'CropFieldConfiguration',
                   'TypeOfIrrigationSystem', 'HasGreenHouse', 'TotalAreaSqft']

# Columns whose missing values are replaced by the column mean.
mean_fill_cols = ['AgricultureZoningCode', 'CropSpeciesVariety', 'CultivatedAreaSqft1',
                  'DistrictId', 'FieldEstablishedYear', 'LandUsageType', 'Latitude', 'Longitude',
                  'MainIrrigationSystemCount', 'NationalRegionCode', 'RawLocationId',
                  'StorageAndFacilityCount', 'TaxAgrarianValue', 'TaxLandValue', 'TotalCultivatedAreaSqft',
                  'TotalTaxAssessed', 'TotalValue', 'ValuationYear', 'WaterAccessPoints',
                  'WaterAccessPointsCalc']

# Columns whose missing values are replaced by the column median.
median_fill_cols = ['FieldSizeSqft', 'TownId']

# Fill missing values with mean for symmetrically distributed data
train_df[mean_fill_cols] = train_df[mean_fill_cols].fillna(train_df[mean_fill_cols].mean())

# Fill missing values with median for skewed data
train_df[median_fill_cols] = train_df[median_fill_cols].fillna(train_df[median_fill_cols].median())

# Correlate only numeric/bool columns. DataFrame.corr() raises on non-numeric
# columns in pandas >= 2.0, whereas older versions skipped them silently;
# selecting the dtypes explicitly gives the same matrix on every version.
correlation_matrix = train_df.select_dtypes(include=['number', 'bool']).corr()

# Create a heatmap
plt.figure(figsize=(20, 15))
sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap='coolwarm')
plt.title('Correlation Matrix Heatmap')


In [None]:
# Sort correlation values with respect to the target variable
sorted_correlation = correlation_matrix["Target"].sort_values().to_frame()

# Identify highly correlated column pairs
highest_correlatedcolumes = [[col1, col2, correlation_matrix.loc[col1, col2]] 
                             for col1 in sorted_correlation.index 
                             for col2 in sorted_correlation.index 
                             if col1 != col2 and (correlation_matrix.loc[col1, col2] > 0.95 or correlation_matrix.loc[col1, col2] < -0.7)]

seen_pairs = set()
for col1, col2, corr in highest_correlatedcolumes:
    if (col2, col1) not in seen_pairs:
        print(f"{col1} and {col2} --->{corr:.2f}")
        seen_pairs.add((col1, col2))

# Drop irrelevant and constant columns
unrelaved_features = ["UID", "NationalRegionCode"]
columns_to_drop = [col for col in train_df.columns if train_df[col].nunique() <= 1]
train_df.drop(columns=columns_to_drop, inplace=True)

print(train_df.columns.shape)
print(columns_to_drop)
#plt.show()


In [None]:

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Columns excluded from the model inputs.
columns_to_drop = ['UID', 'NationalRegionCode', 'TaxLandValue', 'WaterAccessPointsCalc',
                   'RawLocationId', 'WaterAccessPoints', 'CultivatedAreaSqft1']

# Drop the listed columns if they exist in the DataFrame. errors="ignore"
# matters here: a listed column may already be gone (removed by the
# constant-column drop in the previous cell, or because this cell is being
# re-run), and that previously raised KeyError.
train_df = train_df.drop(columns=columns_to_drop, errors="ignore")

# Split the data into features and target
X = train_df.drop(columns=['Target'])
y = train_df['Target']
print(X.shape)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
from sklearn.utils import resample
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Number of boosting rounds; fed into the classifier parameters below.
n_es = 200

# Set up classifier with best parameters
best_params = {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': n_es, 'objective': 'multi:softmax', 'random_state': 42, 'subsample': 0.8}
clf = XGBClassifier(**best_params)

# Keep only rows carrying one of the three encoded labels; rows whose label is
# missing or unmapped cannot be used for training or scoring.
labelled_df = train_df[train_df.Target.isin([0, 1, 2])]

# Hold out the evaluation rows BEFORE any resampling. Upsampling first and
# splitting afterwards puts copies of the same minority rows in both the
# training and the test set, which inflates the reported accuracy.
# stratify keeps the class proportions of the held-out set representative.
train_part, test_part = train_test_split(
    labelled_df,
    test_size=0.2,
    random_state=42,
    stratify=labelled_df['Target'],
)

# Separate majority and minority classes (training portion only).
# NOTE(review): assumes class 1 is the largest class - confirm against the data.
df_majority = train_part[train_part.Target == 1]
df_minority_0 = train_part[train_part.Target == 0]
df_minority_2 = train_part[train_part.Target == 2]

# Upsample minority class 0
df_minority_upsampled_0 = resample(df_minority_0,
                                   replace=True,
                                   n_samples=len(df_majority),
                                   random_state=42)

# Upsample minority class 2
df_minority_upsampled_2 = resample(df_minority_2,
                                   replace=True,
                                   n_samples=len(df_majority),
                                   random_state=42)

# Combine majority class with upsampled minority classes
df_upsampled = pd.concat([df_minority_upsampled_0, df_majority, df_minority_upsampled_2])

# Display new class counts
print(df_upsampled.Target.value_counts())

# Balanced training features/target
X = df_upsampled.drop(columns=['Target'])
y = df_upsampled['Target']
X_train, y_train = X, y

# Untouched hold-out set: original class balance, no duplicated rows
X_test = test_part.drop(columns=['Target'])
y_test = test_part['Target']

# Train the classifier and make predictions
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Evaluate the classifier
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print(report)


In [None]:
import argparse
def make_predictions(test_fname, predictions_fname):
    """Predict the target label for every row of a test CSV and save the result.

    The test file goes through the same column drops and imputation steps as
    the training data, is scored with the module-level fitted classifier
    ``clf``, and the predictions are written as a two-column CSV
    (``UID``, ``Target``) with the labels 'low' / 'medium' / 'high'.

    Args:
        test_fname: Path of the CSV file to score. It must contain ``UID`` and
            every column named in the lists below.
        predictions_fname: Path the predictions CSV is written to.

    Raises:
        KeyError: If an expected column is missing from the test file.
    """
    test_df = pd.read_csv(test_fname)

    # Sparse columns removed from the training data as well.
    columns_with_high_nulls = [
        'FarmClassification', 'PerimeterGuardPlantsArea', 'UndergroundStorageSqft',
        'FieldZoneLevel', 'HarvestStorageSqft', 'HasGreenHouse',
        'CropFieldConfiguration', 'FieldConstructionType', 'CultivatedAndWildArea',
        'FieldShadeCover', 'ReservoirType', 'TotalReservoirSize',
        'ReservoirWithFilter', 'HasPestControl', 'TaxOverdueYear',
        'TaxOverdueStatus', 'FarmShedAreaSqft', 'TotalAreaSqft',
        'PrimaryCropAreaSqft2', 'PrimaryCropAreaSqft', 'NumberGreenHouses',
        'PartialIrrigationSystemCount', 'NaturalLakePresence'
    ]
    # Columns that are not model inputs.
    columns_to_drop = ['UID', 'WaterReservoirCount', 'NationalRegionCode', 'TaxLandValue',
                       'WaterAccessPointsCalc', 'RawLocationId', 'WaterAccessPoints',
                       'CultivatedAreaSqft1']
    mean_fill_cols = ['AgricultureZoningCode', 'CropSpeciesVariety', 'CultivatedAreaSqft1',
                      'DistrictId', 'FieldEstablishedYear', 'LandUsageType', 'Latitude',
                      'Longitude', 'MainIrrigationSystemCount', 'NationalRegionCode',
                      'RawLocationId', 'StorageAndFacilityCount', 'TaxAgrarianValue',
                      'TaxLandValue', 'TotalCultivatedAreaSqft', 'TotalTaxAssessed',
                      'TotalValue', 'ValuationYear', 'WaterAccessPoints',
                      'WaterAccessPointsCalc']
    median_fill_cols = ['FieldSizeSqft', 'TownId']

    test_df = test_df.drop(columns=columns_with_high_nulls)

    # NOTE(review): the fill values are computed from the test file itself, not
    # from the training data - confirm this is intended.
    test_df[mean_fill_cols] = test_df[mean_fill_cols].fillna(test_df[mean_fill_cols].mean())

    # Fill missing values with the median for skewed data. This previously used
    # .mean(), which disagreed with how the same columns are imputed in training.
    test_df[median_fill_cols] = test_df[median_fill_cols].fillna(test_df[median_fill_cols].median())

    # Keep the identifiers before they are removed from the feature frame.
    test_uid = test_df[["UID"]].copy()
    features = test_df.drop(columns=columns_to_drop)

    # When the fitted model exposes the feature names it was trained on and the
    # test frame has all of them, select exactly those columns in that order so
    # stray extra columns do not break prediction. Otherwise the frame is passed
    # through unchanged and the model performs its own validation.
    expected_features = getattr(clf, "feature_names_in_", None)
    if expected_features is not None and set(expected_features) <= set(features.columns):
        features = features[list(expected_features)]

    preds = clf.predict(features)

    # Translate the encoded classes back to labels. The Series is built on
    # test_uid's own index so the assignment is positional rather than relying
    # on both sides happening to share a default RangeIndex.
    inverse_target_mapping = {0: 'low', 1: 'medium', 2: 'high'}
    test_uid["Target"] = pd.Series(preds, index=test_uid.index).map(inverse_target_mapping)
    test_uid.to_csv(predictions_fname, index=False)
import argparse

if __name__ == "__main__":
    # Command-line entry point: score a test CSV and write the predictions CSV.
    parser = argparse.ArgumentParser()
    # NOTE(review): --train-file is accepted but never read below; the training
    # data path is not taken from this argument in the code visible here.
    parser.add_argument("--train-file", type=str, help="Path to train.csv")
    # Both paths are passed straight to make_predictions, so they are required:
    # omitting them used to surface as an obscure pandas error on a None path.
    parser.add_argument("--test-file", type=str, required=True, help="Path to test.csv")
    parser.add_argument("--predictions-file", type=str, required=True, help="Save path for predictions")
    args = parser.parse_args()

    make_predictions(args.test_file, args.predictions_file)
