In [None]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import os
# Print the full path of every file found under the Kaggle input directory.
input_files = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

In [None]:
# Load the training CSV into `df`; every later training cell reads from it.
TRAIN_CSV = "/kaggle/input/summer-analytics-mid-hackathon/hacktrain.csv"
df = pd.read_csv(TRAIN_CSV)

In [None]:
# Number of missing values in each column (shown as the cell output).
df.isna().sum()

In [None]:
# Draw the missing-value mask of `df` as a heatmap, one column per feature.
fig, ax = plt.subplots(figsize=(12, 6))
null_mask = df.isnull()
sns.heatmap(
    null_mask,
    cmap='Blues',
    cbar=False,
    yticklabels=False,
    xticklabels=df.columns,
    ax=ax,
);
plt.show()

In [None]:
# Replace missing values in numeric columns with that column's mean.
column_means = df.mean(numeric_only=True)
df = df.fillna(column_means)

In [None]:
# Re-draw the missing-value mask of `df` to check the result of the fill step.
fig, ax = plt.subplots(figsize=(12, 6))
null_mask = df.isnull()
sns.heatmap(
    null_mask,
    cmap='Blues',
    cbar=False,
    yticklabels=False,
    xticklabels=df.columns,
    ax=ax,
);
plt.show()

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder, StandardScaler, PolynomialFeatures
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import classification_report
from scipy.ndimage import uniform_filter1d
# from imblearn.over_sampling import SMOTE

# ---------------------------------------------------------------------------
# Fit a class-weighted multinomial logistic regression on the NDVI features.
#
# `df` is only read here, never modified, so the cell can be re-run safely:
# labels are always encoded from the original `class` column and the features
# are smoothed exactly once. The inference cell further down relies on
# ndvi_cols, label_encoder, imputer, scaler, selector and model.
# ---------------------------------------------------------------------------

# Bookkeeping columns are excluded from the features when they are present.
non_feature_cols = [col for col in ['Unnamed: 0', 'ID'] if col in df.columns]
ndvi_cols = [col for col in df.columns if col not in non_feature_cols + ['class']]

# Encode the class names as integers; kept as a Series aligned with `df`.
label_encoder = LabelEncoder()
y = pd.Series(label_encoder.fit_transform(df['class']), index=df.index, name='class')

# Moving average with a window of 9.
# NOTE(review): axis=0 runs the filter down each column, i.e. across
# neighbouring rows rather than along a row's own columns - confirm intended.
X_smoothed = df[ndvi_cols].apply(lambda x: uniform_filter1d(x, size=9), axis=0)

imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(X_smoothed)

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_imputed)

selector = VarianceThreshold(threshold=0.01)
X_sel = selector.fit_transform(X_scaled)

# Disabled experiments, kept for reference:
# poly = PolynomialFeatures(degree=1, interaction_only=True, include_bias=False)
# X_poly = poly.fit_transform(X_sel)
# smote = SMOTE(random_state=42)
# X_resampled, y_resampled = smote.fit_resample(X_sel, y)

# Weights are keyed by the integer code LabelEncoder assigns (sorted class order).
custom_weights = {
    0: 1.0,   # farm (baseline)
    1: 0.5,   # forest
    2: 3.5,   # grass (reduced from extreme)
    3: 1.5,   # impervious
    4: 8.5,   # orchard (much lower than 44.4)
    5: 4.0    # water
}

# `multi_class='multinomial'` is omitted on purpose: with the lbfgs solver
# scikit-learn (>= 0.22) already fits a multinomial model for multiclass
# targets, and the explicit argument is deprecated in recent releases.
model = LogisticRegression(
    solver='lbfgs',
    max_iter=500,
    class_weight=custom_weights,
    C=0.05
)
model.fit(X_sel, y)
# model.fit(X_resampled, y_resampled)

# In-sample report: the model is scored on the same rows it was fitted on.
y_pred = model.predict(X_sel)
print(classification_report(y, y_pred, target_names=label_encoder.classes_))

In [None]:
# from sklearn.model_selection import GridSearchCV
# # Grid of values to search
# param_grid = {
#     'C': [0.01, 0.1, 0.5, 1, 2, 5, 10, 50, 100]
# }

# # Setup GridSearchCV
# grid_search = GridSearchCV(
#     estimator=model,
#     param_grid=param_grid,
#     scoring='f1_macro',  # or 'accuracy', 'f1_weighted'
#     cv=5,
#     verbose=2,
#     n_jobs=-1
# )

# # Run the grid search
# grid_search.fit(X_sel, y)

# # Best result
# print("Best C value:", grid_search.best_params_)
# print("Best F1 Macro Score:", grid_search.best_score_)

In [None]:
# ---------------------------------------------------------------------------
# Predict classes for the test set with the fitted training pipeline
# (ndvi_cols, imputer, scaler, selector, model, label_encoder).
# ---------------------------------------------------------------------------
test_df = pd.read_csv("/kaggle/input/summer-analytics-mid-hackathon/hacktest.csv")

# Keep the row identifiers for the submission; fall back to the row index
# when the file has no ID column instead of silently carrying None forward.
test_ids = test_df["ID"] if "ID" in test_df.columns else pd.Series(test_df.index, name="ID")

# Same guarded drop as the training cell, so a missing column is not a KeyError.
test_df = test_df.drop(columns=[col for col in ['Unnamed: 0', 'ID'] if col in test_df.columns])

# Fill gaps BEFORE smoothing, mirroring the training order (fill, then filter).
# A NaN fed into uniform_filter1d contaminates the smoothed values of other
# rows, which a later imputation step would then overwrite with plain means.
train_means = pd.Series(imputer.statistics_, index=ndvi_cols)
test_filled = test_df[ndvi_cols].fillna(train_means)

test_df[ndvi_cols] = test_filled.apply(lambda x: uniform_filter1d(x, size=9), axis=0)

# Kept for parity with training; a no-op once the gaps are filled above.
X_test_imputed = imputer.transform(test_df[ndvi_cols])

X_test_scaled = scaler.transform(X_test_imputed)

X_test_sel = selector.transform(X_test_scaled)

# Predict integer codes and map them back to the original class names.
y_test_pred = model.predict(X_test_sel)
y_test_labels = label_encoder.inverse_transform(y_test_pred)


In [None]:
# Assemble the submission table (ID + predicted class name), show it, and
# write it to disk without the index column.
submission_columns = {
    'ID': test_ids,
    'Predicted_Class': y_test_labels,
}
output_df = pd.DataFrame(submission_columns)

print(output_df)
output_df.to_csv("submission_v32.csv", index=False)

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder

# ---------------------------------------------------------------------------
# Scatter the predicted classes over the first two principal components of
# the raw test features.
#
# Cell-local names (viz_*) are used so that the training-time `df`,
# `ndvi_cols` and fitted `label_encoder` are not overwritten; the inference
# cell can therefore still be re-run after this one.
# ---------------------------------------------------------------------------

# Read the submission from the same relative path it was written to.
viz_df = pd.read_csv("submission_v32.csv")

# Reload the raw test features and drop the bookkeeping columns if present.
features_df = pd.read_csv("/kaggle/input/summer-analytics-mid-hackathon/hacktest.csv")
features_df = features_df.drop(
    columns=[col for col in ['Unnamed: 0', 'ID'] if col in features_df.columns]
)

# Attach the features to the predictions (row-aligned on the default index).
viz_df = viz_df.join(features_df)

# Integer code per predicted label, from a throwaway encoder.
viz_df['Predicted_Code'] = LabelEncoder().fit_transform(viz_df['Predicted_Class'])

# Everything except the prediction bookkeeping columns is treated as a feature.
viz_cols = [col for col in viz_df.columns if col not in ['ID', 'Predicted_Class', 'Predicted_Code']]

# PCA rejects NaN, so gaps are filled with the column mean for plotting only.
viz_matrix = viz_df[viz_cols].fillna(viz_df[viz_cols].mean())

pca = PCA(n_components=2)
X_pca = pca.fit_transform(viz_matrix)

# Two PCA coordinates plus the predicted class used for colouring.
pca_df = pd.DataFrame({
    'PCA1': X_pca[:, 0],
    'PCA2': X_pca[:, 1],
    'Predicted_Class': viz_df['Predicted_Class']
})

# Plot the scatter
plt.figure(figsize=(10, 7))
sns.scatterplot(data=pca_df, x='PCA1', y='PCA2', hue='Predicted_Class', palette='tab10', alpha=0.7, s=50)
plt.title("Test-set predictions in PCA space")
plt.grid(True)
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()

In [None]:
import pandas as pd

def compare_submissions(path_a, path_b):
    """Return the rows on which two submission CSV files disagree.

    Parameters
    ----------
    path_a, path_b : str
        Paths of the two CSV files to compare. Both must have the same
        shape and the same column names.

    Returns
    -------
    pandas.DataFrame
        One row per differing record with three columns: ``Index`` (row
        position in the files) and the last-column value from each file,
        named after the file (base name without extension).

    Raises
    ------
    ValueError
        If the two files differ in shape or column names.
    """
    sub_a = pd.read_csv(path_a)
    sub_b = pd.read_csv(path_b)

    if sub_a.shape != sub_b.shape or list(sub_a.columns) != list(sub_b.columns):
        raise ValueError(
            f"Cannot compare {path_a!r} and {path_b!r}: shapes or columns differ"
        )

    # A value missing in both files counts as equal (NaN != NaN otherwise).
    both_missing = sub_a.isna() & sub_b.isna()
    diff_rows = (~(sub_a.eq(sub_b) | both_missing)).any(axis=1)

    label_a = os.path.splitext(os.path.basename(path_a))[0]
    label_b = os.path.splitext(os.path.basename(path_b))[0]
    if label_a == label_b:
        # Keep the two value columns distinct when the file names coincide.
        label_a, label_b = label_a + "_a", label_b + "_b"

    comparison = pd.DataFrame({
        "Index": sub_a.index[diff_rows],
        label_a: sub_a.loc[diff_rows].iloc[:, -1].values,
        label_b: sub_b.loc[diff_rows].iloc[:, -1].values,
    })
    return comparison.reset_index(drop=True)


# Earlier submissions to diff; they are not produced by this notebook, so the
# comparison is skipped when either file is absent.
SUBMISSION_A = "submission_v30.csv"
SUBMISSION_B = "submission_v29.csv"

if os.path.exists(SUBMISSION_A) and os.path.exists(SUBMISSION_B):
    comparison = compare_submissions(SUBMISSION_A, SUBMISSION_B)
    print(comparison)
else:
    print(f"Skipping comparison: {SUBMISSION_A} and/or {SUBMISSION_B} not found")