<a href="https://colab.research.google.com/github/prathikprajapati/Summer-Analytics-Prathik/blob/main/KAGGLE_hackathon_File.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pandas numpy scikit-learn tabulate lightgbm xgboost catboost
from sklearn.ensemble import VotingClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, f1_score
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_sample_weight
import warnings
warnings.filterwarnings("ignore")

# Load data
train = pd.read_csv('hacktrain.csv')
test = pd.read_csv('hacktest.csv')
test_id = test['ID']

# Encode target
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(train['class'])

# Drop ID and target
X = train.drop(['ID', 'class'], axis=1)
X_test = test.drop(['ID'], axis=1)

# Imputation
imputer = SimpleImputer(strategy='median')
X = imputer.fit_transform(X)
X_test = imputer.transform(X_test)

# Feature Engineering
def engineer(X):
    df = pd.DataFrame(X)
    df['mean'] = df.mean(axis=1)
    df['std'] = df.std(axis=1)
    df['min'] = df.min(axis=1)
    df['max'] = df.max(axis=1)
    df['range'] = df['max'] - df['min']
    df['skew'] = df.skew(axis=1)
    return df.values

X = engineer(X)
X_test = engineer(X_test)

# Train/Val Split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
sample_weight = compute_sample_weight('balanced', y_train)

# Models
lgb = LGBMClassifier(n_estimators=500, learning_rate=0.03, num_leaves=60, class_weight='balanced', random_state=42)
xgb = XGBClassifier(n_estimators=500, learning_rate=0.03, max_depth=6, scale_pos_weight=1, random_state=42, use_label_encoder=False, eval_metric='mlogloss')
cat = CatBoostClassifier(n_estimators=500, learning_rate=0.03, depth=6, verbose=0, random_state=42)

# Voting Ensemble
ensemble = VotingClassifier(
    estimators=[('lgb', lgb), ('xgb', xgb), ('cat', cat)],
    voting='soft'  # use predicted probabilities
)

# Train
ensemble.fit(X_train, y_train, sample_weight=sample_weight)

# Validate
y_pred = ensemble.predict(X_val)
acc = accuracy_score(y_val, y_pred)
f1 = f1_score(y_val, y_pred, average='macro')

print(f"✅ Accuracy: {acc:.4f}")
print(f"✅ Macro F1 Score: {f1:.4f}")

# Final Prediction
final_preds = ensemble.predict(X_test)
decoded_preds = label_encoder.inverse_transform(final_preds)

# Save Submission
submission = pd.DataFrame({'ID': test_id, 'class': decoded_preds})
submission.to_csv("submission.csv", index=False)
print("✅ Submission file created successfully!")

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

train['source'] = 'train'
test['source'] = 'test'
combined = pd.concat([train.drop('class', axis=1), test], axis=0)

for col in train.columns.drop(['ID', 'class']):
    plt.figure(figsize=(6,3))
    sns.kdeplot(data=combined, x=col, hue='source')
    plt.title(f'Distribution of {col} - Train vs Test')
    plt.show()

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, PolynomialFeatures
from sklearn.decomposition import PCA
from sklearn.utils.class_weight import compute_sample_weight
from scipy.stats import randint, uniform, ks_2samp
from lightgbm import LGBMClassifier
import warnings
from lightgbm import early_stopping, log_evaluation
import os

os.environ['LOKY_MAX_CPU_COUNT'] = '6'
warnings.filterwarnings("ignore", category=UserWarning, module="lightgbm")

# ===================== 🔹 Load Data =====================
df_train = pd.read_csv("hacktrain.csv")
df_test = pd.read_csv("hacktest.csv")

# Backup ID for final submission
ID = df_test['ID']

# ===================== 🔹 Target Encode =====================
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(df_train['class'])

X_train_df = df_train.drop(['ID', 'class'], axis=1)
X_test_df = df_test.drop(['ID'], axis=1)

# ===================== 🔹 Detect and Drop Shifted Features =====================
X_train_df['__source'] = 0
X_test_df['__source'] = 1
combined = pd.concat([X_train_df, X_test_df], axis=0)

shifted_features = []
for col in X_train_df.columns:
    if col == '__source': continue
    stat, pval = ks_2samp(
        combined[combined['__source'] == 0][col],
        combined[combined['__source'] == 1][col]
    )
    if pval < 0.01:
        shifted_features.append(col)

print(f"⚠️ Removing shifted features: {shifted_features}")
X_train_df.drop(columns=shifted_features + ['__source'], inplace=True)
X_test_df.drop(columns=shifted_features + ['__source'], errors='ignore', inplace=True)

# ===================== 🔹 Imputation =====================
imputer = SimpleImputer(strategy='median')
X_train_imputed = imputer.fit_transform(X_train_df)
X_test_imputed = imputer.transform(X_test_df)

# ===================== 🔹 Feature Engineering =====================
def add_stat_features(X):
    df = pd.DataFrame(X)
    df['mean'] = df.mean(axis=1)
    df['std'] = df.std(axis=1)
    df['min'] = df.min(axis=1)
    df['max'] = df.max(axis=1)
    df['range'] = df['max'] - df['min']
    df['skew'] = df.skew(axis=1)
    return df.values

X_train_final = add_stat_features(X_train_imputed)
X_test_final = add_stat_features(X_test_imputed)

# ===================== 🔹 Polynomial Features =====================
poly = PolynomialFeatures(degree=2, include_bias=False, interaction_only=True)
X_train_final = poly.fit_transform(X_train_final)
X_test_final = poly.transform(X_test_final)

# ===================== 🔹 PCA =====================
pca = PCA(n_components=0.95)
X_train_final = pca.fit_transform(X_train_final)
X_test_final = pca.transform(X_test_final)

# ===================== 🔹 Train/Val Split =====================
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
    X_train_final, y_train, test_size=0.2, stratify=y_train, random_state=42
)

# ===================== 🔹 Sample Weights =====================
sample_weight = compute_sample_weight('balanced', y_train_split)

# ===================== 🔹 LGBM Tuning =====================
model = LGBMClassifier(objective='multiclass', num_class=len(np.unique(y_train)), boosting_type='gbdt', random_state=42, verbose=-1)

param_dist = {
    'num_leaves': randint(30, 150),
    'max_depth': [5, 7, 15],
    'learning_rate': uniform(0.01, 0.1),
    'n_estimators': randint(300, 800),
    'subsample': uniform(0.6, 0.3),
    'colsample_bytree': uniform(0.6, 0.3),
    'reg_alpha': uniform(0.01, 0.5),
    'reg_lambda': uniform(0.01, 0.5),
    'min_child_samples': randint(5, 25),
    'subsample_freq': [0, 1, 2]
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

random_search = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_dist,
    n_iter=30,
    scoring='f1_macro',
    cv=cv,
    verbose=1,
    n_jobs=-1,
    random_state=42
)

print("🔍 Starting hyperparameter tuning...")
random_search.fit(X_train_split, y_train_split, sample_weight=sample_weight)

# ===================== 🔹 Final Training =====================
best_model = random_search.best_estimator_
best_model.set_params(n_estimators=500, learning_rate=0.02)

best_model.fit(
    X_train_split, y_train_split,
    sample_weight=sample_weight,
    eval_set=[(X_val_split, y_val_split)],
    eval_metric="multi_logloss",
    callbacks=[early_stopping(stopping_rounds=50), log_evaluation(50)]
)

# ===================== 🔹 Prediction & Submission =====================
y_pred = best_model.predict(X_test_final)
y_decoded = label_encoder.inverse_transform(y_pred)

submission = pd.DataFrame({'ID': ID, 'class': y_decoded})
submission.to_csv("submission.csv", index=False)

print("✅ Submission file created successfully!")

⚠️ Removing shifted features: ['Unnamed: 0']
🔍 Starting hyperparameter tuning...
Fitting 5 folds for each of 30 candidates, totalling 150 fits
Training until validation scores don't improve for 50 rounds
[50]	valid_0's multi_logloss: 0.738967
[100]	valid_0's multi_logloss: 0.459056
[150]	valid_0's multi_logloss: 0.341589
[200]	valid_0's multi_logloss: 0.285664
[250]	valid_0's multi_logloss: 0.255238
[300]	valid_0's multi_logloss: 0.23775
[350]	valid_0's multi_logloss: 0.226354
[400]	valid_0's multi_logloss: 0.219594
[450]	valid_0's multi_logloss: 0.214511
[500]	valid_0's multi_logloss: 0.211553
Did not meet early stopping. Best iteration is:
[498]	valid_0's multi_logloss: 0.211526
✅ Submission file created successfully!


In [4]:
import numpy as np
import pandas as pd
import os
import warnings

from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, PolynomialFeatures, StandardScaler
from sklearn.decomposition import PCA
from sklearn.utils.class_weight import compute_sample_weight
from scipy.stats import randint, uniform, ks_2samp
from sklearn.cluster import KMeans
from lightgbm import LGBMClassifier, early_stopping, log_evaluation

# ===================== 🔹 Setup =====================
os.environ['LOKY_MAX_CPU_COUNT'] = '6'
warnings.filterwarnings("ignore", category=UserWarning, module="lightgbm")

# ===================== 🔹 Load Data =====================
df_train = pd.read_csv("hacktrain.csv")
df_test = pd.read_csv("hacktest.csv")
ID = df_test['ID']

# ===================== 🔹 Target Encoding =====================
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(df_train['class'])

X_train_df = df_train.drop(['ID', 'class'], axis=1)
X_test_df = df_test.drop(['ID'], axis=1)

# ===================== 🔹 Detect & Drop Shifted Features =====================
X_train_df['__source'] = 0
X_test_df['__source'] = 1
combined = pd.concat([X_train_df, X_test_df], axis=0)

shifted_features = []
for col in X_train_df.columns:
    if col == '__source':
        continue
    stat, pval = ks_2samp(
        combined[combined['__source'] == 0][col],
        combined[combined['__source'] == 1][col]
    )
    if pval < 0.01:
        shifted_features.append(col)

print(f"⚠️ Removing shifted features: {shifted_features}")
X_train_df.drop(columns=shifted_features + ['__source'], inplace=True)
X_test_df.drop(columns=shifted_features + ['__source'], errors='ignore', inplace=True)

# ===================== 🔹 Imputation =====================
imputer = SimpleImputer(strategy='median')
X_train_imputed = imputer.fit_transform(X_train_df)
X_test_imputed = imputer.transform(X_test_df)

# ===================== 🔹 Add Cluster Feature =====================
kmeans = KMeans(n_clusters=5, random_state=42)
X_train_cluster = kmeans.fit_predict(X_train_imputed)
X_test_cluster = kmeans.predict(X_test_imputed)

# ===================== 🔹 Feature Engineering =====================
def add_stat_features(X, cluster):
    df = pd.DataFrame(X)
    df['mean'] = df.mean(axis=1)
    df['std'] = df.std(axis=1)
    df['min'] = df.min(axis=1)
    df['max'] = df.max(axis=1)
    df['range'] = df['max'] - df['min']
    df['skew'] = df.skew(axis=1)
    df['cluster'] = cluster
    return df.values

X_train_final = add_stat_features(X_train_imputed, X_train_cluster)
X_test_final = add_stat_features(X_test_imputed, X_test_cluster)

# ===================== 🔹 Polynomial Features =====================
poly = PolynomialFeatures(degree=2, include_bias=False, interaction_only=True)
X_train_final = poly.fit_transform(X_train_final)
X_test_final = poly.transform(X_test_final)

# ===================== 🔹 Scale Before PCA =====================
scaler = StandardScaler()
X_train_final = scaler.fit_transform(X_train_final)
X_test_final = scaler.transform(X_test_final)

# ===================== 🔹 PCA =====================
pca = PCA(n_components=0.99)
X_train_final = pca.fit_transform(X_train_final)
X_test_final = pca.transform(X_test_final)

# ===================== 🔹 Split =====================
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
    X_train_final, y_train, test_size=0.2, stratify=y_train, random_state=42
)

# ===================== 🔹 Sample Weight =====================
sample_weight = compute_sample_weight('balanced', y_train_split)

# ===================== 🔹 Model & Tuning =====================
model = LGBMClassifier(objective='multiclass',
                       num_class=len(np.unique(y_train)),
                       boosting_type='gbdt',
                       random_state=42, verbose=-1)

param_dist = {
    'num_leaves': randint(30, 150),
    'max_depth': [5, 7, 15],
    'learning_rate': uniform(0.01, 0.1),
    'n_estimators': randint(300, 800),
    'subsample': uniform(0.6, 0.3),
    'colsample_bytree': uniform(0.6, 0.3),
    'reg_alpha': uniform(0.01, 0.5),
    'reg_lambda': uniform(0.01, 0.5),
    'min_child_samples': randint(5, 25),
    'subsample_freq': [0, 1, 2]
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

random_search = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_dist,
    n_iter=10,
    scoring='f1_macro',
    cv=cv,
    verbose=1,
    n_jobs=-1,
    random_state=42
)

print("🔍 Starting hyperparameter tuning...")
random_search.fit(X_train_split, y_train_split, sample_weight=sample_weight)

# ===================== 🔹 Final Model Training =====================
best_model = random_search.best_estimator_
best_model.set_params(n_estimators=500, learning_rate=0.02)

best_model.fit(
    X_train_split, y_train_split,
    sample_weight=sample_weight,
    eval_set=[(X_val_split, y_val_split)],
    eval_metric="multi_logloss",
    callbacks=[early_stopping(stopping_rounds=50), log_evaluation(50)]
)

# ===================== 🔹 Final Prediction =====================
y_pred = best_model.predict(X_test_final)
y_decoded = label_encoder.inverse_transform(y_pred)

submission = pd.DataFrame({'ID': ID, 'class': y_decoded})
submission.to_csv("submission.csv", index=False)

print("✅ Submission file created successfully!")

FileNotFoundError: [Errno 2] No such file or directory: 'hacktrain.csv'