<a href="https://colab.research.google.com/github/samx178/Diabetes-Predictor-AIML_model/blob/main/ESE_final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd

from catboost import CatBoostClassifier
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, log_loss
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from xgboost import XGBClassifier


In [None]:
# Dataset configuration. TARGET_COL is the only value that normally has to be
# edited when switching to a different dataset.
TRAIN_PATH, TEST_PATH = "train.csv", "test.csv"

TARGET_COL = "quality_grade"


In [None]:
# Load the training split and immediately discard rows with a missing target
# (mandatory: a row without a label cannot be used for supervised training).
train_df = pd.read_csv(
    "/kaggle/input/mle-ese-mock/train (5).csv"
).dropna(subset=[TARGET_COL])

# Load the test split that predictions are generated for.
test_df = pd.read_csv("/kaggle/input/mle-ese-mock/test (4).csv")


In [None]:
# Separate the label column from the feature columns.
y = train_df[TARGET_COL]
X = train_df.drop(TARGET_COL, axis=1)


In [None]:
# Map the class labels to integer codes; the fitted encoder is kept so the
# codes can be mapped back to the original labels later.
label_encoder = LabelEncoder().fit(y)
y_encoded = label_encoder.transform(y)


In [None]:
# Partition the feature names by dtype: 64-bit numeric columns versus
# object (string) columns.
numeric_cols = list(X.select_dtypes(include=["int64", "float64"]).columns)
categorical_cols = list(X.select_dtypes(include=["object"]).columns)


In [None]:
# Preprocessing shared by pipelines 1 and 2 (Random Forest and XGBoost).

# Numeric columns: fill gaps with the column median, then standardise.
numeric_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode; categories unseen during fit are ignored at transform time.
categorical_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore")),
])

# Route each column group through its own sub-pipeline.
preprocessor = ColumnTransformer(transformers=[
    ("num", numeric_pipeline, numeric_cols),
    ("cat", categorical_pipeline, categorical_cols),
])


In [None]:
# Pipeline 1: shared preprocessing followed by a Random Forest classifier.
rf_model = RandomForestClassifier(n_estimators=400, random_state=42, n_jobs=-1)

rf_pipeline = Pipeline(steps=[
    ("preprocessing", preprocessor),
    ("model", rf_model),
])


In [None]:
# Pipeline 2: shared preprocessing followed by an XGBoost classifier.
xgb_params = {
    "n_estimators": 500,
    "max_depth": 6,
    "learning_rate": 0.05,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "eval_metric": "mlogloss",
    "random_state": 42,
}

xgb_pipeline = Pipeline(steps=[
    ("preprocessing", preprocessor),
    ("model", XGBClassifier(**xgb_params)),
])


In [None]:
# CatBoost is fed manually imputed copies rather than the sklearn preprocessor.
X_cb, test_cb = X.copy(), test_df.copy()

# Categorical gaps become an explicit "missing" category in both frames.
for frame in (X_cb, test_cb):
    for col in categorical_cols:
        frame[col] = frame[col].fillna("missing")

# Numeric gaps are filled with the TRAINING median so the test frame is
# imputed with the same statistics the model was trained on.
train_medians = {col: X_cb[col].median() for col in numeric_cols}
for col, fill_value in train_medians.items():
    X_cb[col] = X_cb[col].fillna(fill_value)
    test_cb[col] = test_cb[col].fillna(fill_value)


In [None]:
# Positional indices of the categorical columns inside X_cb.
cat_features = list(map(X_cb.columns.get_loc, categorical_cols))


In [None]:
# Pipeline 3: a bare CatBoost multi-class model (no sklearn preprocessing step;
# it is trained on the manually imputed X_cb frame).
catboost_params = {
    "iterations": 500,
    "depth": 6,
    "learning_rate": 0.05,
    "loss_function": "MultiClass",
    "verbose": 0,
    "random_state": 42,
}
cat_pipeline = CatBoostClassifier(**catboost_params)


In [None]:
# Model selector for the training cell below.
# Author's note: CatBoost gives the best log loss,
# XGBoost gives the best accuracy.
MODEL_TYPE = "catboost"
# MODEL_TYPE = "rf"
# MODEL_TYPE = "gb"
# MODEL_TYPE = "xgb"


In [None]:
# Train the model chosen by MODEL_TYPE, report hold-out metrics, then refit on
# every training row and predict class labels for the test set.
#
# Step 1: pick the estimator, the feature frames it expects, and any extra
# fit-time arguments. Everything after that is identical for all model types.
if MODEL_TYPE == "catboost":
    # CatBoost uses the manually imputed frames and must be told which column
    # positions are categorical.
    selected_model = cat_pipeline
    train_features, test_features = X_cb, test_cb
    fit_kwargs = {"cat_features": cat_features}
elif MODEL_TYPE == "rf":
    selected_model = rf_pipeline
    train_features, test_features = X, test_df
    fit_kwargs = {}
elif MODEL_TYPE == "gb":
    # No gradient-boosting pipeline is defined in an earlier cell, so it is
    # built here with the shared preprocessor and default hyper-parameters.
    gb_pipeline = Pipeline([
        ("preprocessing", preprocessor),
        ("model", GradientBoostingClassifier(random_state=42)),
    ])
    selected_model = gb_pipeline
    train_features, test_features = X, test_df
    fit_kwargs = {}
elif MODEL_TYPE == "xgb":
    selected_model = xgb_pipeline
    train_features, test_features = X, test_df
    fit_kwargs = {}
else:
    # Fail loudly instead of leaving test_preds undefined for a typo.
    raise ValueError(f"Unknown MODEL_TYPE: {MODEL_TYPE!r}")

# Step 2: stratified 80/20 hold-out split for validation.
X_train, X_val, y_train, y_val = train_test_split(
    train_features, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)

selected_model.fit(X_train, y_train, **fit_kwargs)

val_preds = selected_model.predict(X_val)
val_proba = selected_model.predict_proba(X_val)

print("Validation Accuracy:", accuracy_score(y_val, val_preds))
print("Validation Log Loss:", log_loss(y_val, val_proba))

# Step 3: refit on all labelled rows before predicting the test set.
selected_model.fit(train_features, y_encoded, **fit_kwargs)
test_preds = selected_model.predict(test_features)


In [None]:
from sklearn.metrics import log_loss, r2_score


In [None]:
# Re-evaluate CatBoost on a stratified hold-out split, then refit on every
# training row so that probabilities predicted for the test set afterwards
# come from a model trained on all available data (not just the 80% split).
#
# The split is rebuilt from X_cb here: a split left over from the
# model-selection cell is taken from the raw X (un-imputed) whenever
# MODEL_TYPE is not "catboost".
X_train, X_val, y_train, y_val = train_test_split(
    X_cb, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)

cat_pipeline.fit(X_train, y_train, cat_features=cat_features)

# Predictions
val_preds = cat_pipeline.predict(X_val)
val_proba = cat_pipeline.predict_proba(X_val)

print("Validation Accuracy:", accuracy_score(y_val, val_preds))
print("Validation Log Loss:", log_loss(y_val, val_proba))

# Final fit on the full training set.
cat_pipeline.fit(X_cb, y_encoded, cat_features=cat_features)


In [None]:
# Load the sample submission and show its column names; later cells read the
# id column and the class-column names from it.
sample_sub = pd.read_csv("/kaggle/input/mle-ese-mock/submission (6).csv")
print(sample_sub.columns)


In [None]:
# Per-class probabilities for every row of the imputed test frame.
test_proba = cat_pipeline.predict_proba(test_cb)


In [None]:
# Build the probability submission: the id column from the sample file plus
# one "Status_<name>" column per class column of the sample file.
class_columns = sample_sub.columns[1:]   # every column after the id

# Guard against a silent mismatch: without these checks, surplus probability
# columns would simply be dropped and missing ones would surface as an
# opaque IndexError.
if test_proba.shape[1] != len(class_columns):
    raise ValueError(
        f"Class count mismatch: model={test_proba.shape[1]}, "
        f"sample_submission={len(class_columns)}"
    )
if test_proba.shape[0] != len(sample_sub):
    raise ValueError(
        f"Row count mismatch: predictions={test_proba.shape[0]}, "
        f"sample_submission={len(sample_sub)}"
    )

# id column
submission = pd.DataFrame({"id": sample_sub["id"]})

# class columns with Status_ prefix
# NOTE(review): column i of test_proba is paired with the i-th class column of
# the sample file; this assumes both use the same class order - confirm.
for i, col in enumerate(class_columns):
    submission[f"Status_{col}"] = test_proba[:, i]


In [None]:
# Write the submission to disk without the DataFrame index.
submission.to_csv("submission5.csv", index=False)


In [None]:
# Quick visual check of the first rows and the column names.
print(submission.head())
print(submission.columns)


In [None]:
import pandas as pd

# Read back the file written by the export cell to sanity-check it.
# (The only code that would produce "submission4.csv" is commented out, so
# reading that name fails on a clean run.)
submission_new = pd.read_csv("submission5.csv")

print(submission_new)


In [None]:
# First rows of the re-loaded submission.
print(submission_new.head())


In [None]:
# (rows, columns) of the re-loaded submission.
print(submission_new.shape)


In [None]:
# Column names of the re-loaded submission.
print(submission_new.columns)


In [None]:
#CSV trick 1

In [None]:
# sample_sub = pd.read_csv("submission (6).csv")


In [None]:
# print(sample_sub.columns)


In [None]:
# test_proba = model.predict_proba(test_data)


In [None]:
# submission = sample_sub.copy()

# # id column exactly as sample
# submission.iloc[:, 0] = sample_sub.iloc[:, 0]

# # fill the remaining columns in order
# for i in range(1, submission.shape[1]):
#     submission.iloc[:, i] = test_proba[:, i-1]

# submission.to_csv("final_submission.csv", index=False)


In [None]:
# print(submission.head())
# print(submission.columns)
# print(submission.shape)


In [None]:
#trick 2

In [None]:
# import pandas as pd
# import numpy as np

# def build_submission_csv(
#     sample_submission_path,
#     test_proba,
#     output_filename="submission.csv"
# ):
#     """
#     Universal Kaggle submission builder.
#     Works for any dataset as long as sample submission is correct.
#     """

#     # 1. Read sample submission (single source of truth)
#     sample_sub = pd.read_csv(sample_submission_path)

#     # 2. Basic validation
#     if test_proba.shape[1] != (sample_sub.shape[1] - 1):
#         raise ValueError(
#             f"Class count mismatch: "
#             f"model={test_proba.shape[1]}, "
#             f"sample_submission={sample_sub.shape[1] - 1}"
#         )

#     # 3. Clone sample submission
#     submission = sample_sub.copy()

#     # 4. Keep id column exactly same
#     submission.iloc[:, 0] = sample_sub.iloc[:, 0]

#     # 5. Fill all remaining columns order-wise
#     for i in range(1, submission.shape[1]):
#         submission.iloc[:, i] = test_proba[:, i - 1]

#     # 6. Save CSV
#     submission.to_csv(output_filename, index=False)

#     print(f"✅ Submission saved as: {output_filename}")
#     print("📌 Columns used:", list(submission.columns))

#     return submission


In [None]:
# test_proba = cat_model.predict_proba(test_cb)


In [None]:
# submission = build_submission_csv(
#     sample_submission_path="/kaggle/input/mle-ese-mock/submission (6).csv",
#     test_proba=test_proba,
#     output_filename="submission4.csv"
# )


In [None]:
# Raj's code

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import (
    accuracy_score,
    log_loss,
    precision_score,
    recall_score,
    f1_score,
    classification_report,
    confusion_matrix
)
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# ==========================================
# GLOBAL WARNING CONTROL
# ==========================================
import warnings

# Filters are installed in this order:
#   1. FutureWarning from any module (seaborn / pandas deprecation notices)
#   2. RuntimeWarning raised from pandas (NaN comparisons)
#   3. every warning raised from seaborn (visualisation only)
for _filter_kwargs in (
    {"category": FutureWarning},
    {"category": RuntimeWarning, "module": "pandas"},
    {"module": "seaborn"},
):
    warnings.filterwarnings("ignore", **_filter_kwargs)

# 4. Silence numpy's floating-point "invalid" notifications.
np.seterr(invalid='ignore')

In [None]:
# Load the train and test CSVs for this second workflow.
train=pd.read_csv("/kaggle/input/final-everything/train.csv")
test=pd.read_csv("/kaggle/input/final-everything/test.csv")

In [None]:
# Missing-value count per training column (displayed as the cell output).
train.isnull().sum()

In [None]:
# Missing-value count per test column (displayed as the cell output).
test.isnull().sum()

In [None]:
# Drop rows whose label is missing. 'target column' is the same placeholder
# used by the cells below that split off X and y; replace it with the real
# target column name everywhere before running.
train = train.dropna(subset=['target column'])  # output label

In [None]:
# Set the identifier aside for the submission files and remove it from the
# test features in the same step.
test_id = test.pop('id')

In [None]:
# Split the label off the features.
X = train.drop(columns=['target column'])
# 'id' is removed from the test frame before prediction, so it must not be a
# training feature either: the fitted ColumnTransformer would otherwise ask
# for a column the test frame no longer has. errors='ignore' keeps this a
# no-op when the training file has no 'id' column.
X = X.drop(columns=['id'], errors='ignore')
y = train['target column']

In [None]:
# Hold out 20% of the rows for evaluation, stratified on the label, with a
# fixed seed so the split is reproducible.
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42,stratify=y)

In [None]:
# Column names grouped by dtype: 64-bit numeric versus object/category.
numeric_features=X.select_dtypes(include=['int64','float64']).columns
categorical_features=X.select_dtypes(include=['object','category']).columns

In [None]:
# Numeric branch: mean imputation followed by standardisation.
numerical_pipeline = Pipeline([
    ('impute', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical branch: most-frequent imputation followed by one-hot encoding;
# categories unseen during fit are ignored at transform time.
categorical_pipeline = Pipeline([
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('encode', OneHotEncoder(handle_unknown='ignore')),
])

In [None]:
# VISUALISATION
# STEP 1: HISTPLOT
# ==========================================
# One histogram (with KDE overlay) per numeric feature, missing values excluded.
print("Step 1: Histplots")
for feature in numeric_features:
    observed = X[feature].dropna()
    plt.figure(figsize=(8, 4))
    sns.histplot(observed, kde=True, color='royalblue')
    plt.show()

In [None]:
# ==========================================
# STEP 2: TARGET COUNTS
# ==========================================
# Class frequencies as a table and as a bar chart.
print("\nStep 2: Target Counts")
class_frequencies = y.value_counts()
print(class_frequencies)
sns.countplot(x=y)
plt.show()

In [None]:
# ==========================================
# STEP 3: BOXPLOT (Before)
# ==========================================
# One boxplot per numeric feature, drawn before any outlier handling.
print("\nStep 3: Boxplots (Before)")
for feature in numeric_features:
    plt.figure(figsize=(8, 2))
    sns.boxplot(x=X[feature], color='tomato')
    plt.show()

In [None]:
# ==========================================
# STEP 4: OUTLIER & INF HANDLING (ROBUST VERSION)
# ==========================================
print("\nStep 4: Handling Outliers & Converting Infinity to NaN")

# Work on explicit copies of the split frames so the assignments below never
# write into a slice of X.
X_train = X_train.copy()
X_test = X_test.copy()

# The final `test` frame receives exactly the same cleaning as the split
# frames; otherwise the model would be trained on clipped, finite values but
# asked to predict on raw ones.
frames_to_clean = (X_train, X_test, test)

# 1. Replace inf -> NaN (the downstream imputers then treat them as missing)
for frame in frames_to_clean:
    present = [col for col in numeric_features if col in frame.columns]
    if present:
        frame[present] = frame[present].replace([np.inf, -np.inf], np.nan)

# 2. Compute IQR on TRAIN only, so no statistics leak from held-out data
Q1 = X_train[numeric_features].quantile(0.25)
Q3 = X_train[numeric_features].quantile(0.75)
IQR = Q3 - Q1

# 3. Keep only valid columns (IQR > 0 and not NaN)
valid_cols = IQR[(IQR > 0) & (~IQR.isna())].index

# 4. Clip only valid columns to [Q1 - 1.5*IQR, Q3 + 1.5*IQR]
lower = Q1[valid_cols] - 1.5 * IQR[valid_cols]
upper = Q3[valid_cols] + 1.5 * IQR[valid_cols]

for frame in frames_to_clean:
    present = [col for col in valid_cols if col in frame.columns]
    if present:
        frame[present] = frame[present].clip(lower[present], upper[present], axis=1)


In [None]:
# ==========================================
# STEP 5: RE-CHECK TARGET COUNTS
# ==========================================
# Number of distinct classes, then the per-class row counts.
n_classes = y.nunique()
print(f"Total Unique Classes: {n_classes}")
print(30 * "-")
print(y.value_counts())

In [None]:
# ==========================================
# STEP 6: RE-CHECK BOXPLOTS (AFTER CLEANING)
# ==========================================
# Same boxplots as step 3, now drawn from the cleaned training split.
print("\nStep 6: Final Visual Checks")

for feature in numeric_features:
    plt.figure(figsize=(8, 2))
    sns.boxplot(x=X_train[feature], color='limegreen')
    plt.title(f"Boxplot of {feature}")
    plt.show()


In [None]:
# ==========================================
# STEP 7: NORMAL PAIRPLOT
# ==========================================
print("\nStep 7: Generating Normal Pairplot")

# Features and label side by side, capped at 500 sampled rows.
labelled_train = pd.concat([X_train, y_train], axis=1)
sample_size = min(500, len(X_train))
plot_df = labelled_train.sample(sample_size, random_state=42)

sns.pairplot(plot_df, hue='target column')
plt.show()


In [None]:
# ==========================================
# STEP 8: HEATMAP (Fixed)
# ==========================================
# Annotated correlation matrix of the numeric features.
print("\nStep 8: Corrected Heatmap")
plt.figure(figsize=(10, 8))
feature_corr = X.corr(numeric_only=True)
sns.heatmap(feature_corr, annot=True, cmap='RdYlBu', center=0, square=True)
plt.show()

In [None]:
# from sklearn.linear_model import LogisticRegression
# model = LogisticRegression(
#     C=1.0,
#     penalty='l2',
#     solver='lbfgs',
#     max_iter=1000,
#     n_jobs=-1
# )

# from sklearn.ensemble import RandomForestClassifier
# model = RandomForestClassifier(
#     n_estimators=400,
#     max_depth=None,
#     min_samples_split=5,
#     min_samples_leaf=2,
#     max_features='sqrt',
#     random_state=42,
#     n_jobs=-1
# )

from sklearn.ensemble import GradientBoostingClassifier

# Active estimator: gradient boosting. The alternatives around this block are
# kept commented out so they can be swapped in.
gb_params = {
    "n_estimators": 300,
    "learning_rate": 0.05,
    "max_depth": 4,
    "subsample": 0.9,
    "min_samples_split": 5,
    "min_samples_leaf": 2,
    "random_state": 42,
}
model = GradientBoostingClassifier(**gb_params)

# from sklearn.ensemble import AdaBoostClassifier
# model = AdaBoostClassifier(
#     n_estimators=300,
#     learning_rate=0.05,
#     random_state=42
# )

# from sklearn.ensemble import ExtraTreesClassifier
# model = ExtraTreesClassifier(
#     n_estimators=500,
#     max_depth=None,
#     min_samples_split=5,
#     min_samples_leaf=2,
#     max_features='sqrt',
#     random_state=42,
#     n_jobs=-1
# )

# from xgboost import XGBClassifier
# model = XGBClassifier(
#     n_estimators=600,
#     learning_rate=0.05,
#     max_depth=6,
#     subsample=0.9,
#     colsample_bytree=0.9,
#     reg_lambda=1,
#     objective='multi:softprob',
#     eval_metric='mlogloss',
#     tree_method='hist',
#     random_state=42,
#     n_jobs=-1
# )

# from lightgbm import LGBMClassifier
# model = LGBMClassifier(
#     n_estimators=600,
#     learning_rate=0.05,
#     max_depth=-1,
#     num_leaves=63,
#     subsample=0.9,
#     colsample_bytree=0.9,
#     random_state=42,
#     n_jobs=-1
# )

# from catboost import CatBoostClassifier
# model = CatBoostClassifier(
#     iterations=600,
#     learning_rate=0.05,
#     depth=6,
#     loss_function='MultiClass',
#     random_seed=42,
#     verbose=False
# )


In [None]:
# Send numeric and categorical columns through their respective sub-pipelines.
preprocessing = ColumnTransformer([
    ('num', numerical_pipeline, numeric_features),
    ('cat', categorical_pipeline, categorical_features),
])

In [None]:
# End-to-end estimator: preprocessing followed by the selected model.
pipeline = Pipeline([
    ('preprocessor', preprocessing),
    ('model', model),
])

In [None]:
from sklearn.preprocessing import LabelEncoder

# Learn the label -> integer mapping from the training labels only, then apply
# that same mapping to both splits.
le = LabelEncoder().fit(y_train)
y_train_enc = le.transform(y_train)
y_test_enc = le.transform(y_test)

In [None]:
# Fit the full pipeline on the training split. This step was missing: calling
# predict on an unfitted pipeline raises NotFittedError.
pipeline.fit(X_train, y_train_enc)

# Predict class labels
y_pred = pipeline.predict(X_test)
# Predict class probabilities (needed for log-loss, AUC, calibration)
y_pred_proba = pipeline.predict_proba(X_test)

In [None]:
# Hold-out metrics. Accuracy and log loss are computed directly; precision,
# recall and F1 use weighted averaging to account for class imbalance.
acc = accuracy_score(y_test_enc, y_pred)
ll = log_loss(y_test_enc, y_pred_proba)
prec, rec, f1 = (
    scorer(y_test_enc, y_pred, average='weighted')
    for scorer in (precision_score, recall_score, f1_score)
)

for label, value in (
    ("Accuracy :", acc),
    ("Log Loss :", ll),
    ("Precision:", prec),
    ("Recall   :", rec),
    ("F1 Score :", f1),
):
    print(label, value)


In [None]:
# Encoded class predictions and per-class probabilities for the test frame
# (the frame loaded from test.csv with its 'id' column removed).
final_preds=pipeline.predict(test)
final_probs=pipeline.predict_proba(test)

In [None]:
# ==========================================
# STEP 10: FINAL DATA PREPARATION
# (NO ROW DROPPED, NO NaNs INTRODUCED)
# ==========================================
print("\nStep 10: Preparing Final Submission DataFrames...")
# Original class labels in encoder order
class_names = le.classes_
# Map the integer predictions back to the original labels
decoded_labels = le.inverse_transform(final_preds)
# Largest class probability in each row
highest_probs = final_probs.max(axis=1)


In [None]:
# ==========================================
# SUBMISSION 1: ID + PREDICTED CLASS
# ==========================================
# NOTE(review): the label column is hard-coded as 'fruit_name' while the rest
# of this workflow uses the placeholder 'target column' - confirm which
# header the competition expects.
submission1_df = pd.DataFrame({
    'id': test_id,
    'fruit_name': decoded_labels
})

In [None]:
# ==========================================
# SUBMISSION 2: ID + ALL CLASS PROBABILITIES (log-loss style)
# ==========================================
# One "Status_<class>" column per class, in encoder order, preceded by the id.
status_headers = [f"Status_{cls}" for cls in class_names]
prob_cols = dict(zip(status_headers, final_probs.T))

submission2_df = pd.DataFrame(prob_cols)
submission2_df.insert(0, 'id', test_id)


In [None]:
# ==========================================
# SUBMISSION 3: ID + CLASS + CONFIDENCE random sa kuch to hai
# ==========================================
submission3_df = pd.DataFrame({
    'id': test_id,
    'Predicted_Class': decoded_labels,
    'Confidence_Score': highest_probs
})


In [None]:
# ==========================================
# STEP 11: EXPORT FILES
# 1 - predicted class per id
# 2 - per-class probabilities (log-loss style)
# 3 - predicted class with its confidence score
# ==========================================

exports = (
    (submission1_df, "submission1.csv"),
    (submission2_df, "submission2.csv"),
    (submission3_df, "submission3.csv"),
)
for frame, filename in exports:
    frame.to_csv(filename, index=False)
print("All submissions are generated")


In [None]:
# Q1. Amazon uses a deep learning model for product recommendations. Performance improves as more data is added. Why is deep learning suitable here?
# Answer: Deep learning can automatically learn complex patterns from large data

# Q2. A deep neural network trains well but performs poorly on unseen data. What is the MOST likely issue?
# Answer: Overfitting due to high model complexity

# Q3. Why are ReLU activations commonly used in deep networks?
# Answer: They reduce vanishing gradient problems

# Q4. Amazon uses reinforcement learning to optimise warehouse robot paths. What represents the reward?
# Answer: Time or energy saved after reaching the destination efficiently

# Q5. In reinforcement learning, why is exploration important?
# Answer: To discover better actions that may give higher long-term rewards

# Q6. An RL agent always chooses the same action even when it’s not optimal. What is the MOST likely cause?
# Answer: Insufficient exploration (agent stuck exploiting)

# Q7. Amazon uses computer vision to detect damaged packages from images. Which deep learning model is MOST suitable?
# Answer: Convolutional Neural Network (CNN)

# Q8. Why are convolution layers effective for image processing?
# Answer: They detect local patterns like edges and textures

# Q9. A CV model performs well on training images but fails on real warehouse images with different lighting. What is the MAIN issue?
# Answer: Overfitting to training conditions (poor generalization)

# Q10. Amazon forecasts daily product demand using historical sales data. Why are holidays and promotions important features?
# Answer: They explain sudden demand spikes

# Q11. Demand forecasts work well normally but fail during major sales events like Prime Day. Why?
# Answer: Rare events not well represented in training data

# Q12. Why does increasing K generally reduce variance but increase bias?
# Answer: Predictions rely on broader neighborhood averaging

# Q13. Why is forecasting demand for new products difficult?
# Answer: No historical sales data (cold start problem)

# Q14. Amazon wants to reduce stockouts. Which forecasting error is more dangerous?
# Answer: Large under-forecast causing lost sales

# Q15. Why are deep learning models sometimes avoided for simple forecasting tasks?
# Answer: They may be unnecessarily complex for simple patterns

# Q16. Adding a new feature reduces training error but increases test error. What does this indicate?
# Answer: Overfitting due to increased variance

# Q17. Which situation can break a supervised model in production even if accuracy was high during testing?
# Answer: Concept drift over time

# Q18. Which combination BEST matches the task?
# Answer: CNN for package damage detection, reinforcement learning for warehouse optimization, and deep learning for demand patterns

# Q19. A demand model shows excellent accuracy but is rejected by operations. Why?
# Answer: Predictions violate supply chain constraints (storage, delivery limits)

# Q20. After adding more training data, a deep demand forecasting model becomes unstable in production. What is the MOST likely deep-learning–specific root cause?
# Answer: Batch normalization statistics differ between training and inference due to non-stationary data