In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the input directory.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# ===============================
# MEGA TEMPLATE: BINARY CLASSIFICATION MODELS (KAGGLE / EXAM READY)
# Covers:
# Logistic Regression
# Decision Tree
# Random Forest
# Gradient Boosting
# XGBoost
# Support Vector Machine (SVM)
# K-Nearest Neighbors (KNN)
# Naive Bayes (GaussianNB)
# ===============================

# -------------------------------
# 1. IMPORT LIBRARIES
# -------------------------------
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

# Optional (Kaggle favourite)
from xgboost import XGBClassifier

# -------------------------------
# 2. LOAD DATA
# -------------------------------
# Example: df = pd.read_csv("train.csv")

# Assume:
# - target column name = 'target'
# - binary labels: 0 and 1

# Separate the predictors from the label column.
X = df.drop('target', axis=1)
y = df['target']

# Encode a string-valued target (e.g. 'yes'/'no') as integers. The models
# below assume numeric labels, and recent XGBoost releases reject
# non-numeric class labels outright. Numeric targets are left untouched.
if y.dtype == 'object':
    y = LabelEncoder().fit_transform(y)

# -------------------------------
# 3. HANDLE CATEGORICAL FEATURES
# -------------------------------
# Label Encoding (simple & exam-friendly): every object-dtype feature column
# is replaced by integer codes, one independent encoder per column.
for col in X.select_dtypes(include='object').columns:
    X[col] = LabelEncoder().fit_transform(X[col])

# -------------------------------
# 4. TRAIN-TEST SPLIT
# -------------------------------
# Hold out 20% of the rows for evaluation. Stratifying on y keeps the class
# ratio equal in both partitions; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# -------------------------------
# 5. FEATURE SCALING (Required for some models)
# -------------------------------
# The scaler learns its mean/variance from the training rows only and the
# same transformation is then applied to both partitions.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)






# =====================================================
# MODEL 1: LOGISTIC REGRESSION
# =====================================================
# L2-penalised logistic regression fitted on the standardised features.
log_reg = LogisticRegression(penalty='l2', class_weight='balanced', max_iter=1000)
log_reg.fit(X_train_scaled, y_train)

# Hard labels feed accuracy and the report; column 1 of predict_proba
# (probability of the second class) feeds ROC-AUC.
y_pred_lr = log_reg.predict(X_test_scaled)
y_prob_lr = log_reg.predict_proba(X_test_scaled)[:, 1]

lr_accuracy = accuracy_score(y_test, y_pred_lr)
lr_roc_auc = roc_auc_score(y_test, y_prob_lr)

print("Logistic Regression")
print("Accuracy:", lr_accuracy)
print("ROC-AUC:", lr_roc_auc)
print(classification_report(y_test, y_pred_lr))





# =====================================================
# MODEL 2: DECISION TREE
# =====================================================
# Depth-limited decision tree trained on the unscaled features.
# random_state is pinned because scikit-learn randomly permutes the candidate
# features at each split, so ties between equally good splits would otherwise
# be broken differently from run to run.
dt = DecisionTreeClassifier(
    criterion='gini',
    max_depth=6,
    min_samples_split=10,
    random_state=42
)
dt.fit(X_train, y_train)

y_pred_dt = dt.predict(X_test)

print("Decision Tree")
print("Accuracy:", accuracy_score(y_test, y_pred_dt))
print(classification_report(y_test, y_pred_dt))






# =====================================================
# MODEL 3: RANDOM FOREST
# =====================================================
# 300-tree random forest on the unscaled features, using every available
# core (n_jobs=-1) and a fixed seed.
rf = RandomForestClassifier(n_estimators=300, max_depth=12, random_state=42, n_jobs=-1)
rf.fit(X_train, y_train)

y_pred_rf = rf.predict(X_test)
rf_test_proba = rf.predict_proba(X_test)
y_prob_rf = rf_test_proba[:, 1]  # probability of the second class

rf_accuracy = accuracy_score(y_test, y_pred_rf)
rf_roc_auc = roc_auc_score(y_test, y_prob_rf)

print("Random Forest")
print("Accuracy:", rf_accuracy)
print("ROC-AUC:", rf_roc_auc)






# =====================================================
# MODEL 4: GRADIENT BOOSTING
# =====================================================
# Gradient-boosted shallow trees on the unscaled features.
# random_state is fixed, like the split and the other seeded models in this
# template, so repeated runs report the same scores.
gb = GradientBoostingClassifier(
    n_estimators=200,
    learning_rate=0.05,
    max_depth=3,
    random_state=42
)
gb.fit(X_train, y_train)

y_pred_gb = gb.predict(X_test)
y_prob_gb = gb.predict_proba(X_test)[:, 1]  # probability of the second class

print("Gradient Boosting")
print("Accuracy:", accuracy_score(y_test, y_pred_gb))
print("ROC-AUC:", roc_auc_score(y_test, y_prob_gb))






# =====================================================
# MODEL 5: XGBOOST (STATE-OF-THE-ART)
# =====================================================
# XGBoost hyper-parameters gathered in one place: 500 boosting rounds at a
# small learning rate, with 80% row and column subsampling per tree.
xgb_params = {
    'n_estimators': 500,
    'max_depth': 6,
    'learning_rate': 0.05,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'eval_metric': 'logloss',
    'random_state': 42,
}
xgb = XGBClassifier(**xgb_params)
xgb.fit(X_train, y_train)

y_pred_xgb = xgb.predict(X_test)
y_prob_xgb = xgb.predict_proba(X_test)[:, 1]  # probability of the second class

xgb_accuracy = accuracy_score(y_test, y_pred_xgb)
xgb_roc_auc = roc_auc_score(y_test, y_prob_xgb)

print("XGBoost")
print("Accuracy:", xgb_accuracy)
print("ROC-AUC:", xgb_roc_auc)







# =====================================================
# MODEL 6: SUPPORT VECTOR MACHINE (SVM)
# =====================================================
# RBF-kernel SVM on the standardised features.
# probability=True makes scikit-learn fit an extra internal cross-validated
# calibration whose data shuffling is random; random_state is fixed so the
# probabilities (and therefore ROC-AUC) are the same on every run.
svm = SVC(
    kernel='rbf',
    C=1.0,
    gamma='scale',
    probability=True,
    random_state=42
)
svm.fit(X_train_scaled, y_train)

y_pred_svm = svm.predict(X_test_scaled)
y_prob_svm = svm.predict_proba(X_test_scaled)[:, 1]  # probability of the second class

print("SVM")
print("Accuracy:", accuracy_score(y_test, y_pred_svm))
print("ROC-AUC:", roc_auc_score(y_test, y_prob_svm))






# =====================================================
# MODEL 7: K-NEAREST NEIGHBORS
# =====================================================
# 7-nearest-neighbour vote on the standardised features, with closer
# neighbours weighted more heavily (weights='distance').
knn = KNeighborsClassifier(weights='distance', n_neighbors=7)
knn.fit(X_train_scaled, y_train)
y_pred_knn = knn.predict(X_test_scaled)

knn_accuracy = accuracy_score(y_test, y_pred_knn)
print("KNN")
print("Accuracy:", knn_accuracy)


# =====================================================
# MODEL 8: NAIVE BAYES
# =====================================================
# Gaussian Naive Bayes with default settings on the standardised features.
nb = GaussianNB()
nb.fit(X_train_scaled, y_train)
y_pred_nb = nb.predict(X_test_scaled)

nb_accuracy = accuracy_score(y_test, y_pred_nb)
print("Naive Bayes")
print("Accuracy:", nb_accuracy)
print(classification_report(y_test, y_pred_nb))

# ===============================
# END OF MEGA BINARY CLASSIFICATION TEMPLATE
# ===============================


In [None]:
# ===============================
# MEGA TEMPLATE: MULTICLASS CLASSIFICATION MODELS (KAGGLE / EXAM READY)
# Covers:
# Logistic Regression (Softmax)
# Decision Tree
# Random Forest
# Gradient Boosting
# XGBoost (Multiclass)
# Support Vector Machine (OvR / OvO)
# K-Nearest Neighbors (KNN)
# Naive Bayes (GaussianNB)
# ===============================

# -------------------------------
# 1. IMPORT LIBRARIES
# -------------------------------
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, classification_report

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

# Optional (Kaggle favourite)
from xgboost import XGBClassifier

# -------------------------------
# 2. LOAD DATA
# -------------------------------
# Example:
# df = pd.read_csv("train.csv")

# Assume:
# - target column name = 'target'
# - target has more than 2 classes

# Label vector and feature matrix (everything except the 'target' column).
y = df['target']
X = df.drop(columns='target')

# -------------------------------
# 3. HANDLE CATEGORICAL FEATURES
# -------------------------------
# Each object-dtype feature gets its own encoder and is replaced by
# integer codes.
object_features = X.select_dtypes(include=['object']).columns
for feature in object_features:
    feature_encoder = LabelEncoder()
    X[feature] = feature_encoder.fit_transform(X[feature])

# A string-valued target is converted to integer class ids as well.
target_is_text = y.dtype == 'object'
if target_is_text:
    y = LabelEncoder().fit_transform(y)

# -------------------------------
# 4. TRAIN-TEST SPLIT
# -------------------------------
# Stratified 80/20 split with a fixed seed so class proportions and results
# are repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

# -------------------------------
# 5. FEATURE SCALING (Required for some models)
# -------------------------------
# Fit the scaler on the training partition only, then apply it to both.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# =====================================================
# MODEL 1: LOGISTIC REGRESSION (SOFTMAX)
# =====================================================
# Multinomial (softmax) logistic regression on the standardised features.
# The explicit multi_class='multinomial' argument was dropped: it is
# deprecated since scikit-learn 1.5, and the lbfgs solver already fits a
# multinomial model whenever the target has more than two classes.
log_reg = LogisticRegression(
    solver='lbfgs',
    max_iter=1000
)
log_reg.fit(X_train_scaled, y_train)

y_pred_lr = log_reg.predict(X_test_scaled)

print("Logistic Regression (Softmax)")
print("Accuracy:", accuracy_score(y_test, y_pred_lr))
print(classification_report(y_test, y_pred_lr))

# =====================================================
# MODEL 2: DECISION TREE
# =====================================================
# Depth-limited decision tree on the unscaled features.
# random_state is pinned: scikit-learn randomly permutes candidate features
# at each split, so unseeded trees can differ between runs.
dt = DecisionTreeClassifier(
    criterion='gini',
    max_depth=8,
    min_samples_split=10,
    random_state=42
)
dt.fit(X_train, y_train)

y_pred_dt = dt.predict(X_test)

print("Decision Tree")
print("Accuracy:", accuracy_score(y_test, y_pred_dt))
print(classification_report(y_test, y_pred_dt))

# =====================================================
# MODEL 3: RANDOM FOREST
# =====================================================
# 300-tree forest using all cores (n_jobs=-1) and a fixed seed.
rf = RandomForestClassifier(
    n_estimators=300,
    max_depth=15,
    n_jobs=-1,
    random_state=42
)
rf.fit(X_train, y_train)

y_pred_rf = rf.predict(X_test)

print("Random Forest")
print("Accuracy:", accuracy_score(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf))

# =====================================================
# MODEL 4: GRADIENT BOOSTING
# =====================================================
# Boosted shallow trees; seeded like the forest above so that repeated runs
# report identical scores.
gb = GradientBoostingClassifier(
    n_estimators=200,
    learning_rate=0.05,
    max_depth=3,
    random_state=42
)
gb.fit(X_train, y_train)

y_pred_gb = gb.predict(X_test)

print("Gradient Boosting")
print("Accuracy:", accuracy_score(y_test, y_pred_gb))
print(classification_report(y_test, y_pred_gb))

# =====================================================
# MODEL 5: XGBOOST (MULTICLASS)
# =====================================================
# Number of distinct labels present in the training partition.
num_classes = np.unique(y_train).size

# Softmax-probability objective with the class count passed explicitly;
# the remaining settings match the binary template.
xgb_params = {
    'objective': 'multi:softprob',
    'num_class': num_classes,
    'n_estimators': 500,
    'max_depth': 6,
    'learning_rate': 0.05,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'eval_metric': 'mlogloss',
    'random_state': 42,
}
xgb = XGBClassifier(**xgb_params)
xgb.fit(X_train, y_train)

y_pred_xgb = xgb.predict(X_test)
xgb_accuracy = accuracy_score(y_test, y_pred_xgb)

print("XGBoost (Multiclass)")
print("Accuracy:", xgb_accuracy)
print(classification_report(y_test, y_pred_xgb))

# =====================================================
# MODEL 6: SUPPORT VECTOR MACHINE (OvO)
# =====================================================
# RBF-kernel SVM on the standardised features.
# NOTE: scikit-learn's SVC always trains multiclass problems one-vs-one
# internally; decision_function_shape='ovr' (the default) only reshapes the
# decision function. The heading and printed label therefore say
# One-vs-One. For a genuine one-vs-rest SVM, wrap the estimator in
# sklearn.multiclass.OneVsRestClassifier.
svm = SVC(
    kernel='rbf',
    C=1.0,
    gamma='scale'
)
svm.fit(X_train_scaled, y_train)

y_pred_svm = svm.predict(X_test_scaled)

print("SVM (One-vs-One)")
print("Accuracy:", accuracy_score(y_test, y_pred_svm))
print(classification_report(y_test, y_pred_svm))

# =====================================================
# MODEL 7: K-NEAREST NEIGHBORS
# =====================================================
# Distance-weighted 7-nearest-neighbour classifier on the standardised
# features.
knn = KNeighborsClassifier(weights='distance', n_neighbors=7)
knn.fit(X_train_scaled, y_train)
y_pred_knn = knn.predict(X_test_scaled)

knn_accuracy = accuracy_score(y_test, y_pred_knn)
print("KNN")
print("Accuracy:", knn_accuracy)
print(classification_report(y_test, y_pred_knn))

# =====================================================
# MODEL 8: NAIVE BAYES
# =====================================================
# Gaussian Naive Bayes with default settings on the standardised features.
nb = GaussianNB()
nb.fit(X_train_scaled, y_train)
y_pred_nb = nb.predict(X_test_scaled)

nb_accuracy = accuracy_score(y_test, y_pred_nb)
print("Naive Bayes")
print("Accuracy:", nb_accuracy)
print(classification_report(y_test, y_pred_nb))

# ===============================
# END OF MEGA MULTICLASS CLASSIFICATION TEMPLATE
# ===============================


In [None]:
# ============================================
# MEGA TEMPLATE: OUTLIER ANALYSIS (EDA + HANDLING)
# Kaggle / Exam / Notebook Beautiful Plots
# ============================================

# -------------------------------
# 1. IMPORT LIBRARIES
# -------------------------------
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from scipy import stats

# -------------------------------
# 2. LOAD DATA
# -------------------------------
# Example:
# df = pd.read_csv('train.csv')

# Select only numerical columns for outlier analysis
# Columns stored as int64 / float64 are the ones analysed for outliers.
numeric_frame = df.select_dtypes(include=['int64', 'float64'])
num_cols = numeric_frame.columns

# -------------------------------
# 3. BASIC STATISTICAL SUMMARY
# -------------------------------
print("Statistical Summary:\n")
print(numeric_frame.describe())

# -------------------------------
# 4. UNIVARIATE OUTLIER VISUALIZATION
# -------------------------------
# 4.1 Boxplots (Most important): all numeric columns side by side, with the
# column labels rotated so they stay readable.
fig, ax = plt.subplots(figsize=(15, 6))
numeric_frame.boxplot(rot=90, ax=ax)
ax.set_title('Boxplot of Numerical Features')
fig.tight_layout()
plt.show()

# 4.2 Histograms + KDE: one figure per numeric column.
for col in num_cols:
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.histplot(df[col], kde=True, ax=ax)
    ax.set_title(f'Distribution of {col}')
    plt.show()

# -------------------------------
# 5. IQR METHOD (CLASSIC EXAM METHOD)
# -------------------------------
# Count, per numeric column, the rows lying outside the Tukey fences
# [Q1 - 1.5*IQR, Q3 + 1.5*IQR].
outlier_summary = {}
for col in num_cols:
    values = df[col]
    q1, q3 = values.quantile([0.25, 0.75])
    spread = q3 - q1
    below = values < q1 - 1.5 * spread
    above = values > q3 + 1.5 * spread
    outlier_summary[col] = int((below | above).sum())

print("Outlier count using IQR method:\n")
for name, n_outliers in outlier_summary.items():
    print(f"{name}: {n_outliers}")

# -------------------------------
# 6. Z-SCORE METHOD
# -------------------------------
# Count, per numeric column, the non-missing values whose absolute z-score
# exceeds 3.
zscore_outliers = {
    col: np.sum(np.abs(stats.zscore(df[col].dropna())) > 3)
    for col in num_cols
}

print("\nOutlier count using Z-score method:\n")
for name, n_outliers in zscore_outliers.items():
    print(f"{name}: {n_outliers}")

# -------------------------------
# 7. MULTIVARIATE OUTLIERS (SCATTER)
# -------------------------------
# Example: the first two numerical columns plotted against each other.
if len(num_cols) >= 2:
    x_name, y_name = num_cols[0], num_cols[1]
    plt.figure(figsize=(6, 5))
    sns.scatterplot(x=df[x_name], y=df[y_name])
    plt.title('Scatter Plot for Multivariate Outliers')
    plt.show()

# -------------------------------
# 8. OUTLIER HANDLING TECHNIQUES
# -------------------------------

# 8.1 REMOVE OUTLIERS USING IQR

def remove_outliers_iqr(data, col):
    """Return the rows of ``data`` whose ``col`` value lies inside the IQR fences.

    The fences are ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR`` (both inclusive),
    computed from ``data[col]`` itself. Rows where ``col`` is missing fail the
    comparison and are therefore dropped as well. ``data`` is not modified.

    Args:
        data: DataFrame to filter.
        col: Name of the numeric column used to decide which rows to keep.

    Returns:
        A new DataFrame containing only the rows within the fences.
    """
    values = data[col]
    q1, q3 = values.quantile(0.25), values.quantile(0.75)
    margin = 1.5 * (q3 - q1)
    within_fences = values.between(q1 - margin, q3 + margin)
    return data[within_fences]

# Apply iteratively (use carefully!)
# Filter column by column on a copy of df (use carefully!): each pass
# recomputes the fences on the rows that survived the previous passes, so
# the number of rows removed grows with the number of columns.
df_iqr_cleaned = df.copy()
for numeric_col in num_cols:
    df_iqr_cleaned = remove_outliers_iqr(data=df_iqr_cleaned, col=numeric_col)

shape_before, shape_after = df.shape, df_iqr_cleaned.shape
print("Shape before IQR cleaning:", shape_before)
print("Shape after IQR cleaning:", shape_after)

# -------------------------------
# 8.2 CAPPING (WINSORIZATION)
# -------------------------------

def cap_outliers(data, col):
    """Clamp ``data[col]`` to its IQR fences in place and return ``data``.

    Values below ``Q1 - 1.5 * IQR`` are raised to that fence and values above
    ``Q3 + 1.5 * IQR`` are lowered to that fence; everything else (including
    missing values) is left as is. The column is overwritten on the frame
    that was passed in, so pass a copy to keep the original intact.

    Args:
        data: DataFrame whose column is capped (mutated).
        col: Name of the numeric column to cap.

    Returns:
        The same ``data`` object, after modification.
    """
    values = data[col]
    q1, q3 = values.quantile(0.25), values.quantile(0.75)
    margin = 1.5 * (q3 - q1)
    floor, ceiling = q1 - margin, q3 + margin
    raised = np.where(values < floor, floor, values)
    data[col] = np.where(raised > ceiling, ceiling, raised)
    return data

# Apply capping
# Cap every numeric column on a copy so that df itself keeps its raw values.
df_capped = df.copy()
for numeric_col in num_cols:
    df_capped = cap_outliers(data=df_capped, col=numeric_col)

# -------------------------------
# 8.3 LOG TRANSFORMATION (FOR SKEWED DATA)
# -------------------------------
# Adds a '<col>_log' column to df for each numeric column whose values are
# all strictly positive; other columns are skipped.
for col in num_cols:
    column = df[col]
    if not (column > 0).all():
        continue
    df[f'{col}_log'] = np.log1p(column)

# -------------------------------
# 9. MODEL-SAFE APPROACH (TREE MODELS)
# -------------------------------
# NOTE:
# - Tree-based models (Decision Tree, Random Forest, XGBoost)
#   are robust to outliers.
# - Scaling + outlier removal is CRITICAL for:
#   Logistic Regression, SVM, KNN

# -------------------------------
# 10. FINAL CHECK (AFTER CLEANING)
# -------------------------------
# Redraw the boxplot on the capped copy to confirm the effect of the treatment.
fig, ax = plt.subplots(figsize=(15, 6))
df_capped[num_cols].boxplot(rot=90, ax=ax)
ax.set_title('Boxplot After Outlier Treatment (Capped)')
fig.tight_layout()
plt.show()

# ============================================
# END OF OUTLIER ANALYSIS MEGA TEMPLATE
# ============================================


In [None]:
# =====================================================
# MEGA TEMPLATE: EDA PLOTTING (ALL IMPORTANT GRAPHS)
# Kaggle | Exam | Notebook-Beautiful Visuals
# =====================================================

# -------------------------------
# 1. IMPORT LIBRARIES
# -------------------------------
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

# -------------------------------
# 2. LOAD DATA
# -------------------------------
# Example:
# df = pd.read_csv('train.csv')

# Separate numerical & categorical columns
# int64/float64 columns are treated as numerical, object/category columns as
# categorical.
numeric_frame = df.select_dtypes(include=['int64', 'float64'])
num_cols = numeric_frame.columns
cat_cols = df.select_dtypes(include=['object', 'category']).columns

# -------------------------------
# 3. UNIVARIATE PLOTS
# -------------------------------

# 3.1 Histogram: one 30-bin figure per numeric column.
for col in num_cols:
    fig, ax = plt.subplots(figsize=(6, 4))
    ax.hist(df[col], bins=30)
    ax.set_title(f'Histogram of {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')
    plt.show()

# 3.2 KDE Plot: filled density estimate per numeric column.
for col in num_cols:
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.kdeplot(df[col], fill=True, ax=ax)
    ax.set_title(f'KDE Plot of {col}')
    plt.show()

# 3.3 Box Plot (Outliers): all numeric columns in a single figure.
fig, ax = plt.subplots(figsize=(14, 6))
numeric_frame.boxplot(rot=90, ax=ax)
ax.set_title('Boxplot of Numerical Features')
plt.show()

# -------------------------------
# 4. CATEGORICAL PLOTS
# -------------------------------

# 4.1 Count Plot
# 4.1 Count Plot: frequency of each category, one figure per column.
for col in cat_cols:
    plt.figure(figsize=(6, 4))
    ax = sns.countplot(x=df[col])
    ax.set_title(f'Count Plot of {col}')
    plt.xticks(rotation=45)
    plt.show()

# 4.2 Bar Plot (Mean of target vs category)
# Example assumes target column exists
# Replace 'target' with your label

# When there is no 'target' column the loop below has nothing to iterate.
target_cat_cols = cat_cols if 'target' in df.columns else []
for col in target_cat_cols:
    plt.figure(figsize=(6, 4))
    ax = sns.barplot(x=df[col], y=df['target'])
    ax.set_title(f'{col} vs Target')
    plt.xticks(rotation=45)
    plt.show()

# -------------------------------
# 5. BIVARIATE PLOTS
# -------------------------------

# 5.1 Scatter Plot
# 5.1 Scatter Plot of the first two numeric columns (skipped when fewer
# than two exist).
has_numeric_pair = len(num_cols) >= 2
if has_numeric_pair:
    x_name, y_name = num_cols[0], num_cols[1]
    fig, ax = plt.subplots(figsize=(6, 5))
    ax.scatter(df[x_name], df[y_name])
    ax.set_xlabel(x_name)
    ax.set_ylabel(y_name)
    ax.set_title('Scatter Plot')
    plt.show()

# 5.2 Scatter with Hue (Target): the same pair, coloured by the target.
if 'target' in df.columns and has_numeric_pair:
    x_name, y_name = num_cols[0], num_cols[1]
    plt.figure(figsize=(6, 5))
    ax = sns.scatterplot(x=df[x_name], y=df[y_name], hue=df['target'])
    ax.set_title('Scatter Plot with Target Hue')
    plt.show()

# 5.3 Line Plot (Time Series)
# Use when one column represents time/index

# df.sort_values('date_column', inplace=True)
# plt.plot(df['date_column'], df['value_column'])
# plt.title('Line Plot (Time Series)')
# plt.show()

# -------------------------------
# 6. MULTIVARIATE PLOTS
# -------------------------------

# 6.1 Pair Plot
# 6.1 Pair Plot: pairwise scatter grid over all numeric columns.
numeric_view = df[num_cols]
sns.pairplot(numeric_view)
plt.show()

# 6.2 Heatmap (Correlation Matrix), annotated with the coefficient values.
corr = numeric_view.corr()
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr, annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Correlation Heatmap')
plt.show()

# -------------------------------
# 7. ADVANCED / SPECIAL PLOTS
# -------------------------------

# 7.1 Violin Plot
# 7.1 Violin Plot and 7.2 Strip Plot: distribution of the first numeric
# column per target value. Both need a 'target' column and at least one
# numeric column, so they share a single guard.
if 'target' in df.columns and len(num_cols) > 0:
    first_numeric = df[num_cols[0]]
    target_plots = (
        (sns.violinplot, 'Violin Plot'),
        (sns.stripplot, 'Strip Plot'),
    )
    for draw, title in target_plots:
        plt.figure(figsize=(6, 4))
        draw(x=df['target'], y=first_numeric)
        plt.title(title)
        plt.show()

# 7.3 ECDF Plot: empirical cumulative distribution per numeric column.
for col in num_cols:
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.ecdfplot(df[col], ax=ax)
    ax.set_title(f'ECDF of {col}')
    plt.show()

# -------------------------------
# 8. DIMENSIONALITY REDUCTION VISUALS
# -------------------------------
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

# PCA (2D)
# Both projections need complete rows; drop missing values once and reuse
# the result instead of recomputing it for each method.
numeric_complete = df[num_cols].dropna()
n_samples, n_features = numeric_complete.shape

# PCA (2D)
# Two components require at least two features and two samples; skip with a
# message instead of raising when the data is too small.
if n_samples >= 2 and n_features >= 2:
    pca = PCA(n_components=2)
    X_pca = pca.fit_transform(numeric_complete)

    plt.figure(figsize=(6,5))
    plt.scatter(X_pca[:,0], X_pca[:,1])
    plt.title('PCA Projection (2D)')
    plt.show()
else:
    print("Skipping PCA: need at least 2 numeric columns and 2 complete rows.")

# t-SNE (for visualization)
# scikit-learn requires perplexity < n_samples, so the usual value of 30 is
# lowered automatically on very small datasets.
if n_samples >= 2 and n_features >= 1:
    tsne = TSNE(
        n_components=2,
        perplexity=min(30, n_samples - 1),
        random_state=42
    )
    X_tsne = tsne.fit_transform(numeric_complete)

    plt.figure(figsize=(6,5))
    plt.scatter(X_tsne[:,0], X_tsne[:,1])
    plt.title('t-SNE Visualization')
    plt.show()
else:
    print("Skipping t-SNE: need at least 1 numeric column and 2 complete rows.")

# =====================================================
# END OF EDA PLOTTING MEGA TEMPLATE
# =====================================================
