# Machine Learning!

## Access popular AI models via Google-Colab-AI Without an API Key
All users have access to most popular LLMs via the `google-colab-ai` Python library, and paid users have access to a wider selection of models. For more details, refer to the [getting started with google colab ai](https://colab.research.google.com/github/googlecolab/colabtools/blob/main/notebooks/Getting_started_with_google_colab_ai.ipynb).



# **Machine Learning**

In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# ML Models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

from xgboost import XGBClassifier

In [1]:
# 1. Import required libraries
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report



In [2]:
# ============================================
# 1. Import Libraries
# ============================================

import numpy as np
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

from tensorflow.keras.datasets import mnist


# ============================================
# 2. Load MNIST Dataset
# ============================================

(X_train, y_train), (X_test, y_test) = mnist.load_data()


# ============================================
# 3. Preprocess Data
# ============================================

# Normalize pixel values (0-255 → 0-1)
X_train = X_train / 255.0
X_test = X_test / 255.0

# Flatten images (28x28 → 784)
X_train = X_train.reshape(X_train.shape[0], -1)
X_test = X_test.reshape(X_test.shape[0], -1)


# ============================================
# 4. Create XGBoost Model
# ============================================

model = XGBClassifier(
    n_estimators=200,
    max_depth=6,
    learning_rate=0.1,
    eval_metric='mlogloss',
    use_label_encoder=False
)


# ============================================
# 5. Train Model
# ============================================

model.fit(X_train, y_train)


# ============================================
# 6. Predict
# ============================================

y_pred = model.predict(X_test)


# ============================================
# 7. Evaluate
# ============================================

accuracy = accuracy_score(y_test, y_pred)
print("Test Accuracy:", accuracy)


Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz
[1m11490434/11490434[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Test Accuracy: 0.9776


In [None]:
# 2. Load dataset
data = pd.read_csv("data.csv")

# 3. Separate features (X) and target (y)
X = data.drop("target", axis=1)
y = data["target"]



In [None]:
# 4. Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42
)



In [None]:
# 5. Create XGBoost model
model = XGBClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=4,
    random_state=42
)



In [None]:
# 6. Train the model
model.fit(X_train, y_train)



In [None]:
# 7. Make predictions
y_pred = model.predict(X_test)


In [None]:
# 8. Evaluate model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# **Complete Machine Learning Pipe line**

In [None]:
# =====================================================
# COMPLETE MACHINE LEARNING BENCHMARK PIPELINE
# =====================================================

# -------------------------------
# 1. Import Libraries
# -------------------------------

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report

# ML Models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

from xgboost import XGBClassifier





In [None]:
# -------------------------------
# 2. Load Dataset
# -------------------------------

data = pd.read_csv("data.csv")

# Separate features and target
X = data.drop("target", axis=1)
y = data["target"]


# -------------------------------
# 3. Train/Test Split
# -------------------------------

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y   # important for classification
)



In [None]:

# -------------------------------
# 4. Feature Scaling
# -------------------------------

scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)




In [None]:
# -------------------------------
# 5. Define Models
# -------------------------------

models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "Support Vector Machine": SVC(),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Naive Bayes": GaussianNB(),
    "XGBoost": XGBClassifier(eval_metric='logloss', use_label_encoder=False)
}



In [None]:

# -------------------------------
# 6. Train, Cross-Validate & Evaluate
# -------------------------------

results = {}

print("\n================ MODEL PERFORMANCE ================\n")

for name, model in models.items():

    # Use scaled data for models sensitive to scale
    if name in ["Logistic Regression", "Support Vector Machine", "K-Nearest Neighbors"]:
        model.fit(X_train_scaled, y_train)
        y_pred = model.predict(X_test_scaled)
        cv_scores = cross_val_score(model, X_train_scaled, y_train, cv=5)
    else:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        cv_scores = cross_val_score(model, X_train, y_train, cv=5)

    accuracy = accuracy_score(y_test, y_pred)

    results[name] = {
        "Test Accuracy": accuracy,
        "CV Mean Accuracy": cv_scores.mean()
    }

    print(f"{name}")
    print(f"Test Accuracy: {accuracy:.4f}")
    print(f"Cross-Val Accuracy: {cv_scores.mean():.4f}")
    print("-" * 50)




In [None]:
# -------------------------------
# 7. Select Best Model
# -------------------------------

best_model_name = max(results, key=lambda x: results[x]["Test Accuracy"])
print(f"\nBest Model Based on Test Accuracy: {best_model_name}")




In [None]:
# -------------------------------
# 8. Train Best Model Again
# -------------------------------

best_model = models[best_model_name]

if best_model_name in ["Logistic Regression", "Support Vector Machine", "K-Nearest Neighbors"]:
    best_model.fit(X_train_scaled, y_train)
    final_pred = best_model.predict(X_test_scaled)
else:
    best_model.fit(X_train, y_train)
    final_pred = best_model.predict(X_test)

print("\nFinal Classification Report:\n")
print(classification_report(y_test, final_pred))




In [None]:
# -------------------------------
# 9. Feature Importance (Tree Models Only)
# -------------------------------

if best_model_name in ["Decision Tree", "Random Forest", "Gradient Boosting", "XGBoost"]:

    importances = best_model.feature_importances_
    feature_names = X.columns

    importance_df = pd.DataFrame({
        "Feature": feature_names,
        "Importance": importances
    }).sort_values(by="Importance", ascending=False)

    print("\nFeature Importance:\n")
    print(importance_df)

    # Plot
    plt.figure()
    plt.barh(importance_df["Feature"], importance_df["Importance"])
    plt.xlabel("Importance")
    plt.ylabel("Feature")
    plt.title("Feature Importance")
    plt.gca().invert_yaxis()
    plt.show()