# Server Crash Prediction - Data Exploration

This notebook explores the cloud workload dataset and trains a baseline machine learning model for server crash prediction, using a high job error rate as the proxy signal for potential server issues.

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

## 1. Data Loading

In [None]:
# Load the cloud workload dataset from the '../data/' directory (relative to this notebook).
# Adjust `file_path` if the CSV was saved under a different name or location.
# NOTE(review): the download hint printed below names the 'Microservices Bottleneck
# Detection Dataset', while the file is called 'cloud_workload_dataset.csv' -- confirm
# that the link points at the dataset this notebook actually expects.
file_path = '../data/cloud_workload_dataset.csv'

try:
    df = pd.read_csv(file_path)
except FileNotFoundError:
    # Give the reader actionable guidance, then re-raise so a "Run All" stops here
    # instead of failing later with a confusing NameError on `df`.
    print(f"Error: {file_path} not found. Please ensure the dataset is downloaded and placed in the 'data/' directory.")
    print("You can download the 'Microservices Bottleneck Detection Dataset' from: https://www.kaggle.com/datasets/gagansomashekar/microservices-bottleneck-detection-dataset")
    raise
except Exception as e:
    # Any other read failure (parse error, permissions, ...) is reported and re-raised.
    print(f"An error occurred during data loading: {e}")
    raise
else:
    print(f"Successfully loaded data from {file_path}")
    print(df.head())
    # DataFrame.info() writes its summary to stdout and returns None, so it is called
    # directly; wrapping it in print() would emit a stray "None" line after the summary.
    df.info()

## 2. Data Exploration and Feature Engineering

In this section we will:
- Check for missing values
- Explore the dataset structure and columns
- Create a binary target variable from `Error_Rate (%)` (high error rate = 1, low = 0)
- Identify numerical and categorical features
- One-hot encode categorical variables (Data_Source, Job_Priority, Scheduler_Type, Resource_Allocation_Type)
- Prepare features (X) and target (y) for model training

In [None]:
# --- Data quality overview ---
print("Missing values:")
print(df.isnull().sum())
print("\nDataset shape:", df.shape)
print("\nColumn names:", df.columns.tolist())

# --- Binary target ---
# Rows whose error rate is at or above the 75th percentile are labelled 1, all others 0.
error_threshold = df['Error_Rate (%)'].quantile(0.75)
df['high_error'] = df['Error_Rate (%)'].ge(error_threshold).astype(int)
target = 'high_error'

target_counts = df['high_error'].value_counts()
print(f"\nError rate threshold (75th percentile): {error_threshold:.2f}%")
print(f"Target distribution:\n{target_counts}")

# --- Feature selection ---
# Left out of the feature set: the job identifier, both timestamps, the raw error rate
# (the target is derived from it) and the target column itself.
exclude_cols = ['Job_ID', 'Task_Start_Time', 'Task_End_Time', 'Error_Rate (%)', 'high_error']
features = df.columns[~df.columns.isin(exclude_cols)].tolist()

print(f"\nFeatures to use: {features}")
print(f"Target variable: {target}")

# --- Numerical vs. categorical split ---
categorical_features = ['Data_Source', 'Job_Priority', 'Scheduler_Type', 'Resource_Allocation_Type']
categorical_lookup = set(categorical_features)
numerical_features = [name for name in features if name not in categorical_lookup]

print(f"\nNumerical features: {numerical_features}")
print(f"Categorical features: {categorical_features}")

# --- One-hot encoding ---
# drop_first=True drops one indicator column per categorical variable.
df_encoded = pd.get_dummies(df, columns=categorical_features, drop_first=True)

# Rebuild the feature list from the encoded frame, again skipping excluded columns and the target.
encoded_columns = df_encoded.columns
features_encoded = encoded_columns[~encoded_columns.isin(exclude_cols + [target])].tolist()

X = df_encoded[features_encoded]
y = df_encoded[target]

print(f"\nFinal feature count: {len(features_encoded)}")
print(f"Final dataset shape: X={X.shape}, y={y.shape}")

## 3. Data Preprocessing and Standardization

In [None]:
# Hold out 20% of the rows for testing; stratifying on y keeps the class ratio
# the same in both partitions, and the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Learn the scaling statistics from the training partition only, then apply the
# same transformation to both partitions so no test-set information is used in fitting.
# The scaler is applied to every column, one-hot indicator columns included.
# NOTE(review): tree ensembles are typically insensitive to feature scaling, so this
# step may be optional for the RandomForest trained later -- confirm intent.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("\nData standardized using StandardScaler.")
print(f"Shape of X_train_scaled: {X_train_scaled.shape}")
print(f"Shape of X_test_scaled: {X_test_scaled.shape}")

train_counts = y_train.value_counts()
test_counts = y_test.value_counts()
print(f"Target distribution in training set:\n{train_counts}")
print(f"\nTarget distribution in test set:\n{test_counts}")

## 4. Model Implementation (RandomForestClassifier)

In [None]:
print("\nInitializing RandomForestClassifier...")

# Baseline forest: 100 trees, fixed seed for reproducibility, and class weights
# balanced to offset the skew of the binary target.
# Hyperparameters such as n_estimators or max_depth can be tuned here.
model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    class_weight='balanced',
).fit(X_train_scaled, y_train)  # fit() returns the fitted estimator itself

print("RandomForestClassifier trained successfully.")

## 5. Model Evaluation

In [None]:
print("\nEvaluating the model...")

# Score the held-out partition once, then derive both summaries from the same predictions.
y_pred = model.predict(X_test_scaled)
report_text = classification_report(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)

print("\nClassification Report:")
print(report_text)

print("\nConfusion Matrix:")
print(cm)

In [None]:
# Render the confusion matrix as an annotated heatmap (rows: true class, columns: predicted class).
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    cbar=False,
    xticklabels=['Predicted Normal', 'Predicted High Error'],
    yticklabels=['Actual Normal', 'Actual High Error'],
    ax=ax,
)
ax.set_title('Confusion Matrix - Server Error Prediction')
ax.set_xlabel('Predicted Label')
ax.set_ylabel('True Label')
plt.show()

In [None]:
# Pair each feature name with the importance reported by the fitted forest,
# ordered from most to least important.
feature_importance = (
    pd.DataFrame({'feature': X.columns, 'importance': model.feature_importances_})
    .sort_values('importance', ascending=False)
)

print("\nTop 10 Most Important Features:")
print(feature_importance.head(10))

# Horizontal bar chart of the 15 highest-ranked features.
top_features = feature_importance.head(15)
bar_positions = range(len(top_features))

fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(bar_positions, top_features['importance'])
ax.set_yticks(bar_positions)
ax.set_yticklabels(top_features['feature'])
ax.set_xlabel('Feature Importance')
ax.set_title('Top 15 Most Important Features for Server Error Prediction')
ax.invert_yaxis()  # put the most important feature at the top of the chart
fig.tight_layout()
plt.show()

## Further Steps

- Hyperparameter tuning for RandomForestClassifier (e.g., using GridSearchCV or RandomizedSearchCV)
- Deeper feature importance analysis (e.g., permutation importance) to complement the impurity-based `model.feature_importances_` ranking shown above
- Cross-validation
- Experiment with other tree-based models (e.g., GradientBoostingClassifier, XGBoost, LightGBM)
- Deploy the trained model (e.g., save using joblib or pickle)