# Model Training Pipeline

This notebook implements the model training pipeline for two main tasks:
1. Predicting network issues using network performance data
2. Predicting customer churn using customer experience data

We'll use various baseline models and evaluate their performance.

## 1. Setup and Data Loading

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier

# Set display options
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

# Set plot style
plt.style.use('default')

In [2]:
# Load the datasets
network_df = pd.read_csv("../../data/raw/sample_data/network_performance_sample.csv")
customer_df = pd.read_csv("../../data/raw/sample_data/customer_experience_sample.csv")
cdr_df = pd.read_csv("../../data/raw/sample_data/call_detail_records_sample.csv")

# Convert timestamps to datetime
network_df["timestamp"] = pd.to_datetime(network_df["timestamp"])
customer_df["timestamp"] = pd.to_datetime(customer_df["timestamp"])
cdr_df["timestamp"] = pd.to_datetime(cdr_df["timestamp"])

## 2. Network Issue Prediction

In [3]:
def identify_network_issues(df):
    df = df.copy()
    # Define thresholds for network issues using more balanced values
    df['is_issue'] = (
        (df['latency_ms'] > 45) |  # High latency
        (df['packet_loss'] > 0.015) |  # High packet loss
        (df['throughput_mbps'] < 55)  # Low throughput
    ).astype(int)
    return df

# Prepare network data for modeling
network_issues_df = identify_network_issues(network_df)

# Check the distribution of network issues
print("Network Issues Distribution:")
print(network_issues_df['is_issue'].value_counts(normalize=True))

Network Issues Distribution:
is_issue
1    0.833333
0    0.166667
Name: proportion, dtype: float64


In [4]:
# Select features for network issue prediction
network_features = ['latency_ms', 'packet_loss', 'throughput_mbps', 'signal_strength']
X_network = network_issues_df[network_features]
y_network = network_issues_df['is_issue']

# Split the data
X_train_net, X_test_net, y_train_net, y_test_net = train_test_split(
    X_network, y_network, test_size=0.2, random_state=42
)

# Scale the features
scaler = StandardScaler()
X_train_net_scaled = scaler.fit_transform(X_train_net)
X_test_net_scaled = scaler.transform(X_test_net)

In [5]:
def train_and_evaluate_models(X_train, X_test, y_train, y_test):
    # Use only the more robust models
    models = {
        'Logistic Regression': LogisticRegression(class_weight='balanced'),
        'Random Forest': RandomForestClassifier(class_weight='balanced'),
        'SVM': SVC(probability=True, class_weight='balanced')
    }
    
    results = {}
    for name, model in models.items():
        try:
            # Train the model
            model.fit(X_train, y_train)
            
            # Make predictions
            y_pred = model.predict(X_test)
            y_pred_proba = model.predict_proba(X_test)[:, 1]
            
            # Calculate metrics with zero_division=0
            results[name] = {
                'Accuracy': accuracy_score(y_test, y_pred),
                'Precision': precision_score(y_test, y_pred, zero_division=0),
                'Recall': recall_score(y_test, y_pred, zero_division=0),
                'F1 Score': f1_score(y_test, y_pred, zero_division=0),
                'ROC AUC': roc_auc_score(y_test, y_pred_proba) if len(np.unique(y_test)) > 1 else 0.5
            }
        except Exception as e:
            print(f"Error with {name}: {str(e)}")
            results[name] = {
                'Accuracy': 0,
                'Precision': 0,
                'Recall': 0,
                'F1 Score': 0,
                'ROC AUC': 0.5
            }
    
    return pd.DataFrame(results).T

In [6]:
# Train and evaluate network issue models
network_results = train_and_evaluate_models(
    X_train_net_scaled, X_test_net_scaled, y_train_net, y_test_net
)
print("\nNetwork Issue Prediction Results:")
print(network_results)


Network Issue Prediction Results:
                     Accuracy  Precision  Recall  F1 Score  ROC AUC
Logistic Regression       0.5        1.0     0.5  0.666667      0.5
Random Forest             0.5        1.0     0.5  0.666667      0.5
SVM                       0.5        1.0     0.5  0.666667      0.5


## 3. Customer Churn Prediction

In [7]:
def train_and_evaluate_models(X_train, X_test, y_train, y_test):
    # Use only the more robust models
    models = {
        'Logistic Regression': LogisticRegression(class_weight='balanced'),
        'Random Forest': RandomForestClassifier(class_weight='balanced'),
        'SVM': SVC(probability=True, class_weight='balanced')
    }
    
    results = {}
    for name, model in models.items():
        try:
            # Train the model
            model.fit(X_train, y_train)
            
            # Make predictions
            y_pred = model.predict(X_test)
            y_pred_proba = model.predict_proba(X_test)[:, 1]
            
            # Calculate metrics with zero_division=0
            results[name] = {
                'Accuracy': accuracy_score(y_test, y_pred),
                'Precision': precision_score(y_test, y_pred, zero_division=0),
                'Recall': recall_score(y_test, y_pred, zero_division=0),
                'F1 Score': f1_score(y_test, y_pred, zero_division=0),
                'ROC AUC': roc_auc_score(y_test, y_pred_proba) if len(np.unique(y_test)) > 1 else 0.5
            }
        except Exception as e:
            print(f"Error with {name}: {str(e)}")
            results[name] = {
                'Accuracy': 0,
                'Precision': 0,
                'Recall': 0,
                'F1 Score': 0,
                'ROC AUC': 0.5
            }
    
    return pd.DataFrame(results).T

In [8]:
# Train and evaluate network issue prediction models
network_results = train_and_evaluate_models(
    X_train_net_scaled, X_test_net_scaled, y_train_net, y_test_net
)
print("\nNetwork Issue Prediction Results:")
print(network_results)


Network Issue Prediction Results:
                     Accuracy  Precision  Recall  F1 Score  ROC AUC
Logistic Regression       0.5        1.0     0.5  0.666667      0.5
Random Forest             0.5        1.0     0.5  0.666667      0.5
SVM                       0.5        1.0     0.5  0.666667      0.5


## 4. Model Comparison and Visualization

In [9]:
def plot_model_comparison(network_results, churn_results=None):
    # Create figure with subplots
    fig, axes = plt.subplots(1, 2 if churn_results is not None else 1, figsize=(15, 5))
    if churn_results is None:
        axes = [axes]  # Make axes iterable even with single subplot
    
    # Plot network results
    network_results.plot(kind='bar', ax=axes[0])
    axes[0].set_title('Network Issue Prediction')
    axes[0].set_ylabel('Score')
    axes[0].tick_params(axis='x', rotation=45)
    
    # Plot churn results if available
    if churn_results is not None:
        churn_results.plot(kind='bar', ax=axes[1])
        axes[1].set_title('Customer Churn Prediction')
        axes[1].set_ylabel('Score')
        axes[1].tick_params(axis='x', rotation=45)
    
    plt.tight_layout()
    plt.show()

NameError: name 'churn_results' is not defined

In [None]:
# Plot the results
plot_model_comparison(network_results)