# Model Training Pipeline

This notebook implements the model training pipeline for two main tasks:
1. Predicting network issues using network performance data
2. Predicting customer churn using customer experience data

We'll use various baseline models and evaluate their performance.

## 1. Setup and Data Loading

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier

# Set display options
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

# Set plot style
plt.style.use('default')

In [2]:
# Load the datasets
network_df = pd.read_csv("../../data/raw/sample_data/network_performance_sample.csv")
customer_df = pd.read_csv("../../data/raw/sample_data/customer_experience_sample.csv")
cdr_df = pd.read_csv("../../data/raw/sample_data/call_detail_records_sample.csv")

# Convert timestamps to datetime
network_df["timestamp"] = pd.to_datetime(network_df["timestamp"])
customer_df["timestamp"] = pd.to_datetime(customer_df["timestamp"])
cdr_df["timestamp"] = pd.to_datetime(cdr_df["timestamp"])

## 2. Network Issue Prediction

In [3]:
# Define network issues based on performance metrics
def identify_network_issues(df):
    df = df.copy()
    # Define thresholds for network issues using more lenient values
    df['is_issue'] = (
        (df['latency_ms'] > 50) |  # High latency (reduced from 100)
        (df['packet_loss'] > 0.02) |  # High packet loss (reduced from 0.05)
        (df['throughput_mbps'] < 50)  # Low throughput (increased from 5)
    ).astype(int)
    return df

# Prepare network data for modeling
network_issues_df = identify_network_issues(network_df)

# Select features for network issue prediction
#network_features = ['latency_ms', 'packet_loss', 'throughput_mbps', 'signal_strength_dbm']
network_features = ['latency_ms', 'packet_loss', 'throughput_mbps', 'signal_strength']
X_network = network_issues_df[network_features]
y_network = network_issues_df['is_issue']

# Split the data
X_train_net, X_test_net, y_train_net, y_test_net = train_test_split(
    X_network, y_network, test_size=0.2, random_state=42
)

# Scale the features
scaler = StandardScaler()
X_train_net_scaled = scaler.fit_transform(X_train_net)
X_test_net_scaled = scaler.transform(X_test_net)

In [4]:
# Train and evaluate network issue prediction models
def train_and_evaluate_models(X_train, X_test, y_train, y_test):
    models = {
        'Logistic Regression': LogisticRegression(),
        'Random Forest': RandomForestClassifier(),
        'SVM': SVC(probability=True),
        'XGBoost': XGBClassifier()
    }
    
    results = {}
    for name, model in models.items():
        # Train the model
        model.fit(X_train, y_train)
        
        # Make predictions
        y_pred = model.predict(X_test)
        y_pred_proba = model.predict_proba(X_test)[:, 1]
        
        # Calculate metrics
        results[name] = {
            'Accuracy': accuracy_score(y_test, y_pred),
            'Precision': precision_score(y_test, y_pred),
            'Recall': recall_score(y_test, y_pred),
            'F1 Score': f1_score(y_test, y_pred),
            'ROC AUC': roc_auc_score(y_test, y_pred_proba)
        }
    
    return pd.DataFrame(results).T

# Train and evaluate network issue models
network_results = train_and_evaluate_models(
    X_train_net_scaled, X_test_net_scaled, y_train_net, y_test_net
)
print("\nNetwork Issue Prediction Results:")
print(network_results)

ValueError: This solver needs samples of at least 2 classes in the data, but the data contains only one class: np.int64(0)

## 3. Customer Churn Prediction

In [None]:
# Prepare customer data for modeling
def prepare_customer_data(customer_df, network_df, cdr_df):
    # Merge customer data with network performance
    merged_df = pd.merge_asof(
        customer_df.sort_values('timestamp'),
        network_df.sort_values('timestamp'),
        on='timestamp',
        direction='nearest'
    )
    
    # Add CDR data
    merged_df = pd.merge_asof(
        merged_df.sort_values('timestamp'),
        cdr_df.sort_values('timestamp'),
        on='timestamp',
        direction='nearest'
    )
    
    # Define churn based on customer satisfaction and usage patterns
    merged_df['is_churn'] = (
        (merged_df['customer_satisfaction_score'] < 3) |
        (merged_df['data_usage_mb'] < merged_df['data_usage_mb'].mean() * 0.5)
    ).astype(int)
    
    return merged_df

# Prepare the data
customer_churn_df = prepare_customer_data(customer_df, network_df, cdr_df)

# Select features for churn prediction
churn_features = [
    'customer_satisfaction_score', 'data_usage_mb', 'voice_minutes',
    'sms_count', 'latency_ms', 'packet_loss', 'throughput_mbps'
]
X_churn = customer_churn_df[churn_features]
y_churn = customer_churn_df['is_churn']

# Split the data
X_train_churn, X_test_churn, y_train_churn, y_test_churn = train_test_split(
    X_churn, y_churn, test_size=0.2, random_state=42
)

# Scale the features
X_train_churn_scaled = scaler.fit_transform(X_train_churn)
X_test_churn_scaled = scaler.transform(X_test_churn)

In [None]:
# Train and evaluate churn prediction models
churn_results = train_and_evaluate_models(
    X_train_churn_scaled, X_test_churn_scaled, y_train_churn, y_test_churn
)
print("\nCustomer Churn Prediction Results:")
print(churn_results)

## 4. Model Comparison and Visualization

In [None]:
# Compare model performance
def plot_model_comparison(network_results, churn_results):
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))
    
    # Plot network issue prediction results
    network_results['F1 Score'].plot(kind='bar', ax=ax1)
    ax1.set_title('Network Issue Prediction - F1 Scores')
    ax1.set_ylabel('F1 Score')
    ax1.tick_params(axis='x', rotation=45)
    
    # Plot churn prediction results
    churn_results['F1 Score'].plot(kind='bar', ax=ax2)
    ax2.set_title('Customer Churn Prediction - F1 Scores')
    ax2.set_ylabel('F1 Score')
    ax2.tick_params(axis='x', rotation=45)
    
    plt.tight_layout()
    plt.show()

# Plot the results
plot_model_comparison(network_results, churn_results)