# Feature Engineering Pipeline

This notebook implements the feature engineering pipeline for the telecom data analysis project. We'll create features from:
1. Network Performance Data
2. Customer Experience Data
3. Call Detail Records

These features will be used for training machine learning models to predict network issues and customer churn.

## 1. Setup and Data Loading

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta

# Set display options
# None removes the column limit so wide feature frames are shown in full
pd.set_option('display.max_columns', None)
# Show up to 100 rows before pandas truncates the displayed frame
pd.set_option('display.max_rows', 100)

# Set plot style
# Use matplotlib's 'default' style sheet and seaborn's 'husl' color palette
plt.style.use('default')
sns.set_palette('husl')

In [2]:
# Read each sample CSV and convert its "timestamp" column to datetime in a
# single chained expression, so every frame is fully typed once it is bound.
network_df = pd.read_csv(
    "../../data/raw/sample_data/network_performance_sample.csv"
).assign(timestamp=lambda frame: pd.to_datetime(frame["timestamp"]))

customer_df = pd.read_csv(
    "../../data/raw/sample_data/customer_experience_sample.csv"
).assign(timestamp=lambda frame: pd.to_datetime(frame["timestamp"]))

cdr_df = pd.read_csv(
    "../../data/raw/sample_data/call_detail_records_sample.csv"
).assign(timestamp=lambda frame: pd.to_datetime(frame["timestamp"]))

## 2. Feature Engineering

### 2.1 Time-based Features

In [3]:
# Extract time-based features from timestamps
def add_time_features(df, timestamp_col='timestamp'):
    """Return a copy of ``df`` with calendar features derived from a datetime column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``timestamp_col`` with a datetime dtype (the ``.dt``
        accessor is used on it).
    timestamp_col : str, default 'timestamp'
        Name of the datetime column to derive features from.

    Returns
    -------
    pandas.DataFrame
        New frame with these columns added (or overwritten):
        ``hour``, ``day_of_week`` (Monday=0), ``is_weekend`` (1 for
        day_of_week 5 or 6) and ``is_peak_hour`` (1 when the hour is between
        9 and 17, both inclusive). The input frame is not modified.
    """
    stamps = df[timestamp_col]
    hour_of_day = stamps.dt.hour
    weekday = stamps.dt.dayofweek

    # assign() works on a copy, so the caller's frame is left untouched
    return df.assign(
        hour=hour_of_day,
        day_of_week=weekday,
        is_weekend=weekday.isin([5, 6]).astype(int),
        is_peak_hour=hour_of_day.between(9, 17).astype(int),
    )

# Apply time features to all datasets
# add_time_features works on a copy, so each name is rebound to the enriched frame
network_df = add_time_features(network_df)
customer_df = add_time_features(customer_df)
cdr_df = add_time_features(cdr_df)

### 2.2 Network Performance Features

In [4]:
# Calculate rolling statistics for network performance metrics
def add_network_features(df, window=3, min_periods=1):
    """Return a copy of ``df`` with per-cell rolling means and degradation flags.

    Rows are grouped by ``cell_id`` and a trailing rolling mean is computed
    for each metric within its group, following the frame's existing row
    order.

    NOTE(review): the rolling window is only a *time* window if the rows of
    each cell are already in chronological order -- confirm the input is
    sorted by timestamp before relying on these features.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``cell_id``, ``latency_ms``, ``packet_loss`` and
        ``throughput_mbps``.
    window : int, default 3
        Number of consecutive rows (per cell) in each rolling window.
    min_periods : int, default 1
        Minimum number of observations required in a window to produce a
        value; windows with fewer observations yield NaN.

    Returns
    -------
    pandas.DataFrame
        New frame with ``rolling_latency``, ``rolling_packet_loss`` and
        ``rolling_throughput`` plus the boolean flags ``latency_increase``,
        ``packet_loss_increase`` and ``throughput_decrease``. The input frame
        is not modified.
    """
    df = df.copy()

    def _rolling_mean(values):
        # Trailing mean over the last `window` rows of one cell's series
        return values.rolling(window=window, min_periods=min_periods).mean()

    grouped = df.groupby('cell_id')

    # Output column -> source metric; one shared helper replaces three
    # copy-pasted lambdas.
    rolling_sources = {
        'rolling_latency': 'latency_ms',
        'rolling_packet_loss': 'packet_loss',
        'rolling_throughput': 'throughput_mbps',
    }
    for feature_name, source_col in rolling_sources.items():
        df[feature_name] = grouped[source_col].transform(_rolling_mean)

    # Degradation flags: the current reading is worse than its rolling mean.
    # The window includes the current row, and comparisons against NaN
    # evaluate to False.
    df['latency_increase'] = df['latency_ms'] > df['rolling_latency']
    df['packet_loss_increase'] = df['packet_loss'] > df['rolling_packet_loss']
    df['throughput_decrease'] = df['throughput_mbps'] < df['rolling_throughput']

    return df

# add_network_features works on a copy; rebind the name to the enriched frame
network_df = add_network_features(network_df)

### 2.3 Customer Experience Features

In [5]:
# Calculate customer behavior patterns
def add_customer_features(df):
    """Return a copy of ``df`` with per-customer usage averages and a satisfaction delta.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``customer_id``, ``data_usage_mb``, ``voice_minutes``,
        ``sms_count`` and ``customer_satisfaction_score``.

    Returns
    -------
    pandas.DataFrame
        New frame with ``avg_data_usage``, ``avg_voice_minutes`` and
        ``avg_sms_count`` (the customer's mean, repeated on each of their
        rows) and ``satisfaction_trend`` (change in satisfaction score from
        the customer's previous row, 0 where there is no previous value).
        The input frame is not modified.
    """
    enriched = df.copy()
    per_customer = enriched.groupby('customer_id')

    # Output column -> source usage metric averaged per customer
    usage_sources = {
        'avg_data_usage': 'data_usage_mb',
        'avg_voice_minutes': 'voice_minutes',
        'avg_sms_count': 'sms_count',
    }
    for feature_name, source_col in usage_sources.items():
        enriched[feature_name] = per_customer[source_col].transform('mean')

    def _delta_from_previous(scores):
        # Row-over-row difference within one customer; the first row has no
        # predecessor, so its NaN is replaced with 0.
        return scores.diff().fillna(0)

    enriched['satisfaction_trend'] = (
        per_customer['customer_satisfaction_score'].transform(_delta_from_previous)
    )

    return enriched

# add_customer_features works on a copy; rebind the name to the enriched frame
customer_df = add_customer_features(customer_df)

### 2.4 Call Detail Record Features

In [6]:
# Calculate call quality and usage patterns
def add_cdr_features(df):
    """Return a copy of ``df`` with per-caller call statistics and usage flags.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``caller_id``, ``call_duration_seconds`` and
        ``data_used_mb``.

    Returns
    -------
    pandas.DataFrame
        New frame with:

        * ``avg_call_duration`` -- the caller's mean call duration
        * ``total_data_used`` -- the caller's summed data usage
        * ``is_long_call`` -- True when this call is longer than the
          caller's mean duration
        * ``is_high_data_usage`` -- True when this call used more data than
          the caller's mean per-call data usage

        The input frame is not modified.
    """
    df = df.copy()

    # Group by caller_id and calculate statistics
    grouped = df.groupby('caller_id')

    # Calculate call patterns
    df['avg_call_duration'] = grouped['call_duration_seconds'].transform('mean')
    df['total_data_used'] = grouped['data_used_mb'].transform('sum')

    # Calculate call quality indicators
    df['is_long_call'] = df['call_duration_seconds'] > df['avg_call_duration']

    # Compare each call with the caller's *mean* usage, mirroring is_long_call.
    # Comparing against total_data_used (a sum that already includes this
    # call) could never be True for non-negative usage, so the flag was
    # constant. The mean is kept local so the output column set is unchanged.
    caller_avg_data = grouped['data_used_mb'].transform('mean')
    df['is_high_data_usage'] = df['data_used_mb'] > caller_avg_data

    return df

# add_cdr_features works on a copy; rebind the name to the enriched frame
cdr_df = add_cdr_features(cdr_df)

### 2.5 Cross-dataset Features

In [7]:
# Combine features from different datasets
def create_cross_dataset_features(network_df, customer_df, cdr_df):
    """Join the three feature frames on time and derive combined quality scores.

    Each customer record is matched to the network record with the nearest
    ``timestamp``, and the result is then matched to the CDR record with the
    nearest ``timestamp`` (``pd.merge_asof`` with ``direction='nearest'``).
    ``timestamp`` is the only join key. Columns that exist on both sides of a
    merge receive pandas' default ``_x`` / ``_y`` suffixes.

    NOTE(review): because the join is purely time-based, a customer row can be
    paired with a network cell or call that is unrelated to that customer --
    confirm this is intended.

    Parameters
    ----------
    network_df : pandas.DataFrame
        Must contain ``timestamp``, ``latency_ms``, ``packet_loss`` and
        ``throughput_mbps``.
    customer_df : pandas.DataFrame
        Must contain ``timestamp`` and ``customer_satisfaction_score``.
    cdr_df : pandas.DataFrame
        Must contain ``timestamp``.

    Returns
    -------
    pandas.DataFrame
        One row per customer record, with the matched network and CDR columns
        plus:

        * ``network_quality_impact`` = latency_ms * packet_loss / throughput_mbps
          (NaN where throughput is 0 or missing)
        * ``customer_experience_score`` =
          customer_satisfaction_score * (1 - network_quality_impact)
    """
    # merge_asof requires both frames to be sorted on the join key
    customers_by_time = customer_df.sort_values('timestamp')
    network_by_time = network_df.sort_values('timestamp')

    # Merge network and customer data based on timestamp
    merged_df = pd.merge_asof(
        customers_by_time,
        network_by_time,
        on='timestamp',
        direction='nearest'
    )

    # Add CDR data
    merged_df = pd.merge_asof(
        merged_df.sort_values('timestamp'),
        cdr_df.sort_values('timestamp'),
        on='timestamp',
        direction='nearest'
    )

    # Mask zero throughput to NaN so the ratio below is NaN rather than +/-inf;
    # an infinite impact would otherwise flow into customer_experience_score.
    throughput = merged_df['throughput_mbps']
    usable_throughput = throughput.where(throughput != 0)

    # Calculate cross-dataset features
    merged_df['network_quality_impact'] = (
        merged_df['latency_ms'] * merged_df['packet_loss'] / usable_throughput
    )

    merged_df['customer_experience_score'] = (
        merged_df['customer_satisfaction_score'] * (1 - merged_df['network_quality_impact'])
    )

    return merged_df

# Create the final feature set
# The result has one row per customer record, each paired with the network and
# CDR rows nearest in time.
final_features = create_cross_dataset_features(network_df, customer_df, cdr_df)

# Display the final feature set
# Columns with _x/_y suffixes are names that existed on both sides of a merge
print("Final feature set shape:", final_features.shape)
print("\nFeature columns:")
print(final_features.columns.tolist())

Final feature set shape: (6, 50)

Feature columns:
['customer_id', 'timestamp', 'service_type', 'data_usage_mb', 'voice_minutes', 'sms_count', 'network_quality_score', 'customer_satisfaction_score', 'hour_x', 'day_of_week_x', 'is_weekend_x', 'is_peak_hour_x', 'avg_data_usage', 'avg_voice_minutes', 'avg_sms_count', 'satisfaction_trend', 'cell_id_x', 'latency_ms', 'packet_loss', 'throughput_mbps', 'signal_strength', 'connection_type', 'hour_y', 'day_of_week_y', 'is_weekend_y', 'is_peak_hour_y', 'rolling_latency', 'rolling_packet_loss', 'rolling_throughput', 'latency_increase', 'packet_loss_increase', 'throughput_decrease', 'call_id', 'caller_id', 'callee_id', 'call_duration_seconds', 'call_type', 'cell_id_y', 'roaming_status', 'data_used_mb', 'hour', 'day_of_week', 'is_weekend', 'is_peak_hour', 'avg_call_duration', 'total_data_used', 'is_long_call', 'is_high_data_usage', 'network_quality_impact', 'customer_experience_score']
