In [85]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [86]:
data = pd.read_csv("/content/telecom_data.csv")
data.head()

Unnamed: 0,call_duration,data_usage,num_calls,sms_count,customer_id
0,39,0.14,13,1,1
1,52,2.51,9,24,2
2,29,4.93,21,15,3
3,15,1.46,9,45,4
4,43,2.04,8,15,5


In [87]:
customer_ids = data['customer_id'].values

In [88]:
features = data[['call_duration', 'data_usage', 'num_calls', 'sms_count']].values

In [89]:
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)

In [90]:
features_scaled

array([[ 0.51344781, -1.71046127, -0.23855445, -1.65280764],
       [ 1.27721853, -0.03088576, -0.71810906, -0.04517449],
       [-0.07406813,  1.68412382,  0.72055479, -0.67424833],
       ...,
       [ 1.45347331, -0.13718801, -0.11866579,  1.56245866],
       [ 0.51344781, -1.5970722 ,  0.36088883,  0.30431098],
       [ 1.45347331,  1.31560937, -1.43744099,  0.23441388]])

In [91]:
train_data, test_data, train_ids, test_ids = train_test_split(
    features_scaled, customer_ids, test_size=0.2, random_state=42
)

In [92]:
class Autoencoder(nn.Module):
  def __init__(self, input_dim, output_dim):
    super(Autoencoder, self).__init__()
    self.encoder = nn.Sequential(
        nn.Linear(input_dim, hidden_dim),
        nn.ReLU(),
        nn.Linear(hidden_dim, hidden_dim // 2),
        nn.ReLU()

    )
    self.decoder = nn.Sequential(
        nn.Linear(hidden_dim // 2, hidden_dim),
        nn.ReLU(),
        nn.Linear(hidden_dim, input_dim),
        nn.Sigmoid()
    )
  def forward(self, x):
    encoded = self.encoder(x)
    decoded = self.decoder(encoded)
    return decoded

In [93]:
input_dim = train_data.shape[1]
input_dim

4

In [94]:
hidden_dim = 64

In [95]:
model = Autoencoder(input_dim, hidden_dim)
model

Autoencoder(
  (encoder): Sequential(
    (0): Linear(in_features=4, out_features=64, bias=True)
    (1): ReLU()
    (2): Linear(in_features=64, out_features=32, bias=True)
    (3): ReLU()
  )
  (decoder): Sequential(
    (0): Linear(in_features=32, out_features=64, bias=True)
    (1): ReLU()
    (2): Linear(in_features=64, out_features=4, bias=True)
    (3): Sigmoid()
  )
)

In [96]:
dataset = TensorDataset(torch.tensor(train_data, dtype=torch.float32))
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

In [97]:
dataloader

<torch.utils.data.dataloader.DataLoader at 0x7f1e0e649f10>

In [98]:
loss_fn = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [99]:
model.train()
epochs = 50

In [100]:
for epoch in range(epochs):
  total_loss = 0
  for batch in dataloader:
    inputs = batch[0]
    optimizer.zero_grad()
    outputs = model(inputs)
    loss = loss_fn(outputs, inputs)
    loss.backward()
    optimizer.step()
    total_loss += loss.item()
  print(f'Epoch {epoch+1}/{epochs}, Loss: {total_loss/len(dataloader):.6f}')

Epoch 1/50, Loss: 0.705776
Epoch 2/50, Loss: 0.540134
Epoch 3/50, Loss: 0.539455
Epoch 4/50, Loss: 0.539326
Epoch 5/50, Loss: 0.539266
Epoch 6/50, Loss: 0.539282
Epoch 7/50, Loss: 0.539191
Epoch 8/50, Loss: 0.539214
Epoch 9/50, Loss: 0.539142
Epoch 10/50, Loss: 0.539115
Epoch 11/50, Loss: 0.539115
Epoch 12/50, Loss: 0.539108
Epoch 13/50, Loss: 0.539063
Epoch 14/50, Loss: 0.539037
Epoch 15/50, Loss: 0.538999
Epoch 16/50, Loss: 0.538892
Epoch 17/50, Loss: 0.538882
Epoch 18/50, Loss: 0.538800
Epoch 19/50, Loss: 0.538732
Epoch 20/50, Loss: 0.538656
Epoch 21/50, Loss: 0.538603
Epoch 22/50, Loss: 0.538555
Epoch 23/50, Loss: 0.538490
Epoch 24/50, Loss: 0.538388
Epoch 25/50, Loss: 0.538260
Epoch 26/50, Loss: 0.538222
Epoch 27/50, Loss: 0.538108
Epoch 28/50, Loss: 0.537937
Epoch 29/50, Loss: 0.537845
Epoch 30/50, Loss: 0.537691
Epoch 31/50, Loss: 0.537597
Epoch 32/50, Loss: 0.537483
Epoch 33/50, Loss: 0.537418
Epoch 34/50, Loss: 0.537309
Epoch 35/50, Loss: 0.537329
Epoch 36/50, Loss: 0.537265
E

In [101]:
model.eval()
with torch.no_grad():
  data_tensor = torch.tensor(test_data, dtype=torch.float32)
  y_hat = model(data_tensor)
  mse = torch.mean((y_hat - data_tensor )** 2, dim=1)
  mse_np = mse.numpy()

Creates a boolean mask (anomaly_mask) where True indicates records with reconstruction errors exceeding 3 standard deviations above the mean of mse_original_scale.

In [102]:
# Inverse transform to original scale for interpretation
mse_original_scale = mse_np * scaler.var_.mean()

# Flag anomalies based on threshold (3 standard deviations)
threshold = 3
anomaly_mask = mse_original_scale > threshold * np.std(mse_original_scale)
anomalies = np.where(anomaly_mask)[0]

# Output results with customer_id
print(f"Detected {len(anomalies)} anomalies in the test data.")
for idx in anomalies:
    print(f"Anomaly at index {idx}, Customer ID: {test_ids[idx]}, Reconstruction Error: {mse_original_scale[idx]:.4f}")

Detected 144 anomalies in the test data.
Anomaly at index 22, Customer ID: 4641, Reconstruction Error: 245.1508
Anomaly at index 38, Customer ID: 3974, Reconstruction Error: 180.8087
Anomaly at index 47, Customer ID: 3466, Reconstruction Error: 290.4696
Anomaly at index 50, Customer ID: 3724, Reconstruction Error: 195.7891
Anomaly at index 58, Customer ID: 2546, Reconstruction Error: 215.2083
Anomaly at index 70, Customer ID: 5324, Reconstruction Error: 265.9928
Anomaly at index 71, Customer ID: 800, Reconstruction Error: 200.6011
Anomaly at index 86, Customer ID: 3306, Reconstruction Error: 217.4302
Anomaly at index 93, Customer ID: 1057, Reconstruction Error: 220.7726
Anomaly at index 109, Customer ID: 2341, Reconstruction Error: 194.8581
Anomaly at index 117, Customer ID: 766, Reconstruction Error: 267.0207
Anomaly at index 154, Customer ID: 5270, Reconstruction Error: 167.8461
Anomaly at index 155, Customer ID: 6898, Reconstruction Error: 204.2284
Anomaly at index 199, Customer ID:

##Possible Reasons for Anomalies:

####**Fraudulent Activity:** High sms_count or num_calls with low data_usage could indicate spam or automated systems.

####**Data Errors:** Incorrectly recorded values (e.g., extremely low data_usage or call_duration).

####**Unique Usage Patterns:** Customers with niche behaviors, like heavy SMS usage but minimal data or calls, which don’t align with typical user profiles.

####**System Issues:** Anomalies could reflect network errors, billing issues, or test accounts.