In [13]:
import pandas as pd
df = pd.read_csv("data/u-churn-train.csv")
df.drop(columns=['customerID'], inplace=True)
df.drop(columns=['PhoneService'], inplace=True)
df.drop(columns=['InternetService'], inplace=True)

In [14]:
# 检查数据发现有10行包含空数据, 都在TotalCharges列
df[df.apply(lambda x: ((x == "") | (x == " ")).any(), axis=1)]

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,MultipleLines,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
4,Male,0,No,Yes,0,Yes,Yes,Yes,No,Yes,No,No,Two year,Yes,Bank transfer (automatic),61.9,,No
282,Female,0,Yes,Yes,0,No phone service,Yes,Yes,Yes,Yes,Yes,No,Two year,No,Credit card (automatic),56.05,,No
2419,Male,0,Yes,Yes,0,Yes,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,No,Mailed check,25.35,,No
2734,Male,0,Yes,Yes,0,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,One year,Yes,Mailed check,19.7,,No
2903,Male,0,Yes,Yes,0,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,No,Mailed check,19.85,,No
3974,Female,0,Yes,Yes,0,No,Yes,Yes,Yes,No,Yes,Yes,Two year,No,Mailed check,80.85,,No
5023,Male,0,No,Yes,0,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,No,Mailed check,20.25,,No
5030,Female,0,Yes,Yes,0,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,No,Mailed check,20.0,,No
5343,Female,0,Yes,Yes,0,No phone service,Yes,No,Yes,Yes,Yes,No,Two year,Yes,Bank transfer (automatic),52.55,,No
5599,Male,0,Yes,Yes,0,Yes,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,No,Mailed check,25.75,,No


In [15]:
# 总消费需要先转换为数值类型, 不然留空被识别为string''是不会被检测出空的
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
missing_values = df.isnull().sum()
missing_values = missing_values[missing_values > 0].sort_values(ascending=False)
missing_values

TotalCharges    10
dtype: int64

In [16]:
# 填写为中位数
df['TotalCharges'] = df['TotalCharges'].fillna(df['TotalCharges'].median())
df['Churn'] = df['Churn'].map({'Yes': 1.0, 'No': 0.0})


In [17]:
features_all = list(df.columns)
features_cate = df.select_dtypes(include=['object']).columns.tolist()
features_cate.append('Churn') # 手动操作一下Churn

features_numr = [feature for feature in features_all if feature not in features_cate]
features_cate, features_numr

(['gender',
  'Partner',
  'Dependents',
  'MultipleLines',
  'OnlineSecurity',
  'OnlineBackup',
  'DeviceProtection',
  'TechSupport',
  'StreamingTV',
  'StreamingMovies',
  'Contract',
  'PaperlessBilling',
  'PaymentMethod',
  'Churn'],
 ['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges'])

In [18]:
from sklearn.preprocessing import StandardScaler
# 初始化 StandardScaler
# scaler = StandardScaler()
# for feature in features_numr:
#     df[feature] = scaler.fit_transform(df[[feature]])

scalers = {}  # 用字典存储每个特征的 scaler, 为了以后的反标准化
for feature in features_numr:
    scalers[feature] = StandardScaler()  # 为每个特征创建 scaler
    df[feature] = scalers[feature].fit_transform(df[[feature]])


In [19]:
df = pd.get_dummies(df, columns=features_cate, drop_first=True)  # drop_first=True 避免多重共线性
# 将 bool 类型转换为 float32
df = df.astype(float)
len(list(df.columns))

28

In [20]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

In [21]:
# 读取数据
X = df.iloc[:, :-1].values  # 取前30列作为特征
y = df.iloc[:, -1].values   # 取最后1列作为标签

# 转换为 PyTorch Tensor
X = torch.tensor(X, dtype=torch.float32)
y = torch.tensor(y, dtype=torch.float32).view(-1, 1)  # 转为列向量

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# 创建 PyTorch Dataset
class MyDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

# 创建 DataLoader
train_dataset = MyDataset(X_train, y_train)
test_dataset = MyDataset(X_test, y_test)

train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=128, shuffle=False)

In [23]:
class BinaryClassifier(nn.Module):
    def __init__(self, input_size):
        super(BinaryClassifier, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(input_size, 8),
            nn.ReLU(),
            nn.Linear(8, 1),
            nn.Sigmoid()  # Sigmoid 输出概率
        )

    def forward(self, x):
        return self.model(x)

# 初始化模型
model = BinaryClassifier(input_size=27)

In [24]:
# 定义损失函数和优化器
criterion = nn.BCELoss()  # 二分类交叉熵损失
# criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# 训练循环
num_epochs = 50
for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {total_loss/len(train_loader):.4f}") # 训练进展不错

Epoch [1/50], Loss: 0.7668
Epoch [2/50], Loss: 0.6843
Epoch [3/50], Loss: 0.6127
Epoch [4/50], Loss: 0.5521
Epoch [5/50], Loss: 0.5118
Epoch [6/50], Loss: 0.4855
Epoch [7/50], Loss: 0.4663
Epoch [8/50], Loss: 0.4534
Epoch [9/50], Loss: 0.4469
Epoch [10/50], Loss: 0.4430
Epoch [11/50], Loss: 0.4403
Epoch [12/50], Loss: 0.4396
Epoch [13/50], Loss: 0.4374
Epoch [14/50], Loss: 0.4360
Epoch [15/50], Loss: 0.4343
Epoch [16/50], Loss: 0.4335
Epoch [17/50], Loss: 0.4329
Epoch [18/50], Loss: 0.4318
Epoch [19/50], Loss: 0.4306
Epoch [20/50], Loss: 0.4293
Epoch [21/50], Loss: 0.4295
Epoch [22/50], Loss: 0.4291
Epoch [23/50], Loss: 0.4279
Epoch [24/50], Loss: 0.4274
Epoch [25/50], Loss: 0.4266
Epoch [26/50], Loss: 0.4252
Epoch [27/50], Loss: 0.4261
Epoch [28/50], Loss: 0.4246
Epoch [29/50], Loss: 0.4243
Epoch [30/50], Loss: 0.4237
Epoch [31/50], Loss: 0.4244
Epoch [32/50], Loss: 0.4223
Epoch [33/50], Loss: 0.4233
Epoch [34/50], Loss: 0.4221
Epoch [35/50], Loss: 0.4213
Epoch [36/50], Loss: 0.4209
E

In [25]:
model.eval()
correct = 0
total = 0

with torch.no_grad():
    for batch_X, batch_y in test_loader:
        outputs = model(batch_X)
        predicted = (outputs >= 0.5).float()  # 0.5 作为分类阈值
        correct += (predicted == batch_y).sum().item()
        total += batch_y.size(0)

accuracy = correct / total
print(f"Test Accuracy: {accuracy:.4f}") 

Test Accuracy: 0.8121


In [26]:
# 读取测试数据并作答
df = pd.read_csv("data/u-churn-test.csv")
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
df.dtypes

customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges        float64
dtype: object

In [27]:
missing_values = df.isnull().sum()
missing_values = missing_values[missing_values > 0].sort_values(ascending=False)
missing_values

TotalCharges    1
dtype: int64

In [28]:
# 填写为中位数
df['TotalCharges'] = df['TotalCharges'].fillna(df['TotalCharges'].median())
# 分离ID
df_first_col = df.iloc[:, [0]]  # 第一列，保持 DataFrame 格式
df_remaining = df.iloc[:, 1:]   # 剩余列

In [29]:

# 继续用之前的scalar, 保持一致性
for feature in features_numr:
    if feature == 'Churn':
        continue # Churn是标签, 测试数据没有
    df_remaining[feature] = scalers[feature].fit_transform(df_remaining[[feature]])

In [30]:
df_remaining = pd.get_dummies(df_remaining, columns=features_cate[:-1], drop_first=True)  # drop_first=True 避免多重共线性
# 将 bool 类型转换为 float32
df_remaining = df_remaining.astype(float)
len(list(df_remaining.columns))

ValueError: could not convert string to float: 'No'

In [None]:
X = df_remaining.iloc[:].values  
X = torch.tensor(X, dtype=torch.float32)
y = model(X)
y_original = y

In [None]:
# y_original = scalers['Churn'].inverse_transform(y.detach().numpy())

In [None]:
len(y_original)

In [None]:
y_original = y_original.flatten()  # NumPy 版本的展平
churn_values = ['Yes' if val >= 0.5 else 'No' for val in y_original.tolist()]
# 创建新的 DataFrame
df_final = pd.DataFrame({
    'customerID': df_first_col.iloc[:, 0],  # 确保使用第一列的值
    'Churn': churn_values
})
df_final.to_csv('submission.csv', index=False)