In [None]:
## 逻辑回归实现二分类问题

In [21]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
#from sklearn.ensemble import GradientBoostingClassifier
import lightgbm as lgb
from sklearn.metrics import accuracy_score, classification_report
import warnings
warnings.filterwarnings('ignore')

# 1. Load the raw data
df = pd.read_csv('../mulit_task_data.csv')

# 2. Feature engineering
# Integer-encode each categorical column and keep the fitted encoder per
# column so the same mapping can be reused when encoding new data.
categorical_cols = ['gender', 'region', 'user_hot', 'video_type', 'video_quality']
label_encoders = {col: LabelEncoder() for col in categorical_cols}
for col, encoder in label_encoders.items():
    df[col] = encoder.fit_transform(df[col])

# Feature matrix and binary target
features = [
    'gender', 'age', 'region', 'user_hot', 'video_type',
    'video_length', 'video_quality', 'is_exceed_5s', 'stay_time',
]
X, y = df[features], df['is_watch']

# Hold out 20% of the rows for evaluation (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 3. Logistic regression model
# NOTE(review): the recorded run reports 1.00 on every metric for a 33-row
# test set; features such as is_exceed_5s / stay_time may be near-proxies for
# is_watch — confirm there is no target leakage.
lr_model = LogisticRegression(random_state=42).fit(X_train, y_train)
lr_pred = lr_model.predict(X_test)

print("逻辑回归模型评估结果：")
print(classification_report(y_test, lr_pred))

逻辑回归模型评估结果：
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         8
           1       1.00      1.00      1.00        25

    accuracy                           1.00        33
   macro avg       1.00      1.00      1.00        33
weighted avg       1.00      1.00      1.00        33



In [22]:
# Display the DataFrame after label encoding (categorical columns now hold integer codes)
df

Unnamed: 0,user_id,gender,age,region,user_hot,video_id,video_type,video_length,video_quality,is_exceed_5s,stay_time,is_watch,is_buy
0,1,1,25,1,2,1001,0,120,1,1,120,1,0
1,2,0,30,0,0,1002,1,180,1,1,150,1,1
2,3,1,35,2,1,1003,3,90,0,0,10,0,0
3,4,0,28,4,2,1004,4,150,1,1,160,1,0
4,5,1,22,3,0,1005,2,210,1,1,200,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
160,161,1,33,4,2,1161,4,155,1,1,165,1,0
161,162,0,29,3,0,1162,2,215,1,1,205,1,1
162,163,1,34,1,2,1163,0,135,1,1,145,1,1
163,164,0,39,0,0,1164,1,195,1,1,175,1,0


In [23]:
# Persist the trained artifacts so they can be reloaded later for inference.
import os
import joblib
import pickle

# Neither joblib.dump nor open(..., 'wb') creates missing parent directories,
# so make sure the output folder exists before writing into it.
os.makedirs('./models', exist_ok=True)

# Save the logistic regression model
joblib.dump(lr_model, './models/logistic_regression_model.pkl')

# Save the fitted label encoders (needed to encode new data the same way)
with open('models/label_encoders.pkl', 'wb') as f:
    pickle.dump(label_encoders, f)

In [None]:
# Wrap the train/test splits as LightGBM datasets; the evaluation set is
# created with the training set as its reference.
lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)

# Training parameters
params = dict(
    objective='binary',        # binary classification task
    metric='binary_logloss',   # loss reported for the binary task
    boosting_type='gbdt',
    num_leaves=31,             # number of leaves per tree
    learning_rate=0.05,
    feature_fraction=0.9,      # fraction of features sampled when building trees
)

# Train for up to 100 boosting rounds, evaluating on both datasets and
# registering an early-stopping callback with stopping_rounds=10.
early_stop = lgb.early_stopping(stopping_rounds=10)
lgb_model = lgb.train(
    params,
    lgb_train,
    num_boost_round=100,
    valid_sets=[lgb_train, lgb_eval],
    callbacks=[early_stop],
)

# Predict on the held-out rows and threshold the scores at 0.5 to get 0/1 labels
lgb_scores = lgb_model.predict(X_test)
lgb_pred = (lgb_scores > 0.5).astype(int)

print("\nLightGBM模型评估结果：")
print(classification_report(y_test, lgb_pred))

# 5. Feature importance analysis (LightGBM)
importance_values = lgb_model.feature_importance()
feature_importance = pd.DataFrame({
    'feature': features,
    'importance': importance_values,
})
print("\nLightGBM特征重要性：")
print(feature_importance.sort_values('importance', ascending=False))

In [6]:
## Save the model
# Write the trained LightGBM booster to a text model file; as the last
# expression in the cell, the returned Booster is echoed as the cell output.
lgb_model.save_model('./models/lightgbm_model.txt')

<lightgbm.basic.Booster at 0x15b040410>

In [7]:
# Imports used to load the saved models and encoders
import joblib
import pickle

# 加载模型并进行推理
def preprocess_data(data, label_encoders):
    """Encode the categorical columns of ``data`` and return the model features.

    The encoding is applied to a copy, so the caller's DataFrame is left
    untouched and the function can be called repeatedly on the same raw input
    (encoding in place would make a second call see already-encoded values).

    Args:
        data: DataFrame of raw inputs. It must contain every column named in
            the module-level ``features`` list.
        label_encoders: Mapping from column name to a fitted encoder exposing
            ``transform``; it is indexed with each name in the module-level
            ``categorical_cols`` that is present in ``data``.

    Returns:
        A new DataFrame holding the ``features`` columns, in that order, with
        the categorical columns replaced by their encoded values.

    Raises:
        KeyError: If a column listed in ``features`` is missing from ``data``
            or an encoder is missing for a categorical column.
        Exception: Whatever the encoder's ``transform`` raises (for example on
            labels it was not fitted on) propagates unchanged.
    """
    encoded = data.copy()
    for col in categorical_cols:
        # Columns absent from the input are skipped here; a missing feature
        # column is reported by the selection below.
        if col in encoded:
            encoded[col] = label_encoders[col].transform(encoded[col])
    return encoded[features]

def predict_watch(data, model_type='lr'):
    """Predict the watch label for ``data`` with one of the saved models.

    Args:
        data: DataFrame of raw (un-encoded) inputs; a copy is handed to
            ``preprocess_data``.
        model_type: ``'lr'`` for the saved logistic regression model or
            ``'lgb'`` for the saved LightGBM booster (case-insensitive).

    Returns:
        The output of the logistic regression model's ``predict`` for
        ``'lr'``, or the booster's predictions thresholded at 0.5 and cast to
        int for ``'lgb'``.

    Raises:
        ValueError: If ``model_type`` is neither ``'lr'`` nor ``'lgb'``.
    """
    # Restore the label encoders that were saved at training time
    with open('./models/label_encoders.pkl', 'rb') as encoder_file:
        label_encoders = pickle.load(encoder_file)

    # Encode a copy of the input into the feature layout the models expect
    model_input = preprocess_data(data.copy(), label_encoders)

    # Dispatch on the requested model
    model_kind = model_type.lower()
    if model_kind == 'lr':
        classifier = joblib.load('./models/logistic_regression_model.pkl')
        # predict() is used here; predict_proba is the alternative method
        return classifier.predict(model_input)
    if model_kind == 'lgb':
        booster = lgb.Booster(model_file='./models/lightgbm_model.txt')
        # Threshold the booster's predictions at 0.5 to obtain 0/1 labels
        return (booster.predict(model_input) > 0.5).astype(int)

    raise ValueError("Unsupported model type. Use 'lr' or 'lgb'.")

In [34]:
## Run inference
# Example: predict on a small hand-written sample
sample_columns = [
    'gender', 'age', 'region', 'user_hot', 'video_type',
    'video_length', 'video_quality', 'is_exceed_5s', 'stay_time',
]
sample_rows = [
    ('男', 25, '北京', '高', '娱乐', 120, '高清', 1, 120),
    ('女', 30, '上海', '中', '教育', 180, '高清', 1, 150),
    ('男', 35, '广州', '低', '科技', 90, '标清', 0, 10),
]
sample_data = pd.DataFrame(sample_rows, columns=sample_columns)

# Predict with the logistic regression model
lr_predictions = predict_watch(sample_data, 'lr')
print("\n逻辑回归模型预测结果：", lr_predictions, sep="\n")

# Predict with the LightGBM model
lgb_predictions = predict_watch(sample_data, 'lgb')
print("\nLightGBM模型预测结果：", lgb_predictions, sep="\n")

# Attach both predictions to the sample rows and show them side by side
sample_data['LR_Prediction'] = lr_predictions
sample_data['LGB_Prediction'] = lgb_predictions
print("\n预测结果详情：", sample_data, sep="\n")


逻辑回归模型预测结果：
[1 1 0]

LightGBM模型预测结果：
[1 1 0]

预测结果详情：
  gender  age region user_hot video_type  video_length video_quality  \
0      男   25     北京        高         娱乐           120            高清   
1      女   30     上海        中         教育           180            高清   
2      男   35     广州        低         科技            90            标清   

   is_exceed_5s  stay_time  LR_Prediction  LGB_Prediction  
0             1        120              1               1  
1             1        150              1               1  
2             0         10              0               0  


In [9]:
# Rebuild the raw sample frame (one tuple per row) for step-by-step debugging
sample_columns = [
    'gender', 'age', 'region', 'user_hot', 'video_type',
    'video_length', 'video_quality', 'is_exceed_5s', 'stay_time',
]
sample_rows = [
    ('男', 25, '北京', '高', '娱乐', 120, '高清', 1, 120),
    ('女', 30, '上海', '中', '教育', 180, '高清', 1, 150),
    ('男', 35, '广州', '低', '科技', 90, '标清', 0, 10),
]
sample_data = pd.DataFrame(sample_rows, columns=sample_columns)

# Independent copy used by the inspection cells that follow
a = sample_data.copy()

In [10]:
# Display the copied sample frame
a

Unnamed: 0,gender,age,region,user_hot,video_type,video_length,video_quality,is_exceed_5s,stay_time
0,男,25,北京,高,娱乐,120,高清,1,120
1,女,30,上海,中,教育,180,高清,1,150
2,男,35,广州,低,科技,90,标清,0,10


In [24]:
# Reload the per-column label encoders from disk.
# The pickle file is the one written by this notebook's save cell; do not
# point this at an untrusted file, since unpickling can execute code.
with open('./models/label_encoders.pkl', 'rb') as f:
    label_encoders = pickle.load(f)

In [12]:
# Inspect the list of columns that are label-encoded
categorical_cols

['gender', 'region', 'user_hot', 'video_type', 'video_quality']

In [25]:
# Sanity check: encode the sample 'gender' column with the reloaded encoder
label_encoders['gender'].transform(a['gender'])

array([1, 0, 1])

In [None]:
def preprocess_data(data, label_encoders):
    """Encode the categorical columns of ``data`` and return the model features.

    The encoding is applied to a copy, so the caller's DataFrame is left
    untouched and the function can be called repeatedly on the same raw input
    (encoding in place would make a second call see already-encoded values).

    Args:
        data: DataFrame of raw inputs. It must contain every column named in
            the module-level ``features`` list.
        label_encoders: Mapping from column name to a fitted encoder exposing
            ``transform``; it is indexed with each name in the module-level
            ``categorical_cols`` that is present in ``data``.

    Returns:
        A new DataFrame holding the ``features`` columns, in that order, with
        the categorical columns replaced by their encoded values.

    Raises:
        KeyError: If a column listed in ``features`` is missing from ``data``
            or an encoder is missing for a categorical column.
        Exception: Whatever the encoder's ``transform`` raises (for example on
            labels it was not fitted on) propagates unchanged.
    """
    encoded = data.copy()
    for col in categorical_cols:
        # Columns absent from the input are skipped here; a missing feature
        # column is reported by the selection below.
        if col in encoded:
            encoded[col] = label_encoders[col].transform(encoded[col])
    return encoded[features]

In [33]:
# Run the preprocessing helper on the sample frame and display the encoded features
preprocess_data(a,label_encoders)

Unnamed: 0,gender,age,region,user_hot,video_type,video_length,video_quality,is_exceed_5s,stay_time
0,1,25,1,2,0,120,1,1,120
1,0,30,0,0,1,180,1,1,150
2,1,35,2,1,3,90,0,0,10


In [30]:
# Sanity check: encode literal gender labels with the reloaded encoder
label_encoders['gender'].transform(['男','女'])

array([1, 0])

In [31]:
# Inspect the 'gender' column of the sample frame
a['gender']

0    男
1    女
2    男
Name: gender, dtype: object

In [32]:
# Sanity check: encode the sample 'gender' column with the reloaded encoder
label_encoders['gender'].transform(a['gender'])

array([1, 0, 1])