In [29]:
import numpy as np
import pandas as pd

df_learning = pd.read_csv("../data-source/dataset.csv")

In [30]:
import jieba

def cutReview(x): return ' '.join(jieba.lcut(str(x), cut_all=False))

df_learning.loc[:, '类型'] = df_learning['类型'].fillna(
    '')
df_learning.loc[:, '交易对方'] = df_learning['交易对方'].fillna(
    '').apply(cutReview)
df_learning.loc[:, '商品名称'] = df_learning['商品名称'].fillna(
    '').apply(cutReview)


# 模型训练

In [31]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import PCA
from scipy.sparse import hstack

# 去除词频小于2的词
countvec = CountVectorizer(min_df=2)

# 构建特征向量
product_vector = countvec.fit_transform(df_learning['商品名称'])
saler_vector = countvec.fit_transform(df_learning['交易对方'])

# 合并特征向量
feature_vector = hstack((product_vector, saler_vector))

# 特征降维
pca = PCA(n_components=50)
feature_vector_pca = pca.fit_transform(feature_vector.toarray())

In [32]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# 按照 7:3 划分训练集和测试集
x_train, x_test, y_train, y_test = train_test_split(
    feature_vector_pca, df_learning['类型'], test_size=0.3)

svm_model = SVC(kernel='rbf', probability=True, random_state=1)
svm_model.fit(x_train, y_train)

y_predict = svm_model.predict(x_test)

# 计算准确度
acc = accuracy_score(y_test, y_predict)
print("训练模型准确率为：{:.2f}%".format(acc * 100))


训练模型准确率为：66.21%


# 预测集填入

In [33]:
df_new = pd.read_csv("../out/merged.csv")

df_predict = df_new.copy()

def cutReview(x): return ' '.join(jieba.lcut(str(x), cut_all=False))

df_predict.loc[:, '交易对方'] = df_predict['交易对方'].fillna(
    '').apply(cutReview)
df_predict.loc[:, '商品名称'] = df_predict['商品名称'].fillna(
    '').apply(cutReview)

df_predict

Unnamed: 0,交易对方,交易时间,交易来源,商品名称,备注,开销,收/支,月度,类型,金额(元)
0,北京 萨莉亚 餐饮 管理 有限公司,2023-04-05 20:13:06,支付宝,萨莉亚 昌平 回龙观 店,,-83.00,支出,2023-04,,83.00
1,物美 超市 ( 沙河店 ),2023-04-07 20:33:47,支付宝,条码 支付,,-55.57,支出,2023-04,,55.57
2,北京 穆香源 牛羊肉,2023-04-08 16:41:12,支付宝,收钱 码 收款,,-46.00,支出,2023-04,,46.00
3,北京 优品 酷卖 科技 有限公司,2023-04-09 16:31:21,支付宝,barPay,,-40.99,支出,2023-04,,40.99
4,iCloud 由云上 贵州 运营,2023-04-10 07:52:28,支付宝,App Store & Apple Music ; Purchases ...,,-21.00,支出,2023-04,,21.00
...,...,...,...,...,...,...,...,...,...,...
102,战狼,2023-07-02 19:24:53,支付宝,收钱 码 收款,,-26.00,支出,2023-07,,26.00
103,愚人,2023-07-02 19:42:54,微信支付,收款 方 备注 : 二维码 收款,,-15.00,支出,2023-07,,15.00
104,高德 打车,2023-07-02 20:23:03,支付宝,高德 地图 打车 订单,,-31.12,支出,2023-07,,31.12
105,沙河店,2023-07-03 20:10:34,支付宝,订单 ： 679602443926,,-54.55,支出,2023-07,,54.55


In [34]:
# 去除词频小于2的词
countvec = CountVectorizer(min_df=2)

# 构建特征向量
product_vector = countvec.fit_transform(df_predict['商品名称'])
saler_vector = countvec.fit_transform(df_predict['交易对方'])

# 合并特征向量
feature_vector = hstack((product_vector, saler_vector))

# 特征降维
pca = PCA(n_components=50)
feature_vector_pca = pca.fit_transform(feature_vector.toarray())

# 结果预测

In [35]:
confidence_threshold = 0.35

y_predict = svm_model.predict(feature_vector_pca)
confidence_scores = svm_model.predict_proba(feature_vector_pca)

# 根据阈值过滤不可信的结果
for i in range(len(y_predict)):
    if np.max(confidence_scores[i]) < confidence_threshold:
        y_predict[i] = ''

df_new['类型'] = y_predict

df_new


Unnamed: 0,交易对方,交易时间,交易来源,商品名称,备注,开销,收/支,月度,类型,金额(元)
0,北京萨莉亚餐饮管理有限公司,2023-04-05 20:13:06,支付宝,萨莉亚昌平回龙观店,,-83.00,支出,2023-04,,83.00
1,物美超市(沙河店),2023-04-07 20:33:47,支付宝,条码支付,,-55.57,支出,2023-04,餐饮,55.57
2,北京穆香源牛羊肉,2023-04-08 16:41:12,支付宝,收钱码收款,,-46.00,支出,2023-04,旅行,46.00
3,北京优品酷卖科技有限公司,2023-04-09 16:31:21,支付宝,barPay,,-40.99,支出,2023-04,,40.99
4,iCloud 由云上贵州运营,2023-04-10 07:52:28,支付宝,App Store & Apple Music; Purchases on 04.10,,-21.00,支出,2023-04,,21.00
...,...,...,...,...,...,...,...,...,...,...
102,战狼,2023-07-02 19:24:53,支付宝,收钱码收款,,-26.00,支出,2023-07,旅行,26.00
103,愚人,2023-07-02 19:42:54,微信支付,收款方备注:二维码收款,,-15.00,支出,2023-07,餐饮,15.00
104,高德打车,2023-07-02 20:23:03,支付宝,高德地图打车订单,,-31.12,支出,2023-07,餐饮,31.12
105,沙河店,2023-07-03 20:10:34,支付宝,订单：679602443926,,-54.55,支出,2023-07,餐饮,54.55


In [36]:
df_new.to_csv("../out/predict_result.csv", index=False, header=True, encoding='utf-8')