In [111]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['font.family'] = 'Microsoft YaHei'
plt.rcParams['axes.unicode_minus'] = False

df_bookkeep = pd.read_csv("../data-source/dataset.csv")
df_bookkeep['交易时间'] = pd.to_datetime(df_bookkeep['交易时间'], format='%Y/%m/%d %H:%M:%S')
df_bookkeep['金额(元)'] = pd.to_numeric(df_bookkeep['金额(元)'])

df_bookkeep['月度'] = df_bookkeep['交易时间'].dt.strftime('%Y-%m')

def process_amount(row):
    amount = row['金额(元)']
    if row['收/支'] == '收入':
        return amount
    else:
        return -amount

df_bookkeep['开销'] = df_bookkeep.apply(process_amount, axis=1)

df_bookkeep.head()


Unnamed: 0,交易时间,类型,金额(元),收/支,交易来源,交易对方,商品名称,备注,月度,开销
0,2018-07-12 11:04:00,交通,368.5,支出,支付宝,中国铁路网络有限公司,火车票,,2018-07,-368.5
1,2018-07-17 23:32:00,住房,1000.0,支出,微信支付,布丁奶奶,转账备注:房租押金,,2018-07,-1000.0
2,2018-07-19 07:07:00,住房,1000.0,收入,微信支付,/,转账备注:房租押金,,2018-07,1000.0
3,2018-07-19 14:07:00,交通,50.0,支出,支付宝,易办事深圳通充值,易办事深圳通充值,,2018-07,-50.0
4,2018-07-19 17:12:00,住房,6500.0,支出,微信支付,布丁奶奶,收款方备注:二维码收款,,2018-07,-6500.0


In [112]:
table_payment = pd.pivot_table(df_bookkeep, index='月度', columns='类型', values='开销', aggfunc='sum')
table_payment.fillna(0, inplace=True)

In [113]:
table_payment["月结余"] = table_payment.sum(axis=1)
table_payment["月支出"] = table_payment["月结余"] - table_payment['工资'] - table_payment['交易']

table_payment.head()

类型,交易,交通,人情,住房,娱乐,工资,旅行,生活,礼物,餐饮,月结余,月支出
月度,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2018-07,0.0,-447.51,-229.17,-8859.0,-5.99,0.0,0.0,-79.9,0.0,-129.5,-9751.07,-9751.07
2018-08,0.0,-40.28,-117.92,-2177.41,-50.0,11826.03,0.0,-336.17,0.0,-799.49,8304.76,-3521.27
2018-09,0.0,-801.6,-61.73,-2100.0,-407.5,11291.75,-190.0,-374.32,-5438.0,-1000.61,917.99,-10373.76
2018-10,0.0,-259.6,204.19,-2138.5,-199.0,12004.1,-5404.03,-92.56,-107.61,-451.4,3555.59,-8448.51
2018-11,0.0,-60.95,1991.6,-2138.5,-5577.2,12004.1,0.0,-518.5,-1611.0,-922.94,3166.61,-8837.49


In [114]:
# 定义应用于总支出列的样式函数
def color_render(val):
    colors = np.where(val >= -5000, 'color: green', np.where(val <= -7000, 'color: red', ''))
    return colors

table_styled = table_payment.style.apply(color_render, subset=["月支出"])

In [115]:
plt.figure(figsize=(20, 10))
type_result = -(table_payment.drop(["工资","月结余", "月支出", "交易"], axis=1).sum(axis=0))

# type_result.plot(kind="bar")

<Figure size 1440x720 with 0 Axes>

In [116]:
plt.figure(figsize=(30, 10))
monthly_result = abs(table_payment["月支出"])

# monthly_result.plot(kind="line")


<Figure size 2160x720 with 0 Axes>

In [117]:
df_learning = df_bookkeep[['交易对方', '商品名称', '类型']]

df_learning.tail(10)

Unnamed: 0,交易对方,商品名称,类型
5334,深圳市地铁相关运营主体,深圳地铁,交通
5335,铁路12306,火车票,交通
5336,李氏冰室,收钱码收款,餐饮
5337,桂阳县第一人民医院,门诊缴费,交通
5338,桂阳县第一人民医院,门诊缴费,生活
5339,单国廷,759公寓退押金,住房
5340,独一佳早餐店,收钱码收款,餐饮
5341,桂阳县华尔学府生活超市,桂阳县华尔学府生活超市,餐饮
5342,铁路12306,火车票,交通
5343,株洲市中心医院,株洲市中心医院,生活


In [118]:
# 分词和预处理
import jieba


def cutReview(x): return ' '.join(jieba.lcut(str(x), cut_all=False))

df_learning.loc[:, '交易对方'] = df_learning['交易对方'].fillna('').apply(cutReview)
df_learning.loc[:, '商品名称'] = df_learning['商品名称'].fillna('').apply(cutReview)

df_learning


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_learning.loc[:, '交易对方'] = df_learning['交易对方'].fillna('').apply(cutReview)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_learning.loc[:, '商品名称'] = df_learning['商品名称'].fillna('').apply(cutReview)


Unnamed: 0,交易对方,商品名称,类型
0,中国 铁路 网络 有限公司,火车票,交通
1,布丁 奶奶,转账 备注 : 房租 押金,住房
2,/,转账 备注 : 房租 押金,住房
3,易 办事 深圳 通 充值,易 办事 深圳 通 充值,交通
4,布丁 奶奶,收款 方 备注 : 二维码 收款,住房
...,...,...,...
5339,单国廷,759 公寓 退 押金,住房
5340,独一 佳 早餐 店,收钱 码 收款,餐饮
5341,桂阳县 华尔 学府 生活 超市,桂阳县 华尔 学府 生活 超市,餐饮
5342,铁路 12306,火车票,交通


In [131]:
from sklearn.feature_extraction.text import CountVectorizer

# 去除词频小于2的词
countvec = CountVectorizer(min_df=2)

# 构建特征向量矩阵
Wmatrix = countvec.fit_transform(df_learning['商品名称'])

# print(Wmatrix)

  (0, 1032)	1
  (1, 1374)	1
  (1, 537)	1
  (1, 752)	1
  (1, 780)	1
  (2, 1374)	1
  (2, 537)	1
  (2, 752)	1
  (2, 780)	1
  (3, 991)	1
  (3, 326)	1
  (4, 537)	1
  (4, 803)	2
  (4, 269)	1
  (5, 497)	1
  (6, 537)	1
  (6, 803)	2
  (6, 269)	1
  (7, 616)	1
  (7, 1489)	1
  (7, 480)	1
  (7, 1333)	1
  (7, 908)	1
  (8, 280)	1
  (9, 1092)	1
  :	:
  (5331, 765)	1
  (5332, 1374)	2
  (5332, 537)	1
  (5332, 717)	1
  (5333, 537)	1
  (5333, 803)	2
  (5333, 269)	1
  (5334, 991)	1
  (5334, 523)	1
  (5335, 1032)	1
  (5336, 803)	1
  (5336, 805)	1
  (5337, 1220)	1
  (5337, 1441)	1
  (5338, 1220)	1
  (5338, 1441)	1
  (5339, 780)	1
  (5339, 348)	1
  (5339, 115)	1
  (5340, 803)	1
  (5340, 805)	1
  (5341, 1361)	1
  (5341, 1089)	1
  (5342, 1032)	1
  (5343, 910)	1


In [132]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    Wmatrix, df_learning['类型'], test_size=0.3)


In [133]:
from sklearn.svm import SVC

svm_model = SVC(kernel='rbf', verbose=True)
svm_model.fit(x_train, y_train)

y_predict = svm_model.predict(x_test)

print("准确率：", svm_model.score(x_test, y_test))
y_predict

ValueError: Input contains NaN