In [1]:
from textrank4zh import TextRank4Keyword,TextRank4Sentence
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import numpy as np
import jieba.analyse
import os
import pickle
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import GradientBoostingRegressor

In [2]:
# Keyword extraction helper
def get_keywords(x, top_k=10):
    """Return the keywords of ``x`` that TextRank and TF-IDF both rank highly.

    Two independent extractors each propose their ``top_k`` best keywords and
    only the words proposed by both are kept, on the assumption that agreement
    between the two methods gives a more reliable result.

    Parameters
    ----------
    x : str or missing value
        Text to analyse. Anything ``pd.isna`` reports as missing is passed
        through as ``np.nan``.
    top_k : int, default 10
        Number of top-ranked keywords requested from each extractor before
        intersecting them.

    Returns
    -------
    list or float
        The shared keywords, ordered by their TextRank rank, or ``np.nan``
        when ``x`` is missing.
    """
    if pd.isna(x):
        return np.nan

    # TextRank candidates (words of at least two characters).
    # NOTE(review): `train` and a `get_keywords` that returns plain words look
    # like the API of an early textrank4zh release -- confirm the pinned version.
    tr4w = TextRank4Keyword(stop_words_file='./base_data/stopword.data')
    tr4w.train(text=x, speech_tag_filter=True, lower=True, window=2)
    keywords_textrank = tr4w.get_keywords(top_k, word_min_len=2)

    # TF-IDF candidates from jieba. The trailing slice keeps top_k == 0
    # meaning "no keywords" (jieba treats a falsy topK as "return all").
    keywords_tfidf = jieba.analyse.extract_tags(x, topK=top_k)[:top_k]

    # Keep only words both methods agree on. Walking the TextRank list (instead
    # of converting a set back to a list) makes the order reproducible.
    shared = set(keywords_tfidf)
    return [word for word in dict.fromkeys(keywords_textrank) if word in shared]

In [3]:
# Load the raw data and keep only the rows that have both a target value
# (Fatality_Rate) and a Narrative to extract keywords from.
df = pd.read_csv('./raw_data/datawithcor.csv').dropna(
    subset=['Fatality_Rate', 'Narrative']
)

In [4]:
# Extract the keyword list of every narrative into a new 'key_words' column.
df['key_words'] = df['Narrative'].apply(get_keywords)

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\86135\AppData\Local\Temp\jieba.cache
Loading model cost 1.580 seconds.
Prefix dict has been built successfully.


In [11]:
# Name of the target column (y) and names of the feature columns (x).
# 'key_words' is kept last because the feature-assembly cell selects
# x_columns[:-1] as the plain columns and adds the keywords as TF-IDF instead.
y_columns = 'Fatality_Rate'
x_columns = [
    'year',
    'Type',
    'Operator',
    'Lat',
    'Long',
    'key_words',
]

In [6]:
# Preprocessing: replace Lat and Long by their absolute values.
# NOTE(review): if these are signed coordinates, this discards the hemisphere
# information -- confirm that is intended.
for coord in ('Lat', 'Long'):
    df[coord] = df[coord].abs()

In [7]:
# Type and Operator are expected to take only two values each, so they are
# encoded as 0/1 flags.
# NOTE: Series.map turns any value that is not a key of the dict into NaN, so
# re-running this cell on already-encoded columns would blank them out.
type_dict = {
    'Caproni': 0,
    'Felixstowe': 1,
}
operator_dict = {
    'Caproni': 0,
    'Royal Air Force - RAF': 1,
}
df['Type'] = df['Type'].map(type_dict)
df['Operator'] = df['Operator'].map(operator_dict)

In [8]:
# The keyword lists differ in length and content from row to row, so they are
# turned into a fixed-width numeric representation with TF-IDF.
# First join each row's keyword list into one space-separated string ...
df['key_words_new'] = df['key_words'].apply(' '.join)
# ... then fit the vectorizer on those strings and transform them.
# NOTE(review): the fit uses every row, i.e. it happens before the train/test
# split further down, so the IDF weights also see the test rows.
tf_train = TfidfVectorizer()
tf_result = tf_train.fit_transform(df['key_words_new'].tolist())

In [9]:
# Persist the fitted vocabulary and IDF weights so the same TF-IDF transform
# can be rebuilt later for prediction.
# NOTE: despite its '.json' suffix the vocabulary file is written with pickle;
# the path is left unchanged because renaming it would change behaviour.
for stale_path in ('./base_data/vocabulary.json', './base_data/idfs.npy'):
    if os.path.exists(stale_path):
        os.remove(stale_path)
# The context manager closes the file even if pickling raises.
with open('./base_data/vocabulary.json', 'wb') as vocab_file:
    pickle.dump(tf_train.vocabulary_, vocab_file)
idfs = tf_train.idf_
np.save('./base_data/idfs.npy', idfs)
# Sketch of how to reload the TF-IDF state for prediction.
# NOTE(review): it reads 'idfs_con.npy' / 'vocabulary_con.json', not the files
# written above, and uses `MyVectorizer` and `sp`, which this notebook neither
# defines nor imports -- adapt before use. Only unpickle files you produced.
# idfs = np.load('./base_data/idfs_con.npy')
# x_list = [[contents_[i],add_info[i],x_list_1[i],x_list_2[i]] for i in range(len(x_list_1))]
# vocabulary_import = pickle.load(open('./base_data/vocabulary_con.json','rb'))
# tf_train = MyVectorizer(vocabulary = vocabulary_import)
# tf_train._tfidf._idf_diag = sp.spdiags(idfs,
#                                      diags = 0,
#                                      m = len(idfs),
#                                      n = len(idfs))

In [12]:
# Build the full feature matrix: the plain columns ('year', 'Type', 'Operator',
# 'Lat', 'Long' -- everything in x_columns except the trailing 'key_words')
# placed side by side with the dense TF-IDF matrix of the keywords.
structured_features = df[x_columns[:-1]]
keyword_tfidf = tf_result.toarray()
all_feature = np.hstack([structured_features, keyword_tfidf])

In [14]:
# Model building: hold out 30% of the rows as a test set, with a fixed seed so
# the split is reproducible.
target = df[y_columns]
x_train, x_test, y_train, y_test = train_test_split(
    all_feature,
    target,
    test_size=0.3,
    random_state=0,
)

In [15]:
# LightGBM: fit a gradient-boosted regressor on the training split and report
# its mean squared error on the held-out split.
lgb_params = {
    'objective': 'regression',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'n_estimators': 100,
}
gbm = lgb.LGBMRegressor(**lgb_params)
gbm.fit(x_train, y_train)
y_pre = gbm.predict(x_test)
print('mse: ', mean_squared_error(y_test, y_pre))

mse:  0.1362350917314941


In [None]:
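# Other models (e.g. the GradientBoostingRegressor imported in the first cell) can be added here and evaluated on the same split.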