In [1]:
from textrank4zh import TextRank4Keyword,TextRank4Sentence
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import numpy as np
import jieba.analyse
import os
import pickle
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import GradientBoostingRegressor

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [2]:
# GET_KEYWORDS
def get_keywords(x):
    if pd.isna(x):
        return np.nan
    else:
        # TEXTRANK TOP 20
        tr4w = TextRank4Keyword(stop_words_file='./base_data/stopword.data')
        tr4w.train(text=x, speech_tag_filter=True, lower=True, window=2)
        keywords_textrank = tr4w.get_keywords(10, word_min_len=2)
        # Top_20 TF_IDF words
        keywords_tfidf = jieba.analyse.extract_tags(x)[:10]
        # Intersection
        result_ = list(set(keywords_textrank).intersection(set(keywords_tfidf)))
        return result_

In [3]:
# READ_DATA
df = pd.read_csv('./raw_data/datawithcor.csv')
# DROP NAN
df = df[df.Fatality_Rate.notna()]
df = df[df.Narrative.notna()]

In [4]:
# KEYWORDS EXTRACTION
df['key_words'] = df.Narrative.apply(lambda x: get_keywords(x))

Building prefix dict from the default dictionary ...
Dumping model to file cache /var/folders/pj/mrvgnnps0n92zyj8t94f613r0000gn/T/jieba.cache
Loading model cost 0.747 seconds.
Prefix dict has been built successfully.


In [5]:
# SET X and Y
x_columns = ['year', 'Type', 'Operator', 'Lat', 'Long', 'key_words']
y_columns = 'Fatality_Rate'

In [6]:
# PRE_TRAIN
# Lat, Long ABS
df.Lat = df.Lat.abs()
df.Long = df.Long.abs()

In [7]:
# Order Map
type_dict = {'Caproni': 0, 'Felixstowe': 1}
operator_dict = {'Caproni': 0, 'Royal Air Force - RAF': 1}
df.Type = df.Type.map(type_dict)
df.Operator = df.Operator.map(operator_dict)

In [8]:
# TF_IDF
# 'Pre_Train, join null intro words
df['key_words_new'] = df.key_words.apply(lambda x: ' '.join(x))
# tfidf
tf_train = TfidfVectorizer()
tf_result = tf_train.fit_transform(df.key_words_new.tolist())

In [9]:

if os.path.exists('./base_data/vocabulary.json'):
    os.remove('./base_data/vocabulary.json')
if os.path.exists('./base_data/idfs.npy'):
    os.remove('./base_data/idfs.npy')
f1 = open('./base_data/vocabulary.json', 'wb')
file = tf_train.vocabulary_
pickle.dump(file,f1)
f1.close()
idfs = tf_train.idf_
np.save('./base_data/idfs.npy',idfs)
# 
# idfs = np.load('./base_data/idfs_con.npy')
# x_list = [[contents_[i],add_info[i],x_list_1[i],x_list_2[i]] for i in range(len(x_list_1))]
# vocabulary_import = pickle.load(open('./base_data/vocabulary_con.json','rb'))
# tf_train = MyVectorizer(vocabulary = vocabulary_import)
# tf_train._tfidf._idf_diag = sp.spdiags(idfs,
#                                      diags = 0,
#                                      m = len(idfs),
#                                      n = len(idfs))

In [10]:
# 'Merge TD_IDF'
a = df[x_columns[:-1]]
all_feature = np.hstack((a,  tf_result.toarray()))

In [11]:
# Model Building
# Set split
x_train, x_test, y_train, y_test = train_test_split(all_feature, df[y_columns],  test_size=0.3, random_state=0)

In [12]:
# lgb
gbm = lgb.LGBMRegressor(objective='regression', num_leaves=31, learning_rate=0.05, n_estimators=100)
gbm.fit(x_train, y_train)
y_pre = gbm.predict(x_test)
print('mse: ', mean_squared_error(y_test, y_pre))

mse:  0.1362350917314941


In [20]:
import xgboost as xgb
bat =  xgb.XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.05, max_delta_step=0,
             max_depth=5, min_child_weight=3, missing=None, n_estimators=100,
             n_jobs=1, nthread=None, objective='reg:logistic', random_state=0,
             reg_alpha=0.8, reg_lambda=0.5, scale_pos_weight=1, seed=4396,
             silent=None, subsample=0.8,verbosity=1)
bat.fit(x_train,y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.05, max_delta_step=0,
             max_depth=5, min_child_weight=3, missing=None, n_estimators=100,
             n_jobs=1, nthread=None, objective='reg:logistic', random_state=0,
             reg_alpha=0.8, reg_lambda=0.5, scale_pos_weight=1, seed=4396,
             silent=None, subsample=0.8, verbosity=1)

In [28]:
mean_squared_error(bat.predict(x_test),y_test)
np.mean(x_test)

nan

In [35]:
np.mean(y_test)
np.mean(y_train)

0.44351799921360724