# In [None]:
# Import the required libraries
from itertools import product

import numpy as np
import pandas as pd
import scipy.stats as scs
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
import statsmodels.tsa.api as smt
import tensorflow as tf
from dateutil.parser import parse
from dateutil.relativedelta import relativedelta
from keras.preprocessing.sequence import TimeseriesGenerator
from keras.wrappers.scikit_learn import KerasRegressor
from matplotlib import pyplot as plt
from pandas import read_csv, concat
from scipy.interpolate import CubicSpline
from scipy.optimize import minimize
from sklearn.metrics import r2_score, median_absolute_error, mean_absolute_error, mean_squared_error, mean_squared_log_error
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit, ParameterGrid
from sklearn.model_selection import KFold
from sklearn.preprocessing import MinMaxScaler
from statsmodels.tsa.seasonal import seasonal_decompose
from tensorflow import keras
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Dense, LSTM, Bidirectional, Dropout
from tensorflow.keras.metrics import MeanAbsolutePercentageError
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tqdm import tqdm

# Configure Matplotlib so that Chinese labels and the minus sign render correctly
plt.rcParams['font.sans-serif'] = ['SimHei']  # font used to display Chinese labels
plt.rcParams['axes.unicode_minus'] = False  # needed to display the minus sign properly

# Load the NEV sales data; column 0 of the sheet is used as the index
df_sale = pd.read_excel('E:\\qjy\\ecnu\\毕业论文\\新能源汽车数据\\中国-新能源汽车数据-已处理\\中国新能源汽车销量及其影响因素-已合并.xlsx', index_col=0)

# Keep only the variables whose absolute correlation with 'nev_sale'
# is greater than 0.5
correlation_matrix = df_sale.corr()
nev_sale_corr = correlation_matrix['nev_sale']
relevant_vars = nev_sale_corr[nev_sale_corr.abs() > 0.5].index
# Take an explicit copy: columns of df_sale1 are overwritten further down, and
# assigning into a frame that was sliced out of df_sale triggers pandas'
# SettingWithCopyWarning.
df_sale1 = df_sale[relevant_vars].copy()

# Cubic-spline interpolation helper
def spline_interpolation(series):
    """Evaluate a cubic spline fitted to the non-missing values of ``series``.

    The spline is fitted through the observations that remain after
    ``dropna()``, using the Julian date of each index entry as the x
    coordinate, and is then evaluated at every index entry of ``series``
    (observed and missing positions alike).

    Args:
        series: pandas Series whose index provides ``to_julian_date()``.

    Returns:
        Array with one spline value per entry of ``series``.
    """
    observed = series.dropna()
    spline = CubicSpline(observed.index.to_julian_date(), observed.values)
    return spline(series.index.to_julian_date())

# Replace every column with its spline-interpolated values
for col_name in df_sale1.columns:
    interpolated = spline_interpolation(df_sale1[col_name])
    df_sale1[col_name] = interpolated

# Skip the first 24 rows. Per the original note these are the rows where the
# charging-pile count is missing, so the data then starts in January 2016.
df_sale2 = df_sale1[24:]

# Draw one line chart per variable
for col_name in df_sale2.columns:
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.plot(df_sale2.index, df_sale2[col_name], label=col_name)
    ax.set_title('{}图'.format(col_name))
    ax.set_xlabel('时间')
    ax.set_ylabel('值')
    ax.legend()
    plt.show()

# Build the supervised-learning samples (input windows X, target windows Y)
def createXY(dataset, look_back=12, horizon=12):
    """Slice a 2-D array into sliding input windows and multi-step targets.

    Sample ``i`` uses rows ``i .. i+look_back-1`` (all columns) as input and
    column 0 of the following ``horizon`` rows as target. Only windows whose
    target lies completely inside ``dataset`` are produced.

    Args:
        dataset: 2-D numpy array, rows are time steps and columns are
            features; column 0 is the value being forecast.
        look_back: number of consecutive rows in each input window.
        horizon: number of consecutive rows in each target window. The
            default of 12 reproduces the previously hard-coded behaviour.

    Returns:
        Tuple ``(X, Y)`` of numpy arrays. With at least one sample, ``X`` has
        shape ``(n_samples, look_back, n_features)`` and ``Y`` has shape
        ``(n_samples, horizon)``. If ``dataset`` is too short for a single
        sample, both arrays are empty.
    """
    # A non-positive count makes range() empty, which yields empty arrays.
    n_samples = len(dataset) - look_back - horizon + 1
    X = [dataset[i:i + look_back, :] for i in range(n_samples)]
    Y = [dataset[i + look_back:i + look_back + horizon, 0] for i in range(n_samples)]
    return np.array(X), np.array(Y)

# Hyper-parameter search space
param_grid = {
    'neurons': [5, 10, 15, 20],
    'batch_size': [8, 16, 32],
    'epochs': [100, 200, 500],
    'learn_rate': [0.001, 0.01],
    'drop_out': [0.1, 0.2]
}

# Enumerate every combination of the search space
grid = ParameterGrid(param_grid)

# One record per (parameter combination, repetition)
results = []

# --- Data preparation --------------------------------------------------------
# The split, the scaling and the windowing do not depend on the
# hyper-parameters, so they are done once instead of once per training run.

# Hold out the last 12 rows for testing. df_for_testing1 additionally contains
# the 12 rows before them, which form the input window for the held-out rows.
df_for_training = df_sale2[:-12]
# Explicit copy: a 'Predictions' column is written into this frame below, and
# assigning into a slice of df_sale2 triggers SettingWithCopyWarning.
df_for_testing = df_sale2[-12:].copy()
df_for_testing1 = df_sale2[-12-12:]

# Scale to [0, 1]; the scaler is fitted on the training rows only
scaler = MinMaxScaler(feature_range=(0, 1))
df_for_training_scaled = scaler.fit_transform(df_for_training)
df_for_testing_scaled1 = scaler.transform(df_for_testing1)

trainX, trainY = createXY(df_for_training_scaled, 12)
# 24 rows with look_back 12 and a 12-step target give exactly one test sample
testX1, testY1 = createXY(df_for_testing_scaled1, 12)

# Take the network input shape from the data instead of hard-coding it
n_steps, n_features = trainX.shape[1], trainX.shape[2]

# --- Grid search -------------------------------------------------------------
# Train 10 models per parameter combination and record the test MAPE of each
for params in grid:
    for repetition in range(10):
        print(f'Current params: {params}, repetition {repetition + 1}/10')

        # Release the state of the models built in earlier iterations; without
        # this, memory use keeps growing across the many models created here.
        tf.keras.backend.clear_session()

        # Two stacked LSTM layers, dropout, and one output unit per forecast month
        model = Sequential([
            LSTM(params['neurons'], activation='relu', return_sequences=True,
                 input_shape=(n_steps, n_features)),
            LSTM(params['neurons']),
            Dropout(params['drop_out']),
            Dense(12)
        ])

        model.compile(optimizer=Adam(learning_rate=params['learn_rate']),
                      loss='mean_squared_error',
                      metrics=[MeanAbsolutePercentageError()])

        model.fit(trainX, trainY, epochs=params['epochs'], batch_size=params['batch_size'], verbose=0)

        # Forecast the 12 held-out months from the preceding 12-row window
        prediction = model.predict(testX1).reshape(12, 1)
        # The scaler expects n_features columns, so the forecast is repeated
        # across all columns and only column 0 is read back.
        # NOTE(review): this assumes 'nev_sale' is column 0 of df_sale2 — confirm.
        prediction_copies_array = np.repeat(prediction, n_features, axis=-1)
        pred = scaler.inverse_transform(prediction_copies_array)[:, 0]

        df_for_testing['Predictions'] = pred
        mape = np.mean(np.abs((df_for_testing['nev_sale'] - df_for_testing['Predictions'])) / df_for_testing['nev_sale']) * 100

        # Record the outcome of this run
        results.append({
            'repetition': repetition + 1,
            'neurons': params['neurons'],
            'batch_size': params['batch_size'],
            'epochs': params['epochs'],
            'learn_rate': params['learn_rate'],
            'drop_out': params['drop_out'],
            'MAPE': mape
        })

        print(f'Parameters: {params}')
        print(f'Test MAPE: {mape}')

# Collect all runs in a DataFrame and write them to an Excel file
results_df = pd.DataFrame(results)
results_df.to_excel('毕业论文数据分析结果//LSTM_MAPE.xlsx', index=False)

# Rebuild the model and train it on the full data set.
# NOTE(review): the hyper-parameters below are hard-coded; presumably they were
# picked from the grid-search results — confirm.
scaler = MinMaxScaler(feature_range=(0, 1))
df_scaled = scaler.fit_transform(df_sale2)

trainX, trainY = createXY(df_scaled, 12)

# Take the network input shape from the data instead of hard-coding 12 features
n_steps, n_features = trainX.shape[1], trainX.shape[2]

model = Sequential([
    LSTM(20, activation='relu', return_sequences=True, input_shape=(n_steps, n_features)),
    LSTM(20),
    Dropout(0.2),
    Dense(12)
])

model.compile(optimizer=Adam(learning_rate=0.001), loss='mean_squared_error')

# Train on every available window
model.fit(trainX, trainY, epochs=200, batch_size=16, verbose=0)

# Forecast the next 12 steps from the last 12 observed rows
df_pred = df_scaled[-12:]
predX = df_pred.reshape(1, n_steps, n_features)
all_prediction = model.predict(predX).reshape(12, 1)
# The scaler expects n_features columns, so the forecast is repeated across
# all columns and only column 0 is read back.
# NOTE(review): this assumes 'nev_sale' is column 0 of df_sale2 — confirm.
all_prediction_copies_array = np.repeat(all_prediction, n_features, axis=-1)
predicted_sales = scaler.inverse_transform(all_prediction_copies_array)[:, 0]

df_nev_sale = df_sale2[['nev_sale']]

# Index the forecast with 12 monthly dates starting in January 2024 and append
# it to the history.
# NOTE(review): this assumes the observed data ends in December 2023 — confirm.
dates_2024 = pd.date_range(start='2024-01-01', periods=12, freq='M')
df_predicted = pd.DataFrame(data=predicted_sales, index=dates_2024, columns=['nev_sale'])
df_full = pd.concat([df_nev_sale, df_predicted])

# Plot history plus forecast (blue), with the forecast part overlaid in dashed red
plt.figure(figsize=(14, 7))
plt.plot(df_full.index, df_full['nev_sale'], label='Historical and Predicted Sales', color='blue')
plt.plot(df_predicted.index, df_predicted['nev_sale'], label='Predicted Sales 2024', color='red', linestyle='--')
plt.title('Monthly NEV Sales from 2016 to 2024')
plt.xlabel('Year')
plt.ylabel('NEV Sales')
plt.legend()
plt.grid(True)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# Permutation feature importance for the LSTM
# Switches for the computation
COMPUTE_LSTM_IMPORTANCE = 1
ONE_FOLD_ONLY = 1
NUM_FOLDS = 10

# The tf.distribute strategy that is currently in effect
gpu_strategy = tf.distribute.get_strategy()
# Labels of the runs performed per fold. Run 0 predicts on the untouched
# validation data and run k (k >= 1) shuffles feature k-1, so the labels are
# 'BASELINE' followed by the feature names. Without the extra entry every
# result was filed under the wrong feature and the last feature was never
# shuffled.
COLS = ['BASELINE'] + list(df_for_training.columns)

# Scaling parameters of the target (column 0): scaled = raw * scale_ + min_.
# They are used to express the MAPE in original units; on the min-max-scaled
# values the denominator can be 0 or close to 0, which blows the percentage up.
# NOTE(review): this assumes 'nev_sale' is column 0 and that the loaded model
# was trained on data scaled with this scaler — confirm.
target_min, target_scale = scaler.min_[0], scaler.scale_[0]

# K-fold loop inside the strategy scope
with gpu_strategy.scope():
    kf = KFold(n_splits=NUM_FOLDS, shuffle=True, random_state=2021)
    for fold, (train_idx, test_idx) in enumerate(kf.split(trainX, trainY)):
        tf.keras.backend.clear_session()  # reset Keras state before loading the model again

        print('-'*15, '>', f'Fold {fold+1}', '<', '-'*15)

        # Split into training and validation samples. Fancy indexing copies,
        # so shuffling X_valid below does not modify trainX.
        X_train, X_valid = trainX[train_idx], trainX[test_idx]
        y_train, y_valid = trainY[train_idx], trainY[test_idx]

        # Load an already trained model from disk.
        # NOTE(review): nothing in the visible code writes this file — confirm
        # that it exists and matches the current feature set.
        model = keras.models.load_model('C:\\Users\\lenovo\\Desktop\\lstm.h5')

        if COMPUTE_LSTM_IMPORTANCE:
            results = []
            print(' Computing LSTM feature importance...')

            y_valid_units = (y_valid - target_min) / target_scale

            # Run 0 is the baseline; every later run shuffles one feature
            # across the validation samples and measures the error again.
            for k, feature_name in enumerate(tqdm(COLS)):
                if k > 0:
                    save_col = X_valid[:, :, k-1].copy()  # keep the original values
                    np.random.shuffle(X_valid[:, :, k-1])  # shuffle in place along the sample axis

                # Predict on the (possibly shuffled) validation samples
                oof_preds = model.predict(X_valid, verbose=0).squeeze()
                oof_preds_units = (oof_preds - target_min) / target_scale

                # Mean absolute percentage error in original units
                mape = np.mean(np.abs((oof_preds_units - y_valid_units) / y_valid_units)) * 100
                results.append({'feature': feature_name, 'mape': mape})

                # Restore the shuffled feature
                if k > 0:
                    X_valid[:, :, k-1] = save_col

            # Show the result as a horizontal bar chart, sorted by MAPE
            print()
            df = pd.DataFrame(results)
            df = df.sort_values('mape')
            plt.figure(figsize=(8, 8))
            plt.barh(np.arange(len(COLS)), df.mape)
            plt.yticks(np.arange(len(COLS)), df.feature.values)
            plt.title('LSTM Feature Importance', size=16)
            plt.ylim((-1, len(COLS)))
            plt.show()

            # Write the result to a CSV file, largest MAPE first
            df = df.sort_values('mape', ascending=False)
            df.to_csv(f'毕业论文数据分析结果/lstm_feature_importance_fold_mape_{fold}.csv', index=False)

        # Stop after the first fold when requested
        if ONE_FOLD_ONLY:
            break

