## 年级中影响叙述能力的分析

In [7]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import pearsonr

### 加载数据

In [8]:
## 加载未融合特征的全部特征表
full_data = pd.read_csv('./pre_data/全部特征合并表.csv')

full_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300 entries, 0 to 299
Data columns (total 36 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   文本编号                 300 non-null    object 
 1   单词数量-DESWC-03        300 non-null    int64  
 2   句子数量-DESSC-02        300 non-null    int64  
 3   年级                   300 non-null    int64  
 4   多样性-LDVOCDa（51）      300 non-null    float64
 5   密度-WRDFRQC（94）       300 non-null    float64
 6   复杂度-K1               300 non-null    float64
 7   复杂度-K2               300 non-null    float64
 8   复杂度-K3               300 non-null    float64
 9   复杂度-K4               300 non-null    float64
 10  复杂度-K5               300 non-null    float64
 11  复杂度-K6+              300 non-null    float64
 12  短语长度-SYNLE（69）       300 non-null    float64
 13  短语长度-SYNNP（70）       300 non-null    float64
 14  短语密度-DRNP（76）        300 non-null    float64
 15  短语密度-DRVP（77）        300 non-null    flo

In [9]:
### 加载融合特征的全部特征表
import pandas as pd

full_data_combined = pd.read_csv('./tmp/全部特征合并表_融合.csv')

# 用每一列的均值填充该列的空值(不包括文本编号列)
for col in full_data_combined.columns:
    if col != '文本编号':
        full_data_combined[col].fillna(full_data_combined[col].mean(), inplace=True)

full_data_combined.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300 entries, 0 to 299
Data columns (total 19 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   文本编号                    300 non-null    object 
 1   单词数量-DESWC-03           300 non-null    float64
 2   句子数量-DESSC-02           300 non-null    float64
 3   年级                      300 non-null    int64  
 4   多样性-LDVOCDa（51）         300 non-null    float64
 5   密度-WRDFRQC（94）          300 non-null    float64
 6   复杂度-K                   300 non-null    float64
 7   句子长度-DESSL（06）          300 non-null    float64
 8   短语长度-PhraseLength       300 non-null    float64
 9   短语密度-PhraseDensity      300 non-null    float64
 10  句子结构-SentenceStructure  300 non-null    float64
 11  句法相似-SYNSTRUT           300 non-null    float64
 12  局部连贯-LocalCoherence     300 non-null    float64
 13  整体连贯-GlobalCoherence    300 non-null    float64
 14  因果衔接-SMCAUSr-64         300 non-null    fl

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  full_data_combined[col].fillna(full_data_combined[col].mean(), inplace=True)


### 数据预处理

In [10]:
def analyze_key_factors(df):    
    # 分离特征和目标变量
    feature_cols = [col for col in df.columns if col not in ['文本编号', '年级']]
    X = df[feature_cols]
    y = df['年级']  # 使用年级作为叙事能力的代理变量
    
    # 标准化特征
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    # 2. 特征重要性分析
    rf = RandomForestRegressor(n_estimators=100, random_state=42)
    rf.fit(X_scaled, y)
    
    # 获取特征重要性
    feature_importance = pd.DataFrame({
        'feature': feature_cols,
        'importance': rf.feature_importances_
    }).sort_values('importance', ascending=False)
    
    # 3. 相关性分析
    correlations = []
    for feature in feature_cols:
        corr, p_value = pearsonr(df[feature], df['年级'])#皮尔逊相关系数
        correlations.append({
            'feature': feature,
            'correlation': corr,
            'p_value': p_value
        })
    
    corr_df = pd.DataFrame(correlations)
    
    # 4. 综合评分
    # 结合特征重要性和相关性
    merged_df = feature_importance.merge(corr_df, on='feature')
    merged_df['composite_score'] = (
        merged_df['importance'] * 0.6 + 
        abs(merged_df['correlation']) * 0.4
    )
    merged_df = merged_df.sort_values('composite_score', ascending=False)
    
    return merged_df, feature_importance, corr_df

In [11]:
# 可视化结果
def visualize_results(merged_df, feature_importance):
    plt.rcParams['font.sans-serif'] = ['PingFang HK']
    plt.rcParams['axes.unicode_minus'] = False       
    plt.rcParams['font.family'] = 'PingFang HK'
    # 绘制特征重要性图
    plt.figure(figsize=(12, 8))
    
    # 前15个最重要的特征
    top_features = merged_df.head(15)
    
    plt.subplot(2, 1, 1)
    plt.barh(range(len(top_features)), top_features['composite_score'])
    plt.yticks(range(len(top_features)), top_features['feature'])
    plt.xlabel('综合评分')
    plt.title('叙事能力关键要素排序')
    
    # 相关性热力图
    plt.subplot(2, 1, 2)
    correlation_matrix = top_features[['importance', 'correlation', 'composite_score']]
    sns.heatmap(correlation_matrix.T, annot=True, cmap='coolwarm', center=0)
    plt.title('关键要素评分相关性')
    
    plt.tight_layout()
    plt.show()
    
    return top_features

In [19]:
# 可视化结果（使用plotly实现，布局美观专业）
import plotly.graph_objs as go
import plotly.subplots as sp

def visualize_results_plotly(merged_df, feature_importance):
    # 取前15个最重要的特征
    top_features = merged_df.head(15)
    
    # 1. 综合评分条形图
    bar_trace = go.Bar(
        x=top_features['composite_score'][::-1],  # 反转顺序，分数高的在上
        y=top_features['feature'][::-1],
        orientation='h',
        marker=dict(
            color=top_features['composite_score'][::-1],
            colorscale='Blues',
            line=dict(color='rgba(58,71,80,0.8)', width=1.5)
        ),
        text=top_features['composite_score'][::-1].round(3),
        textposition='auto',
        name='综合评分'
    )

    # 2. 相关性热力图
    correlation_matrix = top_features[['importance', 'correlation', 'composite_score']].T
    heatmap_trace = go.Heatmap(
        z=correlation_matrix.values,
        x=top_features['feature'],
        y=['重要性', '相关性', '综合评分'],
        colorscale='RdBu',
        colorbar=dict(title='数值'),
        zmid=0
    )

    # 3. 使用plotly的子图布局
    fig = sp.make_subplots(
        rows=2, cols=1,
        row_heights=[0.6, 0.4],
        vertical_spacing=0.12,
        subplot_titles=("叙事能力关键要素排序（综合评分）", "关键要素评分相关性热力图")
    )

    fig.add_trace(bar_trace, row=1, col=1)
    fig.add_trace(heatmap_trace, row=2, col=1)

    # 美化布局
    fig.update_layout(
        height=800,
        width=1100,
        showlegend=False,
        font=dict(family="PingFang SC, Microsoft YaHei, Arial", size=14),
        margin=dict(l=80, r=40, t=80, b=60),
        plot_bgcolor='white',
        paper_bgcolor='white'
    )
    #fig.update_xaxes(title_text="综合评分", row=1, col=1)
    fig.update_yaxes(title_text="特征", row=1, col=1)
    fig.update_xaxes(showgrid=False, row=1, col=1)
    fig.update_yaxes(showgrid=False, row=1, col=1)
    fig.update_xaxes(title_text="", row=2, col=1)
    fig.update_yaxes(title_text="", row=2, col=1)

    fig.show()

    return top_features

In [12]:
merged_df, feature_importance, corr_df = analyze_key_factors(full_data_combined)

In [14]:
merged_df

Unnamed: 0,feature,importance,correlation,p_value,composite_score
0,复杂度-K,0.157743,-0.400663,5.383923e-13,0.254911
1,密度-WRDFRQC（94）,0.110616,-0.395947,1.059315e-12,0.224748
4,短语长度-PhraseLength,0.061412,0.295796,1.797028e-07,0.155166
2,单词数量-DESWC-03,0.106282,0.207064,0.0003053792,0.146595
6,句子长度-DESSL（06）,0.055977,0.256114,7.034158e-06,0.136032
8,因果衔接-SMCAUSr-64,0.053881,-0.174552,0.002413079,0.102149
13,句法相似-SYNSTRUT,0.035818,-0.200092,0.0004892763,0.101528
3,句子结构-SentenceStructure,0.063746,0.103995,0.07208042,0.079846
7,时体衔接-SMTEMP-68,0.055948,0.1,0.0837799,0.073569
12,意图衔接-SMINTEr-65,0.036176,0.124638,0.03091264,0.071561


In [20]:
#visualize_results(merged_df,feature_importance)
visualize_results_plotly(merged_df,feature_importance)

Unnamed: 0,feature,importance,correlation,p_value,composite_score
0,复杂度-K,0.157743,-0.400663,5.383923e-13,0.254911
1,密度-WRDFRQC（94）,0.110616,-0.395947,1.059315e-12,0.224748
4,短语长度-PhraseLength,0.061412,0.295796,1.797028e-07,0.155166
2,单词数量-DESWC-03,0.106282,0.207064,0.0003053792,0.146595
6,句子长度-DESSL（06）,0.055977,0.256114,7.034158e-06,0.136032
8,因果衔接-SMCAUSr-64,0.053881,-0.174552,0.002413079,0.102149
13,句法相似-SYNSTRUT,0.035818,-0.200092,0.0004892763,0.101528
3,句子结构-SentenceStructure,0.063746,0.103995,0.07208042,0.079846
7,时体衔接-SMTEMP-68,0.055948,0.1,0.0837799,0.073569
12,意图衔接-SMINTEr-65,0.036176,0.124638,0.03091264,0.071561


In [21]:
merged_df.to_csv('./res/reslut_inner_grade.csv', index=False)