In [322]:
from pathlib import Path
import pandas as pd
import numpy as np
import re
import jieba as jb
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, accuracy_score

In [193]:
data_path = Path('./data')
categories = [d.name for d in data_path.glob('高中*')]
categories
subcategories = {catg: [d.stem for d in data_path.joinpath(f'{catg}/origin/').glob('*.csv')] for catg in categories}
dfs = {
    catg:{
        sub_catg: pd.read_csv(data_path.joinpath(f'{catg}/origin/{sub_catg}.csv')) for sub_catg in subcategories[catg]
    }
    for catg in subcategories
}

cols = ['题目', '题型', '难度', '答案', '解析']

In [194]:
subcategories

{'高中_历史': ['古代史', '现代史', '近代史'],
 '高中_地理': ['人口与城市', '区域可持续发展', '地球与地图', '宇宙中的地球', '生产活动与地域联系'],
 '高中_政治': ['公民道德与伦理常识', '时事政治', '生活中的法律常识', '科学思维常识', '科学社会主义常识', '经济学常识'],
 '高中_生物': ['分子与细胞', '现代生物技术专题', '生物技术实践', '生物科学与社会', '稳态与环境', '遗传与进化']}

In [195]:
split_points = [
    '题型:',
    '难度:',
    '使用次数:',
    '答案：',
    '解析：',
    '[题目]',
    '[知识点：]',
    '知识点：',
]
for s in split_points:
    for i in dfs:
        for j in dfs[i]:
            if not dfs[i][j].item.str.contains(s).all():
                print(f'{i} - {j} does not all contain {s}')

高中_历史 - 现代史 does not all contain 知识点：


In [196]:
def preprocessing(df):
    item_col = df['item'].copy().str.replace('\[题目\]|\[知识点：\]|\\n|（\s*）|\(\s*\)|【解答】|解：', '')
    
    split = item_col.str.split('题型:').str
    df.loc[:, '题目'] = split[0].str.strip()
    
    split = split[1].str.split('\|难度:').str
    df.loc[:, '题型'] = split[0].str.strip()
    
    split = split[1].str.split('\|使用次数:').str
    df.loc[:, '难度'] = split[0].str.strip()
    
    split = split[1].str.split('答案：').str[1].str.split('解析：').str
    df.loc[:, '答案'] = split[0].str.strip()
    
    split = split[1].str.split('知识点：').str
    df.loc[:, '解析'], df.loc[:, '知识点'] = split[0].str.strip(), split[1].str.strip()
    return df

In [197]:
processed_dfs = {
    catg:{
        sub_catg: preprocessing(dfs[catg][sub_catg]) for sub_catg in subcategories[catg]
    }
    for catg in subcategories
}

In [366]:
total_df = pd.DataFrame()
for catg in processed_dfs:
    for sub_catg in processed_dfs[catg]:
        df = processed_dfs[catg][sub_catg][cols].copy()
        df.loc[:, 'combined_text'] = df[cols].fillna('').agg(' '.join, axis=1)
        df.loc[:, 'catg'] = catg
        df.loc[:, 'sub_catg'] = sub_catg
        total_df = pd.concat([total_df, df], axis=0)
total_df = total_df.reset_index(drop=True)

In [368]:
def segment_line(line):
    """
    预处理和分词
    """
    line = re.sub(
            "[a-zA-Z0-9]|[\s+\-\|\!\/\[\]\{\}_,.$%^*(+\"\')]+|[:：+——()?【】《》“”！，。？、~@#￥%……&*（）]+|题目", '',line)
    tokens = jb.cut(line, cut_all=False)
    return " ".join(tokens)

In [369]:
total_df.loc[:, 'split_combined'] = total_df['combined_text'].apply(segment_line)

In [370]:
for col in 'catg', 'sub_catg':
    total_df.loc[:, col] = total_df[col].astype('category')

In [371]:
total_df.head()

Unnamed: 0,题目,题型,难度,答案,解析,combined_text,catg,sub_catg,split_combined
0,据《左传》记载，春秋后期鲁国大夫季孙氏的家臣阳虎独掌权柄后，标榜要替鲁国国君整肃跋扈的大夫，...,单选题,一般,D,阳虎的身份是鲁国大夫、季孙氏的家臣，按周礼的规定，他效忠于季孙氏，而他标榜为鲁国国君整肃大夫...,据《左传》记载，春秋后期鲁国大夫季孙氏的家臣阳虎独掌权柄后，标榜要替鲁国国君整肃跋扈的大夫，...,高中_历史,古代史,据 左传 记载 春秋 后期 鲁国 大夫 季孙氏 的 家臣 阳虎 独掌 权柄 后 标榜 要 替...
1,秦始皇统一六国后创制了一套御玺。如任命国家官员，则封印“皇帝之玺”；若任命四夷的官员，则用“...,单选题,一般,D,本题要求选择否定项，据材料提到，秦始皇统一六国后创制了一套御玺，如任命国家官员，则封印“皇帝...,秦始皇统一六国后创制了一套御玺。如任命国家官员，则封印“皇帝之玺”；若任命四夷的官员，则用“...,高中_历史,古代史,秦始皇 统一 六国后 创制 了 一套 御玺 如 任命 国家 官员 则 封印 皇帝 之玺 ； ...
2,北宋加强中央集权的主要措施有①把主要将领的兵权收归中央②派文官担任地方长官③设置通判监督地方...,单选题,一般,B,本题考查北宋加强中央集权的主要措施，结合所学知识可知，北宋把主要将领的兵权收归中央，派文官担...,北宋加强中央集权的主要措施有①把主要将领的兵权收归中央②派文官担任地方长官③设置通判监督地方...,高中_历史,古代史,北宋 加强 中央集权 的 主要 措施 有 ① 把 主要 将领 的 兵权 收归 中央 ② 派 ...
3,商朝人崇信各种鬼神，把占卜、祭祀作为与神灵沟通的手段，负责通神事务的是商王和巫师（往往出身贵...,单选题,一般,B,据材料“商代，王是人，也是神，既是王朝的元首，又是群巫之长”并结合所学知识可知，这反映了商代...,商朝人崇信各种鬼神，把占卜、祭祀作为与神灵沟通的手段，负责通神事务的是商王和巫师（往往出身贵...,高中_历史,古代史,商朝人 崇信 各种 鬼神 把 占卜 祭祀 作为 与 神灵 沟通 的 手段 负责 通神 事务 ...
4,公元963年，北宋政府在江淮地区设置了包括盐业管理，以及控制对茶叶销售的专卖等为主要职责的转...,单选题,一般,A,A从材料中可以看出，转运使逐渐变为地方行政长官，表明中央对地方的控制日益加强，中央集权不断加...,公元963年，北宋政府在江淮地区设置了包括盐业管理，以及控制对茶叶销售的专卖等为主要职责的转...,高中_历史,古代史,公元 年 北宋 政府 在 江淮地区 设置 了 包括 盐业 管理 以及 控制 对 茶叶 销售 ...


## Multinomial Naive Bayes

Assumption:

\begin{equation}
P(\text{class } c = i \mid \text{document } d) = \frac{P(d \mid c_i)P(c_i)}{P(d)} \propto P(d \mid c_i)P(c_i)
\end{equation}

, where

\begin{align}
P(d | c_i) & = P(n_{w_1}, \dots, n_{w_M} \mid c_i) \sim Multinomial(p_{i, 1}, \dots, p_{i, m}) \\
& \overset{\text{Multinomial Naive Bayes}}{=} n!\prod_{i=1}^M \frac{P(n_{w_i} \mid c_i)^{n_{w_i}}}{n_{w_i}!} \\
\end{align}

- $n$ is the number of words in document $d$
- $n_{w_i}$ is the number of times word $i$ appears in document $d$ 
- the MLE for $P(n_{w_i} \mid c_i)$ is $\dfrac{\sum_{d \in c_i}n_{w_i; d}}{\sum_{d \in c_i}n_d}$

Hence

\begin{equation}
\log(P(c_i \mid d)) = log(P(c_i)) + \sum_{i=1}^M n_{w_i}log(P(n_{w_i} \mid c_i)) + \sum_{j=1}^{n} j - \sum_{i=1}^M\sum_{j=1}^{n_{w_i}} j
\end{equation}

In [372]:
cv = CountVectorizer(min_df=100)

In [373]:
features = cv.fit_transform(total_df['split_combined']).toarray()

## First Level Categories

In [384]:
x_train, x_test, y_train, y_test = train_test_split(features, total_df['catg'], stratify=total_df['catg'])

In [385]:
x_train.shape

(22359, 2525)

In [405]:
x_test.shape

(7454, 2525)

In [386]:
# Fitting
# Compute probability with Laplace smoothing

laplace_coef = 0.1

classes = y_train.unique()
cond_probs = {}
class_probs = {}

for c in classes:
    c_counts = x_train[y_train == c, :] + laplace_coef
    cond_probs.update({c: c_counts.sum(axis = 0) / c_counts.sum()})
    class_probs.update({c: (y_train == c).sum() / y_train.shape[0]})

In [387]:
# predicting

def sum_seq(n):
    return (1 + n) * n / 2

def predict(x, class_probs, cond_probs, ):
    classes = list(cond_probs.keys())
    logprobs = []
    for c in classes:
        logprob = (
            np.log(class_probs[c]) + 
            np.einsum('ij, j -> i', x, np.log(cond_probs[c])) + 
            sum_seq(x.sum(axis=1)) - 
            sum_seq(x).sum(axis=1)
        )
        logprobs.append(logprob)
    return np.array(logprobs).T

def predict_classes(x, class_probs, cond_probs, classes, ):
    pred = predict(x, class_probs, cond_probs, )
    return classes[pred.argmax(axis=1)]

def get_class_metrics(y_true, y_pred, classes):
    res = {}
    for c in classes:
        class_y = y_true == c
        class_pred = y_pred == c
        recall = recall_score(class_y, class_pred)
        precision = precision_score(class_y, class_pred)
        accuracy = accuracy_score(class_y, class_pred)
        res.update({
            c:{
                'accuracy': accuracy,
                'precision': precision,
                'recall': recall,
            }
        })
    return res

In [389]:
pred_train = predict_classes(x_train, class_probs, cond_probs, classes, )
pred_test = predict_classes(x_test, class_probs, cond_probs, classes, )

train_df = pd.DataFrame(get_class_metrics(y_train, pred_train, classes, ))
test_df = pd.DataFrame(get_class_metrics(y_test, pred_test, classes, ))

train_df.loc[:, 'sample'] = 'train'
train_df = train_df.reset_index().rename({'index': 'metric'}, axis=1).set_index(['sample', 'metric'])
test_df.loc[:, 'sample'] = 'test'
test_df = test_df.reset_index().rename({'index': 'metric'}, axis=1).set_index(['sample', 'metric'])

In [393]:
pd.concat([train_df.T, test_df.T], axis=1)

sample,train,train,train,test,test,test
metric,accuracy,precision,recall,accuracy,precision,recall
高中_生物,0.990742,0.982251,0.998691,0.990341,0.981262,0.998878
高中_地理,0.985017,0.990301,0.947124,0.986987,0.990196,0.955481
高中_历史,0.984525,0.932685,0.97773,0.984975,0.938033,0.974256
高中_政治,0.977817,0.923236,0.878288,0.976523,0.92125,0.86808


## Second Level Categories

In [410]:
x_train, x_test, y_train, y_test = train_test_split(features, total_df['sub_catg'], stratify=total_df['sub_catg'])

In [411]:
# Fitting
# Compute probability with Laplace smoothing

laplace_coef = 0.1

classes = y_train.unique()
cond_probs = {}
class_probs = {}

for c in classes:
    c_counts = x_train[y_train == c, :] + laplace_coef
    cond_probs.update({c: c_counts.sum(axis = 0) / c_counts.sum()})
    class_probs.update({c: (y_train == c).sum() / y_train.shape[0]})

In [412]:
pred_train = predict_classes(x_train, class_probs, cond_probs, classes, )
pred_test = predict_classes(x_test, class_probs, cond_probs, classes, )

train_df = pd.DataFrame(get_class_metrics(y_train, pred_train, classes, ))
test_df = pd.DataFrame(get_class_metrics(y_test, pred_test, classes, ))

train_df.loc[:, 'sample'] = 'train'
train_df = train_df.reset_index().rename({'index': 'metric'}, axis=1).set_index(['sample', 'metric'])
test_df.loc[:, 'sample'] = 'test'
test_df = test_df.reset_index().rename({'index': 'metric'}, axis=1).set_index(['sample', 'metric'])

In [413]:
pd.concat([train_df.T, test_df.T], axis=1)

sample,train,train,train,test,test,test
metric,accuracy,precision,recall,accuracy,precision,recall
分子与细胞,0.971689,0.847054,0.87472,0.969815,0.840314,0.861745
公民道德与伦理常识,0.982468,0.87785,0.816667,0.982828,0.88806,0.811364
生物科学与社会,0.942171,0.841709,0.687179,0.940837,0.830446,0.688205
古代史,0.987253,0.809587,0.810667,0.987389,0.842105,0.768
宇宙中的地球,0.989042,0.973194,0.937926,0.988463,0.967814,0.938644
人口与城市,0.988103,0.906334,0.863212,0.986719,0.882812,0.862595
稳态与环境,0.978353,0.92385,0.89279,0.976389,0.909714,0.891377
生物技术实践,0.951876,0.698262,0.333082,0.950631,0.669683,0.334086
现代生物技术专题,0.949595,0.390343,0.894667,0.949826,0.391986,0.9
遗传与进化,0.960374,0.455312,0.692308,0.960021,0.448925,0.642308
