# TextCNN尝试

In [None]:
import pandas as pd
import numpy as np
import configparser
import pymysql
import matplotlib.pyplot as plt
import jieba
import jieba.analyse
import gensim
import pickle
import fasttext
from sklearn.model_selection import train_test_split
from collections import Counter
%matplotlib inline
import fool

In [None]:
# Read the MySQL connection settings from the [ASSESSMENT] section of the
# local config file and open the connection used by the queries below.
config = configparser.ConfigParser()
config.read("../Data/Input/database_config/database.conf")
host, user, password, database, port, charset = (
    config['ASSESSMENT'][option]
    for option in ("host", "user", "password", "database", "port", "charset")
)
# The config value for the port is a string, so it is converted for pymysql.
db = pymysql.connect(
    host=host,
    user=user,
    password=password,
    db=database,
    port=int(port),
    charset=charset,
)

## 一、数据预处理

### 从数据库读取标签和文本信息

In [None]:
# Load the complete company-introduction and company-tag tables.
sql_intro = "select * from company_intro_info_latest0601"
sql_tag = "select * from company_tag_info_latest0601"
data_intro, data_tag = (
    pd.read_sql(query, con=db) for query in (sql_intro, sql_tag)
)

In [None]:
# Remove non-concept tags and technology tags: keep only rows whose
# remarks is not "1" and whose classify_id is not 4.
data_ctag_only = data_tag.loc[
    (data_tag.remarks != "1") & (data_tag.classify_id != 4)
].copy()

In [None]:
# Preview the first row of the filtered tag table.
data_ctag_only.head(1)

In [None]:
# Preview the first row of the introduction table.
data_intro.head(1)

In [None]:
# Number of rows per label, most frequent label first.
ctag_count = (
    data_ctag_only
    .groupby("label_name")
    .agg({"comp_full_name": "count"})
    .sort_values(by="comp_full_name", ascending=False)
    .reset_index()
)

In [None]:
# Show the labels that occur on at least 1000 rows.
ctag_count[ctag_count.comp_full_name >= 1000]

### 分离出每个标签的标签链条，过滤进行中的产业链

In [None]:
def src_tags_splitter(src_tags):
    """Return the set of distinct tag names contained in a tag-chain string.

    ``src_tags`` holds one or more chains separated by ``#``; inside each
    chain the individual tags are separated by ``-``.
    """
    return {tag for chain in src_tags.split("#") for tag in chain.split("-")}

In [None]:
# For every row, select the chain this label belongs to: label_type_num is
# used as a 1-based index into the "#"-separated chains of src_tags.
# The row values are read by column label; integer keys on a string-labelled
# Series rely on a positional fallback that pandas has deprecated.
data_ctag_only["one_src_tags"] = data_ctag_only[["label_type_num", "src_tags"]].apply(
    lambda row: row["src_tags"].split("#")[row["label_type_num"] - 1],
    axis=1,
)

In [None]:
# Attach, per row, the set of all tag names appearing anywhere in src_tags.
data_ctag_only["tags_list"] = data_ctag_only.src_tags.apply(src_tags_splitter)

In [None]:
# Chain root names to exclude; rows whose chain starts with one of these
# are dropped (per the section heading these are chains still in progress).
filter_list = [
    "泛娱乐",
    "生物科学",
    "科技金融",
    "智能网联汽车",
    "科技物流",
    "文化娱乐行业",
    "数字媒体行业",
    "广告营销行业",
    "游戏行业",
    "教育培训行业",
    "电子商务行业",
    "汽车交通",
    "房产家装",
    "医疗健康行业",
    "新零售",
]

In [None]:
# Keep only rows whose chain root (first "-"-separated element of
# one_src_tags) is not in filter_list, restricted to the columns used later.
data_ctag_filtered = data_ctag_only.loc[
    ~data_ctag_only.one_src_tags.apply(lambda chain: chain.split("-")[0]).isin(filter_list),
    ["comp_id", "comp_full_name", "label_name", "src_tags", "one_src_tags", "tags_list"],
].copy()

In [None]:
# Preview the first rows of the filtered tag table.
data_ctag_filtered.head()

In [None]:
# Number of rows per label after chain filtering, most frequent first.
ctag_filtered_count = (
    data_ctag_filtered
    .groupby("label_name")
    .agg({"comp_full_name": "count"})
    .sort_values(by="comp_full_name", ascending=False)
    .reset_index()
)

### 根据公司名字过滤公司简介，并作合并、分词

In [None]:
# Keep introductions of companies that still have a tag after filtering,
# excluding classify_id 4, and only the id, name and text columns.
data_intro_filtered = data_intro.loc[
    data_intro.comp_id.isin(data_ctag_filtered.comp_id) & (data_intro.classify_id != 4),
    ["comp_id", "comp_full_name", "intro"],
].copy()

In [None]:
# Preview the first row of the filtered introduction table.
data_intro_filtered.head(1)

In [None]:
# Collapse to one row per company: keep the largest comp_full_name and join
# all of the company's intro texts with the Chinese full stop "。".
# The aggregation is given as the string "max" rather than the builtin `max`;
# passing the builtin to a groupby aggregation is deprecated in pandas, and
# the string selects the same groupby max that the builtin is mapped to.
data_intro_filtered_merged = data_intro_filtered.groupby("comp_id").agg(
    {"comp_full_name": "max", "intro": lambda texts: "。".join(texts)}
)

In [None]:
# Compare the number of merged rows with the number of distinct comp_id
# values in the filtered introduction table and in the filtered tag table.
len(data_intro_filtered_merged), len(set(data_intro_filtered.comp_id)), len(set(data_ctag_filtered.comp_id))

In [None]:
# Tokenise each merged introduction with jieba after trimming surrounding
# whitespace; the token list is stored in the "words" column.
data_intro_filtered_merged["words"] = data_intro_filtered_merged.intro.apply(
    lambda text: jieba.lcut(text.strip())
)

In [None]:
# Preview the first two merged rows, including the new token column.
data_intro_filtered_merged.head(2)

### 去除停用词（标点）以及单字

In [None]:
# Load the stop-word list (one entry per line). The file is opened in a
# `with` block so the handle is closed as soon as the content has been read.
with open("../Data/Input/text_similarity/stopwords.txt", "r") as stopwords_file:
    stopwords = stopwords_file.read().split("\n")
# The first entry is replaced by the full-width comma.
# NOTE(review): presumably the first line is mangled by a byte-order mark —
# confirm against the file.
stopwords[0] = "，"
# A set gives constant-time membership tests when filtering tokens.
stopwords = set(stopwords)

In [None]:
# Drop single-character tokens and stop words from every token list.
data_intro_filtered_merged.words = data_intro_filtered_merged.words.apply(
    lambda tokens: [tok for tok in tokens if len(tok) > 1 and tok not in stopwords]
)

In [None]:
# Preview the first merged row after token filtering.
data_intro_filtered_merged.head(1)

In [None]:
# Load word vectors stored in word2vec format from disk.
# NOTE(review): the file name suggests pretrained Baidu-Baike SGNS vectors —
# confirm. `model` is not used again in the visible part of the notebook.
model = gensim.models.KeyedVectors.load_word2vec_format("../Data/sgns.baidubaike.bigram-char")

In [None]:
# Cache the two prepared tables on disk. Each file is written inside a
# `with` block so it is flushed and closed once the dump has finished.
with open("../Data/Input/Text_CNN/data_ctag_filtered.pkl", "wb") as ctag_pkl:
    pickle.dump(data_ctag_filtered, ctag_pkl)
with open("../Data/Input/Text_CNN/data_intro_filtered_merged.pkl", "wb") as intro_pkl:
    pickle.dump(data_intro_filtered_merged, intro_pkl)

In [None]:
# Reload the cached tables. The files are opened in `with` blocks so the
# handles are closed after loading.
# NOTE: pickle must only be used on trusted files; these paths are the ones
# this notebook writes itself.
with open("../Data/Input/Text_CNN/data_ctag_filtered.pkl", "rb") as ctag_pkl:
    data_ctag_filtered = pickle.load(ctag_pkl)
with open("../Data/Input/Text_CNN/data_intro_filtered_merged.pkl", "rb") as intro_pkl:
    data_intro_filtered_merged = pickle.load(intro_pkl)

In [None]:
# words_raw: one token list per company.
words_raw = data_intro_filtered_merged.words.tolist()
# words_list: every token of every company in a single flat list.
words_list = [token for document in words_raw for token in document]
# words_count: corpus-wide frequency of each token.
words_count = Counter(words_list)

## 二、词向量和分类模型测试

### fasttext词向量

In [None]:
# Train gensim FastText embeddings on the tokenised introductions with
# min_count=5 and size=300.
# NOTE(review): newer gensim releases name the dimension argument
# `vector_size` instead of `size` — confirm against the installed version.
fast_text_model = gensim.models.FastText(words_raw, min_count=5, size=300)

In [None]:
# Sanity check of the embeddings: list the 100 entries most similar to the
# combined query of the two words "科技" and "金融".
fast_text_model.wv.most_similar(["科技", "金融"], topn=100)

### fasttext分类测试（只取底级标签）

In [None]:
# Labels that occur on at least 500 rows; only these are used as classes.
label_list = ctag_filtered_count.loc[
    ctag_filtered_count.comp_full_name >= 500, "label_name"
].tolist()

In [None]:
# Keep rows whose label is one of the frequent labels AND is the last
# ("-"-separated) element of its own chain, i.e. a bottom-level label.
# Row values are read by column label; integer keys on a string-labelled
# Series rely on a positional fallback that pandas has deprecated.
label_data_to_use = data_ctag_filtered[
    data_ctag_filtered.label_name.isin(label_list)
    & data_ctag_filtered[["label_name", "one_src_tags"]].apply(
        lambda row: row["label_name"] == row["one_src_tags"].split("-")[-1],
        axis=1,
    )
]

In [None]:
# Attach each labelled company's token list via a left join on comp_id and
# keep only the id, label and tokens.
all_data = pd.merge(
    label_data_to_use,
    data_intro_filtered_merged.reset_index(),
    how="left",
    left_on="comp_id",
    right_on="comp_id",
)[["comp_id", "label_name", "words"]]

In [None]:
# Prepend the "__label__" marker to every label name.
all_data.label_name = all_data.label_name.map(lambda name: "__label__" + name)

In [None]:
# Build one training line per row: the prefixed label followed by the
# space-separated tokens.
# Row values are read by column label; integer keys on a string-labelled
# Series rely on a positional fallback that pandas has deprecated.
all_data["data_to_file"] = all_data[["label_name", "words"]].apply(
    lambda row: " ".join([row["label_name"], " ".join(row["words"])]),
    axis=1,
)

In [None]:
# Preview the first rows of the assembled dataset.
all_data.head()

In [None]:
# Split the rows into 80% training and 20% test data. No random_state is
# passed, so the split is not pinned to a fixed seed.
train_data, test_data = train_test_split(all_data, train_size=0.8)

In [None]:
# Write the training lines, one sample per line. The `with` block closes the
# file even if the write fails, and the encoding is fixed to UTF-8 because
# fastText reads its input as UTF-8 regardless of the platform default.
with open("../Data/Input/Text_CNN/fasttext_train", "w", encoding="utf-8") as train_file:
    train_file.write("\n".join(train_data.data_to_file.tolist()))

In [None]:
# Write the test lines, one sample per line. The `with` block closes the
# file even if the write fails, and the encoding is fixed to UTF-8 because
# fastText reads its input as UTF-8 regardless of the platform default.
with open("../Data/Input/Text_CNN/fasttext_test", "w", encoding="utf-8") as test_file:
    test_file.write("\n".join(test_data.data_to_file.tolist()))

In [None]:
# Train a supervised fastText classifier on the training file with dim=300
# and loss="hs"; "fasttext.model" is passed as the second argument.
# NOTE(review): `fasttext.supervised` looks like the legacy Python binding;
# the second argument is presumably the output model name — confirm.
classifier = fasttext.supervised("../Data/Input/Text_CNN/fasttext_train", "fasttext.model", dim=300, loss="hs")

In [None]:
# Evaluate the classifier on the held-out test file; the outcome is kept in
# `result` and not displayed by this cell.
result = classifier.test("../Data/Input/Text_CNN/fasttext_test")

In [None]:
# index_num selects which test row is inspected.
index_num = 0
# Rebuild the sample's text without its label: the first space-separated
# field of data_to_file is the label, the rest are the tokens. The row is
# chosen through index_num so that changing it actually changes the sample.
test_text = " ".join(test_data.iloc[index_num].data_to_file.split(" ")[1:])

In [None]:
# Work on an independent copy holding only the label and token columns.
test_df = all_data.loc[:, ["label_name", "words"]].copy()

In [None]:
# Predict for every row, asking the classifier for k=3 results per text.
test_df["top_result"] = classifier.predict(
    test_df.words.apply(lambda tokens: " ".join(tokens)).tolist(), k=3
)
# Recover the plain label by removing the "__label__" prefix only.
# str.strip("__label__") would instead strip any of the characters
# "_", "l", "a", "b", "e" from BOTH ends and corrupt labels that start or
# end with one of them.
test_df["real_label"] = test_df.label_name.apply(
    lambda name: name[len("__label__"):] if name.startswith("__label__") else name
)
# A row counts as correct when the true label is among its predictions.
# Row values are read by column label rather than by integer position.
test_df["pred_result"] = test_df[["real_label", "top_result"]].apply(
    lambda row: row["real_label"] in row["top_result"], axis=1
)

In [None]:
# Fraction of rows whose true label is among the returned predictions.
sum(test_df.pred_result)/len(test_df)

In [None]:
# Display the per-row predictions for manual inspection.
test_df

#### 思路记录
可以选取大语料（优质、不优质都包含）训练词向量，然后任意两组词可以计算距离，那么等同于可以计算任意两个公司的距离

## 三、取全量语料测试