## 2.1 词袋模型

### API使用

In [30]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

In [31]:
# Load the tab-separated spam dataset from ../data/spam.tsv and preview its first rows.
df = pd.read_csv('../data/spam.tsv', sep = '\t')
df.head()

Unnamed: 0,label,message,length,punct
0,ham,"Go until jurong point, crazy.. Available only ...",111,9
1,ham,Ok lar... Joking wif u oni...,29,6
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155,6
3,ham,U dun say so early hor... U c already then say...,49,6
4,ham,"Nah I don't think he goes to usf, he lives aro...",61,2


In [17]:
# Shape of the raw dataset: 5572 samples in total
df.shape

(5572, 4)

In [7]:
# Build a bag-of-words vectorizer with default settings, learn the vocabulary
# from the message column and encode every message in one step.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df['message'])

In [19]:
# After the bag-of-words transform, each message is expanded into a vector of length 8713
X.shape

(5572, 8713)

In [29]:
# X is returned directly in a sparse matrix format, which is efficient to store
X

<5572x8713 sparse matrix of type '<class 'numpy.int64'>'
	with 74169 stored elements in Compressed Sparse Row format>

In [51]:
# To inspect the actual values, convert the sparse matrix to a dense array with toarray().
# Note: this materializes every one of the 5572 x 8713 entries in memory.
X.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [57]:
# Inspect a slice of the feature columns (vocabulary terms) generated by the vectorizer.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in favour of
# get_feature_names_out(); use the new method when present and fall back otherwise.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
# Convert to plain str so the display is the same whichever method was used.
[str(name) for name in feature_names[1000:1010]]

['ahhhh',
 'ahmad',
 'ahold',
 'aid',
 'aids',
 'aig',
 'aight',
 'ain',
 'aint',
 'air']

###  探索CountVectorizer的参数

列举一些常用的参数：  
stop_words : 如果是英文语料，可以直接赋值'english'使用内置停用词, 如果是中文，可以赋值一个停用词列表，默认值为None  
ngram_range：格式为元组（min_n, max_n），默认值为（1,1）表示只考虑单个词，（1,2）则表示2个词的词组也会被加入到词向量中  
max_df, min_df：这两个参数类似，可以输入一个0到1之间的浮点数（表示文档比例）或一个整数（表示文档数量），文档频率高于max_df或低于min_df的词汇会被过滤掉  
binary：默认为False，如果设为True，则所有出现的词汇数量都变成1，而不是实际出现的次数

In [64]:
# Re-fit the bag-of-words model with non-default options:
#   ngram_range=(1, 2)   -> keep single words and two-word phrases
#   stop_words='english' -> drop the built-in English stop words
#   max_df=0.8           -> drop terms that occur in more than 80% of the messages
#   min_df=1             -> keep every term that occurs in at least one message
# min_df=1 (the default) filters exactly the same terms as the former integer 0,
# but recent scikit-learn releases reject an integer min_df below 1 during
# parameter validation, so 1 keeps the cell runnable across versions.
vectorizer = CountVectorizer(ngram_range=(1, 2), stop_words='english', max_df=0.8, min_df=1)
X = vectorizer.fit_transform(df['message'])

In [65]:
# Check how the feature count changed with the new vectorizer settings.
X.shape

(5572, 37364)

### 关于中文词向量

In [73]:
import jieba

In [70]:
# Two short Chinese sentences used to demonstrate word segmentation.
# NOTE: this rebinds `df`, replacing the spam dataset loaded earlier in the notebook.
df= pd.DataFrame({'text':['小明毕业于上海交通大学','李华在新华书店上班']})

In [71]:
# Display the demo frame before segmentation.
df

Unnamed: 0,text
0,小明毕业于上海交通大学
1,李华在新华书店上班


In [76]:
# Segment every sentence with jieba and join the resulting tokens with single spaces.
# cut_all=False and cut_all=True are stored side by side so the two modes can be compared.
df['text_split'] = df['text'].apply(
    lambda sentence: " ".join(jieba.cut(sentence, cut_all=False))
)
df['text_split_cutall'] = df['text'].apply(
    lambda sentence: " ".join(jieba.cut(sentence, cut_all=True))
)

In [77]:
# Display the frame with both segmentation columns added.
df

Unnamed: 0,text,text_split,text_split_cutall
0,小明毕业于上海交通大学,小明 毕业 于 上海交通大学,小 明 毕业 于 上海 上海交通大学 交通 大学
1,李华在新华书店上班,李华 在 新华书店 上班,李 华 在 新华 新华书店 书店 上班


## 2.2 tfidf

In [26]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix

In [3]:
# Reload the spam dataset from ../data/spam.tsv (this rebinds `df`) and preview it.
df = pd.read_csv('../data/spam.tsv', sep = '\t')
df.head()

Unnamed: 0,label,message,length,punct
0,ham,"Go until jurong point, crazy.. Available only ...",111,9
1,ham,Ok lar... Joking wif u oni...,29,6
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155,6
3,ham,U dun say so early hor... U c already then say...,49,6
4,ham,"Nah I don't think he goes to usf, he lives aro...",61,2


In [29]:
# Confirm the dimensions (rows, columns) of the reloaded dataset.
df.shape

(5572, 4)

In [16]:
# Hold out 20% of the messages for testing. Stratifying on the label column keeps
# the class proportions the same in both splits, and the fixed random_state makes
# the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df['message'],
    df['label'],
    test_size=0.2,
    random_state=0,
    stratify=df['label'],
)

In [19]:
# Fit the TF-IDF vocabulary and weights on the training messages only, then reuse
# the fitted vectorizer to encode the test messages with the same feature columns.
vectorizer= TfidfVectorizer()
tf_x_train = vectorizer.fit_transform(X_train)
tf_x_test = vectorizer.transform(X_test)

In [22]:
# Read the shape straight from the sparse matrix; converting to a dense array first
# would allocate the full matrix only to discard it.
tf_x_train.shape

(4457, 7708)

In [23]:
# Train a random forest on the TF-IDF features.
# random_state is fixed so the fitted model and the metrics computed from it are
# repeatable across runs; 0 matches the seed used for the train/test split.
clf = RandomForestClassifier(random_state=0)
clf.fit(tf_x_train, y_train)

RandomForestClassifier()

In [24]:
# Predict labels for the held-out test messages.
y_pred = clf.predict(tf_x_test)

In [27]:
# Confusion matrix on the test set: rows are true labels, columns are predicted labels.
confusion_matrix(y_test, y_pred)

array([[966,   0],
       [ 27, 122]], dtype=int64)

## 2.3 高级词向量

### 2.3.1 word2vec

In [32]:
import spacy

In [47]:
# Load the spaCy pipeline named 'en_core_web_lg' (assumes the model package is installed).
nlp = spacy.load('en_core_web_lg')

In [88]:
# Check the dimensionality of the vector spaCy produces for a whole sentence.
nlp('deep learning is very hard').vector.shape

(300,)

In [46]:
# Reload the spam dataset from ../data/spam.tsv (this rebinds `df`) and preview it.
df = pd.read_csv('../data/spam.tsv', sep = '\t')
df.head()

Unnamed: 0,label,message,length,punct
0,ham,"Go until jurong point, crazy.. Available only ...",111,9
1,ham,Ok lar... Joking wif u oni...,29,6
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155,6
3,ham,U dun say so early hor... U c already then say...,49,6
4,ham,"Nah I don't think he goes to usf, he lives aro...",61,2


In [85]:
# Run every message through the spaCy pipeline and store its document vector
# in a new 'vector' column (one pipeline call per row).
df['vector'] = df['message'].apply(lambda x: nlp(x).vector)

In [86]:
# Preview the frame with the new vector column.
df.head()

Unnamed: 0,label,message,length,punct,vector
0,ham,"Go until jurong point, crazy.. Available only ...",111,9,"[0.022044934, 0.09757433, 0.002553665, -0.1926..."
1,ham,Ok lar... Joking wif u oni...,29,6,"[-0.07367852, -0.19237824, -0.1709596, -0.4884..."
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155,6,"[0.012893126, -0.0072731962, -0.006278193, 0.0..."
3,ham,U dun say so early hor... U c already then say...,49,6,"[-0.11613209, 0.17318583, -0.20066053, -0.3331..."
4,ham,"Nah I don't think he goes to usf, he lives aro...",61,2,"[0.0507494, 0.26546127, -0.2296746, -0.1811316..."


### 2.3.2 GloVe

In [96]:
import numpy as np
# Maps each word to its embedding vector; filled in by the file-loading cell below.
glove_vectors = dict()

In [97]:
# Parse the GloVe text file into glove_vectors. Each line holds a word followed by
# its vector components, all separated by whitespace.
# The `with` block guarantees the file is closed even if parsing raises midway.
with open('../glove/glove.6B.100d.txt', encoding='utf-8') as file:
    for line in file:
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0].
            continue

        word = values[0]
        # Components are stored as an array of strings; they are converted to
        # floats when a vector is looked up.
        vectors = np.asarray(values[1:])
        glove_vectors[word] = vectors

In [98]:
# Sanity check: look up one word and confirm the dimensionality of its vector.
glove_vectors.get('king').shape

(100,)

In [99]:
vec_shape=100
def get_vec(x):
    """Return the averaged GloVe vector of a text.

    The text is split on whitespace and every token is looked up in
    ``glove_vectors`` exactly as written (no lowercasing or punctuation
    stripping). The vectors that are found are summed and the sum is divided
    by the total number of tokens, so tokens without a vector pull the
    average towards zero.

    Parameters
    ----------
    x : object
        The text to embed; it is converted with ``str(x)`` before splitting.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(vec_shape,)``. An all-zero vector is returned
        when the text contains no tokens, instead of dividing by zero.
    """
    text = str(x).split()
    arr = np.zeros(vec_shape)

    # Empty or whitespace-only text: nothing to average, so avoid the 0/0 -> NaN result.
    if not text:
        return arr

    for t in text:
        vec = glove_vectors.get(t)
        if vec is None:
            # Out-of-vocabulary token: contributes nothing to the sum.
            continue
        try:
            # Stored vectors may hold string components; convert before adding.
            arr = arr + np.asarray(vec, dtype=float)
        except (TypeError, ValueError):
            # Malformed entry (non-numeric component or wrong length): skip it,
            # keeping the best-effort behaviour without hiding unrelated errors.
            continue

    return arr / len(text)

In [100]:
# Embed every message with the averaged GloVe vector and store it in 'vector1'.
df['vector1'] = df['message'].apply(get_vec)

In [101]:
# Preview the frame with both embedding columns.
df.head()

Unnamed: 0,label,message,length,punct,vector,vector1
0,ham,"Go until jurong point, crazy.. Available only ...",111,9,"[0.022044934, 0.09757433, 0.002553665, -0.1926...","[0.015129099999999996, 0.08890985, 0.148991850..."
1,ham,Ok lar... Joking wif u oni...,29,6,"[-0.07367852, -0.19237824, -0.1709596, -0.4884...","[0.03301883333333333, 0.0234325, 0.14649500000..."
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155,6,"[0.012893126, -0.0072731962, -0.006278193, 0.0...","[-0.030968000000000006, 0.07641028571428572, 0..."
3,ham,U dun say so early hor... U c already then say...,49,6,"[-0.11613209, 0.17318583, -0.20066053, -0.3331...","[-0.08879572727272726, 0.23630183636363633, 0...."
4,ham,"Nah I don't think he goes to usf, he lives aro...",61,2,"[0.0507494, 0.26546127, -0.2296746, -0.1811316...","[-0.031486246153846154, 0.21729676923076927, 0..."
