In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

In [3]:
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer #将原始文档的集合转换为TF-IDF特性的矩阵
from sklearn.feature_selection import chi2 
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB #(多项式)朴素贝叶斯
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC #线性支持向量机
from sklearn.model_selection import cross_val_score
import seaborn as sns
from sklearn.metrics import confusion_matrix #混淆矩阵,用于展示两个向量之间的差异
from IPython.display import display
from sklearn import metrics
%matplotlib inline

## 问题描述
该问题是监督式文本分类问题，我们的目标是调查哪种监督式机器学习方法最适合解决它。
当出现新投诉时，我们希望将其分配到 12 个类别中的一个。分类器假设每个新投诉都被分配到一个且仅一个的类别之中。这是多类别文本分类问题。

## 数据探索

In [4]:
# Load the raw consumer-complaint records from the Kaggle input directory
# and preview the first rows.
df = pd.read_csv('../input/consumer_complaints.csv')
df.head()

In [5]:
# Keep only the complaints that actually carry a free-text narrative.
has_narrative = df['consumer_complaint_narrative'].notna()
df_new = df.loc[has_narrative]

In [6]:
df.shape

In [7]:
df_new.shape

In [8]:
df_new.info()

In [9]:
df_new.isnull().sum()

In [10]:
# Narrow the frame to the label column and the text column used for modelling.
kept_columns = ['product', 'consumer_complaint_narrative']
df_new = df_new[kept_columns]

In [11]:
df_new.info()

In [12]:
# Encode each product name as an integer class id.
# `assign` builds a new frame instead of writing a column into a filtered
# slice of `df`, which is what triggers pandas' SettingWithCopyWarning.
# `fit_transform` on the column learns the same sorted set of classes as
# fitting on its unique values and then transforming.
le = LabelEncoder()
df_new = df_new.assign(category_id=le.fit_transform(df_new['product']))

In [13]:
df_new.head()

In [14]:
df_new['category_id'].unique()

In [15]:
# Renumber the rows 0..n-1. With the default drop=False the previous index
# is kept as a new 'index' column, so re-running this cell adds another column.
df_new = df_new.reset_index()

In [16]:
# One row per (product, category_id) pair, ordered by the numeric id.
category_id_df = (
    df_new[['product', 'category_id']]
    .drop_duplicates()
    .sort_values('category_id')
)

In [17]:
category_id_df

In [18]:
# Two-way lookup tables between product names and their integer ids.
category_to_id = dict(zip(category_id_df['product'], category_id_df['category_id']))
id_to_category = {cat_id: name for name, cat_id in category_to_id.items()}

In [19]:
category_to_id

In [20]:
id_to_category

In [21]:
# Bar chart of how many narratives each product category has.
fig = plt.figure(figsize=(8, 6))
complaints_per_product = df_new.groupby('product')['consumer_complaint_narrative'].count()
complaints_per_product.plot.bar(ylim=0)

上图显示：各类产品的投诉数量并不平衡，消费者的投诉更集中于收取欠款、信用报告和抵押方面。

当我们遇到这样的问题时，我们使用标准算法解决这些问题必然会遇到困难。**常规算法往往偏向于多数类别，而不考虑数据分布**。在最糟糕的情况下，少数类别被视为异常值并被忽略。对于某些情况，如欺诈检测或癌症预测，我们则需要**仔细配置我们的模型或人为地平衡数据集**，比如**欠采样或过采样每个类别**。


但是，在学习不平衡数据的情况下，我们最感兴趣的是多数类。我们想有一个分类器，能够**对多数类提供较高的预测精度，同时对少数类保持合理的准确度**。因此我们会保持原样。

In [22]:
# TF-IDF vectoriser over unigrams and bigrams: sublinear (log-scaled) term
# frequency, terms must occur in at least 5 documents, English stop words
# removed, rows L2-normalised.
tfidf = TfidfVectorizer(
    sublinear_tf=True,
    min_df=5,
    norm='l2',
    encoding='latin-1',
    ngram_range=(1, 2),
    stop_words='english',
)

In [23]:
# Keep the TF-IDF matrix in scipy sparse form. Calling `.toarray()` would
# allocate a dense (n_documents x n_terms) float array, which is enormous for
# a bigram vocabulary. chi2, cross_val_score, train_test_split and the
# classifiers used later in this notebook all accept sparse input directly.
features = tfidf.fit_transform(df_new.consumer_complaint_narrative)

In [24]:
features.shape

使用159200个特征来描述消费者的投诉信息
## 接下来，使用 sklearn.feature_selection.chi2 来查找与每个产品最相关的项

In [25]:
# Integer class ids, aligned row-for-row with `features`.
labels = df_new['category_id']

In [None]:
# For every product, rank the TF-IDF terms by their chi-squared statistic
# against a one-vs-rest label and print the N strongest unigrams and bigrams.
N = 2
# The vocabulary is the same for every product, so fetch it once.
# `get_feature_names` was removed in scikit-learn 1.2; prefer the newer
# `get_feature_names_out` and fall back to the old name on older installs.
_get_feature_names = getattr(tfidf, 'get_feature_names_out', None) or tfidf.get_feature_names
all_feature_names = np.array(_get_feature_names())
for product, category_id in sorted(category_to_id.items()):
    features_chi2 = chi2(features, labels == category_id)
    # argsort is ascending, so the most strongly associated terms come last.
    indices = np.argsort(features_chi2[0])
    feature_names = all_feature_names[indices]
    unigrams = [v for v in feature_names if len(v.split(' ')) == 1]
    bigrams = [v for v in feature_names if len(v.split(' ')) == 2]
    print("# '{}':".format(product))
    print(".Most correlated unigrams:\n. {}".format('\n. '.join(unigrams[-N:])))
    print(".Most correlated bigrams:\n. {}".format('\n. '.join(bigrams[-N:])))

## 多类别分类器：特征和设计
- 为了训练监督式分类器，我们首先将「消费者投诉叙述」转化为数字向量。我们研究了向量表示，例如 TF-IDF 加权向量
- 有了文本的向量表示后，我们就可以训练监督式分类器，对未见过的「消费者投诉叙述」预测其所属的「产品」

## 朴素贝叶斯分类器：最适合词频统计的是多项式变体

In [None]:
# Train a multinomial Naive Bayes classifier on raw narratives:
# text -> token counts -> TF-IDF weights -> classifier.
X_train, X_test, y_train, y_test = train_test_split(
    df_new['consumer_complaint_narrative'], df_new['product'], random_state=0)
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)
# The vectoriser is fitted exactly once above; the TF-IDF transformer is
# created and fitted here on the resulting count matrix.
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
clf = MultinomialNB().fit(X_train_tfidf, y_train)

### 拟合好训练集后，让我们做一些预测

In [None]:
# The classifier was trained on TF-IDF weighted features, so a new document
# must go through the same two steps (counts, then TF-IDF) before prediction.
example_counts = count_vect.transform(["This company refuses to provide me verification and validation of debt per my right under the FDCPA. I do not believe this debt is mine."])
print(clf.predict(tfidf_transformer.transform(example_counts)))

In [None]:
# Check the prediction against the labelled row with the same narrative.
# The column name is all lower-case, as everywhere else in this notebook.
df_new[df_new['consumer_complaint_narrative'] == "This company refuses to provide me verification and validation of debt per my right under the FDCPA. I do not believe this debt is mine."]

## 模型选择
> 尝试不同的机器学习模型，评估它们的准确性并找出潜在问题的根源
使用以下4种模型来训练：
- logistic regression
-  (多项式)朴素贝叶斯
- 线性支持向量机
- 随机森林

In [None]:
# Compare four classifiers with 5-fold cross-validated accuracy and plot the
# per-fold scores for each model.
models = [
    RandomForestClassifier(n_estimators=200, max_depth=3, random_state=0),
    LinearSVC(),
    MultinomialNB(),
    LogisticRegression(random_state=0),
]
CV = 5
entries = []
for model in models:
    model_name = model.__class__.__name__
    accuracies = cross_val_score(model, features, labels, scoring='accuracy', cv=CV)
    for fold_idx, accuracy in enumerate(accuracies):
        entries.append((model_name, fold_idx, accuracy))
# Build the results frame once from the collected (model, fold, score) tuples.
cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx','accuracy'])
sns.boxplot(x='model_name', y='accuracy', data=cv_df)
sns.stripplot(x='model_name', y='accuracy', data=cv_df, size=8, jitter=True, edgecolor='gray',linewidth=2)
plt.show()

In [None]:
# Mean cross-validated accuracy of each of the four models.
cv_df.groupby('model_name')['accuracy'].mean()

## 模型评估
使用前面显示的最佳模型,线性支持向量机。查看混淆矩阵，并展示预测标签和实际标签之间的差异

In [None]:
# Fit the best model (LinearSVC) on a 67/33 split and draw the confusion
# matrix of actual versus predicted categories on the held-out part.
model = LinearSVC()
X_train, X_test, y_train, y_test, indices_train, indices_test = train_test_split(features, labels, df_new.index, test_size=0.33, random_state=0)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
# Pin the row/column order to the category ids so that conf_mat[i, j] always
# refers to ids i and j, even if a rare class is missing from the test split.
conf_mat = confusion_matrix(y_test, y_pred, labels=category_id_df['category_id'].values)
fig, ax = plt.subplots(figsize=(10,10))
# Use ['product'] rather than .product: attribute access resolves to the
# DataFrame.product method, not the column.
product_names = category_id_df['product'].values
sns.heatmap(conf_mat, annot=True, fmt='d',
            xticklabels=product_names, yticklabels=product_names, ax=ax)
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.show()

如图所示，大多数的预测值都在对角线上(预测值=实际值)，但是还是存在不少被错误分类的样本。

In [None]:
# For every off-diagonal confusion-matrix cell with at least 10 samples,
# print the (actual, predicted) pair and display the affected test rows.
class_ids = category_id_df['category_id']
for predicted in class_ids:
    for actual in class_ids:
        if predicted == actual:
            continue
        n_confused = conf_mat[actual, predicted]
        if n_confused < 10:
            continue
        print("'{}' predicted as '{}':{} examples.".format(id_to_category[actual],id_to_category[predicted],n_confused))
        confused_mask = (y_test==actual)&(y_pred==predicted)
        confused_rows = df_new.loc[indices_test[confused_mask]]
        display(confused_rows[['product','consumer_complaint_narrative']])
        print('')

从上面的输出可以看出，一些被错误分类的投诉涉及多个主题。
下面，我们在全部数据上重新拟合线性支持向量机，并根据各类别的模型系数找出与每个类别最相关的词项:

In [None]:
# Refit the model on all data and list, for each product, the N unigrams and
# bigrams with the largest coefficients for that class.
model.fit(features, labels)
N = 2
# The vocabulary is the same for every product, so fetch it once.
# `get_feature_names` was removed in scikit-learn 1.2; prefer the newer
# `get_feature_names_out` and fall back to the old name on older installs.
_get_feature_names = getattr(tfidf, 'get_feature_names_out', None) or tfidf.get_feature_names
all_feature_names = np.array(_get_feature_names())
for product, category_id in sorted(category_to_id.items()):
    # argsort is ascending; the list is reversed below to read the largest first.
    indices = np.argsort(model.coef_[category_id])
    feature_names = all_feature_names[indices]
    unigrams = [v for v in reversed(feature_names) if len(v.split(' '))==1][:N]
    bigrams = [v for v in reversed(feature_names) if len(v.split(' '))==2][:N]
    print("# '{}':".format(product))
    print(" . Top unigrams:\n . {}".format('\n . '.join(unigrams)))
    print(" . Top bigrams:\n . {}".format('\n . '.join(bigrams)))

In [None]:
# Finally, print the per-category classification report.
# y_test / y_pred hold integer category ids, so the display names must be
# listed in category-id order. `df_new['product'].unique()` is in order of
# first appearance and would attach the wrong name to each row of the report.
print(metrics.classification_report(
    y_test,
    y_pred,
    labels=category_id_df['category_id'].values,
    target_names=category_id_df['product'].values,
))