# **数据分析**

In [None]:
import os
import ast
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.utils import shuffle

# Show at most 20 rows whenever a DataFrame is displayed.
pd.options.display.max_rows = 20
# Apply seaborn's "darkgrid" style to the plots drawn afterwards.
sns.set(style='darkgrid')

In [None]:
# Walk the training-data directory and print the first file name found in
# each directory visited, as a quick check that the dataset is present.
data_root = '/kaggle/input/quickdraw-doodle-recognition/train_simplified/'
for folder, _subdirs, names in os.walk(data_root):
    if names:
        print(os.path.join(folder, names[0]))
# Kaggle notes:
# - Up to 20GB can be written to the current directory (/kaggle/working/);
#   it is preserved as output when a version is created with "Save & Run All".
# - Temporary files can be written to /kaggle/temp/, but they are not saved
#   outside of the current session.

# 导入简化（simplified）处理后的数据

In [None]:
# List every entry in the simplified training-data directory and show how
# many there are.
train_simplified_dir = '/kaggle/input/quickdraw-doodle-recognition/train_simplified'
files_directory = os.listdir(train_simplified_dir)
len(files_directory)

只提取前50个csv文件的内容保存进train中，否则内存不够用

In [None]:
# Load only the first `number_categories` category files into `train`;
# reading every CSV would not fit in memory.
number_categories = 50
train_dir = '/kaggle/input/quickdraw-doodle-recognition/train_simplified'
# NOTE: os.listdir returns entries in arbitrary order, so which files end up
# in the first `number_categories` may differ between environments.
files_directory = os.listdir(train_dir)[:number_categories]

# Read each file once and concatenate in a single step. DataFrame.append was
# removed in pandas 2.0, and calling it in a loop re-copied the accumulated
# frame on every iteration.
# usecols picks four columns by position; key_id becomes the index.
frames = [
    pd.read_csv(os.path.join(train_dir, file), index_col='key_id', usecols=[1, 2, 3, 5])
    for file in files_directory
]
# pd.concat rejects an empty list, so fall back to an empty frame.
train = pd.concat(frames) if frames else pd.DataFrame()

随机打乱train，random_state为随机打乱的种子

In [None]:
# Shuffle the rows of `train`; random_state is the seed for the shuffle.
train = shuffle(train, random_state=123)
# Display the shuffled frame as the cell output.
train

In [None]:
# Print a short summary of the training frame: its dimensions, its column
# names, the configured number of categories and the distinct labels present.
n_rows, n_cols = train.shape
summary_lines = [
    ("Train number of rows: ", n_rows),
    ("Train number of columns: ", n_cols),
    ("Train set features: ", train.columns.values),
    ("Train number of label categories: ", number_categories),
    ("Train label categories: ", train['word'].unique()),
]
for label, value in summary_lines:
    print(label, value)

# 利用 seaborn 柱状图显示数据量前10名和后10名的类别

In [None]:
# Count the rows for each word and order the words from most to fewest rows.
rows_per_word = train.groupby(['word']).size()
count_gp = rows_per_word.reset_index(name='count').sort_values('count', ascending=False)
# First ten rows of the ordering (largest counts) and last ten (smallest).
top_10 = count_gp.head(10)
bottom_10 = count_gp.tail(10)

In [None]:
# Bar chart of the ten words with the most rows.
# No `ci` argument is passed: each word contributes exactly one count, so
# there is nothing to estimate an interval from, and this keeps the call in
# line with the bottom-ten plot.
ax_t10 = sns.barplot(x="word", y="count", data=top_10, palette="coolwarm")
# Tilt the word labels so they do not overlap.
ax_t10.set_xticklabels(ax_t10.get_xticklabels(), rotation=40, ha="right")
plt.show()

In [None]:
# Bar chart of the ten words with the fewest rows.
ax_b10 = sns.barplot(data=bottom_10, x="word", y="count", palette="BrBG")
# Re-apply the existing tick labels tilted, so the words do not overlap.
tilted_labels = ax_b10.get_xticklabels()
ax_b10.set_xticklabels(tilted_labels, rotation=40, ha='right')
plt.show()

In [None]:
# Plot how many rows of `train` fall under each value of the `recognized` column.
sns.countplot(x="recognized", data=train)
plt.show()

In [None]:
# Count the rows for each value of the `recognized` column.
train['recognized'].value_counts()

# 查看每个类别中识别正确和识别错误的数量

In [None]:
# Row counts for every (word, recognized) combination.
word_flag_counts = train.groupby(["word", "recognized"]).size().reset_index(name="count")

# Split the combinations into recognized and unrecognized rows, giving the
# columns distinct names so the two halves can sit side by side.
recognized_mask = word_flag_counts['recognized'].eq(True)
unrecognized_mask = word_flag_counts['recognized'].eq(False)
rec_true = word_flag_counts[recognized_mask].rename(
    index=str,
    columns={"recognized": "recognized_true", "count": "count_true"},
)
rec_false = word_flag_counts[unrecognized_mask].rename(
    index=str,
    columns={"recognized": "recognized_false", "count": "count_false"},
)

# One row per word: the recognized columns joined with the unrecognized ones.
true_by_word = rec_true.set_index('word')
false_by_word = rec_false.set_index('word')
rec_gp = true_by_word.join(false_by_word, on='word')
rec_gp

# 可视化输入数据

In [None]:
# Show the raw `drawing` value of the first row of `train`.
train['drawing'].iloc[0]

In [None]:
# Draw the first nine drawings of `train` in a 3x3 grid, each titled with
# its word.
words = train['word'].tolist()
# Each `drawing` value is a string literal; parse it into nested lists.
drawings = [ast.literal_eval(pts) for pts in train[:9]['drawing'].values]

plt.figure(figsize=(10, 10))
for i, drawing in enumerate(drawings):
    plt.subplot(3, 3, i + 1)
    # Every element of a drawing unpacks into an x sequence and a y sequence
    # that are plotted as one line (one stroke).
    for x, y in drawing:
        plt.plot(x, y, marker=".")
    # Per-subplot settings are applied once, after all strokes are drawn, so
    # they also take effect for a drawing that has no strokes.
    # NOTE(review): the y axis is flipped on the assumption that Quick Draw
    # coordinates have their origin at the top-left corner; without this the
    # doodles would be drawn upside down. Confirm against the dataset docs.
    plt.gca().invert_yaxis()
    plt.title(words[i])
    plt.axis('off')
# Lay out the whole grid once instead of after every stroke.
plt.tight_layout()
plt.show()