In [None]:
# This notebook explores the character distribution of description.

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import unicodedata
from sklearn.feature_extraction.text import CountVectorizer
from matplotlib import pyplot

Let's look at the character distribution of the description by counting all the characters with a CountVectorizer.

In [None]:
# Read only the columns this analysis uses; from the test file that is just
# the description text.
train = pd.read_csv('../input/train.csv', usecols=['description', 'deal_probability'])
test = pd.read_csv('../input/test.csv', usecols=['description'])

# Stack both splits and renumber the rows 0..n-1 in one step, so that row
# positions of the frame match the rows of anything computed from it.
df = pd.concat((train, test), ignore_index=True)

In [None]:
# Count single characters rather than word tokens. Case is preserved, so
# upper- and lower-case letters stay separate features.
charvec_params = dict(analyzer='char', lowercase=False, max_df=1.0, min_df=1)
charvec = CountVectorizer(**charvec_params)

In [None]:
# Missing descriptions are replaced by empty strings before counting.
descriptions = df['description'].fillna('')
char_counts = charvec.fit_transform(descriptions)

We got 1749 different characters. Neat!

In [None]:
# Display the count matrix returned by fit_transform.
char_counts

We can add them all together to get the total character distribution from all the descriptions.

In [None]:
# Newer scikit-learn releases expose the vocabulary through
# get_feature_names_out(); get_feature_names() only exists on older ones,
# so use whichever this installation provides.
if hasattr(charvec, 'get_feature_names_out'):
    feature_chars = list(charvec.get_feature_names_out())
else:
    feature_chars = charvec.get_feature_names()

# Summing over axis 0 collapses the rows (descriptions) and leaves one total
# per character; flatten the 1 x n result into a plain 1-D vector.
totals = pd.DataFrame(
    np.asarray(char_counts.sum(axis=0)).ravel(),
    index=feature_chars,
    columns=['cnt']
)

We can also use unicodedata to capture some meta information about the characters.

In [None]:
# Code point and Unicode category code of every character in the index.
totals['ord'] = totals.index.map(ord)
totals['cat'] = totals.index.map(unicodedata.category)

In [None]:
def extract_name(x):
    """Return the Unicode name of the character ``x``, or ``None``.

    Tab and line feed are answered from fixed strings; every other
    character is looked up with ``unicodedata.name``.

    Returns ``None`` when ``unicodedata`` has no name for the character, or
    when ``x`` is not something ``unicodedata.name`` accepts (for example a
    string longer than one character).
    """
    try:
        if '\t' == x:
            return 'CHARACTER TABULATION'
        if '\n' == x:
            return 'LINE FEED'
        # Passing a default makes the "no name" case return None instead of
        # raising ValueError.
        return unicodedata.name(x, None)
    except (TypeError, ValueError):
        # Only lookup failures are treated as "no name"; KeyboardInterrupt
        # and other unrelated exceptions are no longer swallowed.
        return None
    
# Look up a name for every character in the index and store it as a column.
char_names = totals.index.map(extract_name)
totals['name'] = char_names

In [None]:
# Replace missing names with an empty string. Assign the result back rather
# than calling fillna(inplace=True) on totals['name']: that selection can be
# a temporary copy, in which case the in-place fill never reaches `totals`.
totals['name'] = totals['name'].fillna('')

The two-letter codes in *cat* stand for different character sets (Unicode general categories). For example, Ll stands for lowercase letter and Zs stands for space separator. You can look up each category [here](https://en.wikipedia.org/wiki/Unicode_character_property).

In [None]:
# Per category: the number of distinct characters ('count') and their total
# number of occurrences ('sum').
r = totals.groupby('cat')['cnt'].agg(['count', 'sum'])
r.sort_values(by='sum', ascending=False)

The majority is lower case letters, spaces, punctuation and upper case letters. There are also some numeric characters.

In [None]:
# Divide each column by its own total so both bars show shares of the whole.
shares = r.div(r.sum())
shares.sort_values(by='sum', ascending=False).plot(kind='bar', figsize=(12, 4))

Let's look at the mean of deal_probability for different character set counts, for example punctuation.

In [None]:
# Row positions in `totals` of every character whose category is 'Po'.
is_po = (totals['cat'] == 'Po').values
charset_idx = np.arange(totals.shape[0])[is_po]

Let's bin the input to draw a neato chart

In [None]:
# Per-description total of the characters selected by charset_idx. Summing
# the sparse slice along axis 1 gives an (n, 1) result, so flatten it to a
# plain 1-D vector before storing it as a column.
charset_totals = np.asarray(char_counts[:, charset_idx].sum(axis=1)).ravel()
# floor(log2(count + 1)) groups the counts into bins 0, 1-2, 3-6, 7-14, ...
df['charset_cnt'] = np.log2(charset_totals + 1).astype(int)

In [None]:
# Mean deal probability within each bin of the binned count.
bin_means = df.groupby('charset_cnt')['deal_probability'].mean()
bin_means.plot(kind='bar', color='#7777ac')

In [None]:
# Drop the demo column; otherwise the later 'charset_' prefix filter over
# df.columns would pick it up too.
df.drop(columns=['charset_cnt'], inplace=True)

We can do this for all character sets eventually.

In [None]:
# Build one log2-binned count feature per category.
# The position vector and the category labels do not change inside the loop,
# so compute them once up front.
positions = np.arange(totals.shape[0])
categories = totals['cat'].values
for cat in totals['cat'].unique():
    print(cat)
    feature = 'charset_{}_cnt'.format(cat)
    charset_idx = positions[categories == cat]
    # The row sum of the sparse slice is (n, 1)-shaped; flatten it so the
    # column is assigned from a plain 1-D vector.
    cat_totals = np.asarray(char_counts[:, charset_idx].sum(axis=1)).ravel()
    df[feature] = np.log2(cat_totals + 1).astype(int)

In [None]:
# Number of distinct category codes among the characters.
nu_cats = totals['cat'].nunique()

In [None]:
# Display the number of distinct categories.
nu_cats

In [None]:
# Names of all columns in df that carry the 'charset_' prefix.
charset_cols = [col for col in df.columns if col.startswith('charset_')]

In [None]:
# Largest bin value reached by each charset feature, biggest first.
charset_max = df[charset_cols].max()
max_vals = charset_max.sort_values(ascending=False)

In [None]:
# How many features never exceed bin 8 (shown as a one-element shape tuple).
max_vals.loc[max_vals <= 8].index.shape

In [None]:
# Features whose largest bin exceeds 8: for every bin, plot the share of
# rows falling into it ('pcnt_cnt') next to the mean deal probability ('mean').
feats = max_vals[max_vals > 8].index
n_cols = 3
# Keep the 3 x 3 layout as the minimum, but add rows when there are more
# features than panels so that axes[k] cannot run out of range.
n_rows = max(3, (len(feats) + n_cols - 1) // n_cols)
f, axes = pyplot.subplots(n_rows, n_cols, sharey=True, figsize=(15, 10 * n_rows / 3))
axes = axes.flatten()
for k, feat in enumerate(feats):
    r = df.groupby(feat).deal_probability.agg(['count', 'mean'])
    r['pcnt_cnt'] = r['count'] / r['count'].sum()
    r[['pcnt_cnt', 'mean']].plot(kind='bar', color=['#667799', '#aa3366'], ax=axes[k], title=feat)
    axes[k].set_xlabel('')

In [None]:
# Features whose largest bin is at most 8: for every bin, plot the share of
# rows falling into it ('pcnt_cnt') next to the mean deal probability ('mean').
feats = max_vals[max_vals <= 8].index
n_cols = 6
# Keep the 3 x 6 layout as the minimum, but add rows when there are more
# features than panels so that axes[k] cannot run out of range.
n_rows = max(3, (len(feats) + n_cols - 1) // n_cols)
f, axes = pyplot.subplots(n_rows, n_cols, sharey=True, figsize=(15, 10 * n_rows / 3))
axes = axes.flatten()
for k, feat in enumerate(feats):
    r = df.groupby(feat).deal_probability.agg(['count', 'mean'])
    r['pcnt_cnt'] = r['count'] / r['count'].sum()
    r[['pcnt_cnt', 'mean']].plot(kind='bar', color=['#667799', '#aa3366'], ax=axes[k], title=feat)
    axes[k].set_xlabel('')

In [None]:
# Store each character's row position within `totals`.
totals['idx'] = np.arange(len(totals))

In [None]:
# Display the row of `totals` for the exclamation mark.
totals.loc['!']

In [None]:
# Locate the '!' column through the position stored in totals['idx'] instead
# of a hard-coded number, and turn the sparse column slice into a dense 1-D
# vector so that every row of df receives its own count.
excl_idx = int(totals.loc['!', 'idx'])
df['!_cnt'] = char_counts[:, excl_idx].toarray().ravel()