# All data

## Init

### Imports

In [1]:
import pandas as pd

from sklearn.datasets.lfw import Bunch

### Args

In [2]:
args = Bunch()

## Read

In [3]:
args.data_spec = {
    'docstring', 'stackoverflow', 'stepik'
}
args.shuffle = True

In [4]:
def read_data(data_name):
    df = pd.read_hdf(f'{data_name}_data.hdf5')
    df['source'] = data_name
    return df

In [5]:
dfs = [read_data(data_name) for data_name in args.data_spec]
df = pd.concat(dfs, axis=0, ignore_index=True)
df.shape

(755566, 3)

In [6]:
df.head()

Unnamed: 0,text,tags,source
0,How can I find the full path to a font from it...,fonts osx photoshop python,stackoverflow
1,Get a preview JPEG of a PDF on Windows? DCNL I...,image pdf python windows,stackoverflow
2,Continuous Integration System for a Python Cod...,continuous-integration extreme-programming python,stackoverflow
3,cx_Oracle: How do I iterate over a result set?...,cx-oracle database oracle python sql,stackoverflow
4,Using 'in' to match an attribute of Python obj...,arrays iteration python,stackoverflow


## Preprocess

In [7]:
args.shuffle = True

In [8]:
if args.shuffle:
    df = df.sample(frac=1).reset_index(drop=True)
df.shape

(755566, 3)

In [9]:
df.head()

Unnamed: 0,text,tags,source
0,Calculate the extended gcd using a recursive f...,python recursion,stackoverflow
1,How to create a set of neighboring nodes based...,neighbours network-programming python set,stackoverflow
2,What is non-blocking generator DCNL This excer...,coroutine generator nonblocking python,stackoverflow
3,pandas read_csv not converting string to date ...,csv date pandas python,stackoverflow
4,Get the public email address of a Facebook use...,facebook-graph-api python,stackoverflow


## Analysis

In [10]:
df.describe()

Unnamed: 0,text,tags,source
count,755566,755566,755566
unique,755559,373813,3
top,Reverse proxy capable pure python webserver? D...,python,stackoverflow
freq,2,43304,607185


## Save

In [11]:
args.save_path = 'all_data.hdf5'

In [12]:
%time df.to_hdf(args.save_path, 'df', mode='w', format='f', complevel=9)

CPU times: user 1.3 s, sys: 680 ms, total: 1.98 s
Wall time: 2.49 s


In [13]:
!du -sh $args.save_path

394M	all_data.hdf5


## End

In [14]:
args

{'data_spec': {'docstring', 'stackoverflow', 'stepik'},
 'save_path': 'all_data.hdf5',
 'shuffle': True}