# All data

## Init

In [1]:
%reload_ext autoreload
%autoreload 2

In [2]:
import sys
sys.path.append('..')

In [3]:
import random
import pandas as pd
import seaborn as sns

from utils import examine_df, monitor
from sklearn.datasets.lfw import Bunch
from stags_collector import StagsCollector

In [4]:
%matplotlib inline
sns.set_style('darkgrid')

In [5]:
args = Bunch()

## Read

In [6]:
args.data_spec = [
    'docstring', 'stackoverflow', 'stepik'
]

In [7]:
def read_data(data_name):
    df = pd.read_hdf(f'{data_name}_data.hdf5')
    df['source'] = data_name
    return df

In [8]:
df = pd.concat([read_data(data_name) for data_name in args.data_spec], axis=0, ignore_index=True)
examine_df(df)

shape=(892713, 3)


Unnamed: 0_level_0,name,text,tags,source
Unnamed: 0_level_1,pos,0,1,2
head,0,load a Python source file and compile it to by...,,docstring
head,1,load a module without importing it DCNL _load_...,,docstring
head,2,Compare two code-objects. DCNL This is the mai...,,docstring
head,3,Compare a .pyc with a source code file.,,docstring
head,4,Find globals in this statement.,,docstring
tail,892708,There are two horses on a chess board and four...,easy python,stepik
tail,892709,Fizz Buzz is a classic programming problem. He...,medium python,stepik
tail,892710,"Find the number of ""Ds"", ""Cs"", ""Bs"" and ""As"" f...",hard python,stepik
tail,892711,Sum all input numbers in one pass func.,,stepik
tail,892712,You are given a string s consisting of n lower...,,stepik


## Stags

In [9]:
args.stags_total_up = 500
args.log_step = 5000

In [10]:
collector = StagsCollector(
    args.stags_total_up,
    monitor=monitor('[1.8] Collect stags', log_step=args.log_step),
    do_tqdm=True
)
stags = collector.fit_transform(df.text)
random.sample(stags, 3)

100%|██████████| 892713/892713 [13:08<00:00, 1132.27it/s]
100%|██████████| 892713/892713 [00:01<00:00, 483704.82it/s]


['', 'code', 'py']

In [11]:
df['stags'] = stags
df['stags'] = df.stags.fillna('NaN')
df = df[['text', 'tags', 'stags', 'source']].copy()
examine_df(df)

shape=(892713, 4)


Unnamed: 0_level_0,name,text,tags,stags,source
Unnamed: 0_level_1,pos,0,1,2,3
head,0,load a Python source file and compile it to by...,,file,docstring
head,1,load a module without importing it DCNL _load_...,,file,docstring
head,2,Compare two code-objects. DCNL This is the mai...,,,docstring
head,3,Compare a .pyc with a source code file.,,,docstring
head,4,Find globals in this statement.,,,docstring
tail,892708,There are two horses on a chess board and four...,easy python,,stepik
tail,892709,Fizz Buzz is a classic programming problem. He...,medium python,output,stepik
tail,892710,"Find the number of ""Ds"", ""Cs"", ""Bs"" and ""As"" f...",hard python,number program single-line,stepik
tail,892711,Sum all input numbers in one pass func.,,,stepik
tail,892712,You are given a string s consisting of n lower...,,string,stepik


In [12]:
print(collector._top_stags[:100])

['code', 'python', 'file', 'python-script', 'list', 'error', 'following-code', 'data', 'following-error', 'looks-like', 'function', 'django', 'best-way', 'like', 'better-way', 'python-code', 'use-code', 'use', 'works-fine', 'script', 'greatly-appreciated', 'text-file', 'csv-file', 'string', 'error-message', 'user', 'py', 'run', 'class', 'value', 'code-function', 'output', 'values', 'py-file', 'server', 'array', 'command-line', 'files', 'look-like', 'image', 'code-method', 'object', 'efficient-way', 'time', 'program', 'python-program', 'dictionary', 'create', 'line', 'numpy-array', 'app', 'code-file', 'module', 'method', 'text', 'pandas-dataframe', 'set', 'model', 'pythonic-way', 'use-python', 'tried', 'test', 'example', 'run-code', 'txt-file', 'form', 'dataframe', 'running', 'database', 'easy-way', 'pandas', 'problem', 'going-wrong', 'page', 'key', 'read', 'html', 'number', 'column', 'table', 'work', 'working-fine', 'api', 'plot', 'source-code', 'url', 'right-direction', 'numpy', 'foll

## Aftermath

In [13]:
args.shuffle = True

In [14]:
if args.shuffle:
    df = df.sample(frac=1).reset_index(drop=True)
examine_df(df, exclude='meta, math')

shape=(892713, 4)


Unnamed: 0_level_0,name,text,tags,stags,source
Unnamed: 0_level_1,pos,0,1,2,3
head,0,python forloop accessing GPIO pins DCNL I'm ne...,gpio python raspbian,code,stackoverflow
head,1,Configure the twisted mainloop to be run insid...,,,docstring
head,2,Analyzes the context that a completion is made...,,,docstring
head,3,Remember a single drawable tuple to paint later.,,,docstring
head,4,Using Scipy.optimize method='SLSQP' returns in...,numpy optimization python scipy,,stackoverflow
tail,892708,Regular expression forums in python with examp...,forums python regex,regular-expressions,stackoverflow
tail,892709,"KNN with TF-IDF Throwing ""Reshape your data"" W...",knn python,following-error,stackoverflow
tail,892710,Legacy initialization method. DCNL Parameters ...,,,docstring
tail,892711,Display Base Layers folder in the Map\'s Layer...,,,docstring
tail,892712,SQLAlchemy query filter behavior confusing in ...,mysql python sqlalchemy,query,stackoverflow


## Save

In [15]:
args.save_path = 'all_data.hdf5'

In [16]:
%time df.to_hdf(args.save_path, 'df', mode='w', format='f', complevel=9)

CPU times: user 2.02 s, sys: 764 ms, total: 2.79 s
Wall time: 3.38 s


In [17]:
!du -sh $args.save_path

427M	all_data.hdf5


## End

In [18]:
args

{'data_spec': ['docstring', 'stackoverflow', 'stepik'],
 'log_step': 5000,
 'save_path': 'all_data.hdf5',
 'shuffle': True,
 'stags_total_up': 500}