# Stack Overflow data

## Init

### Imports

In [None]:
import os
import re
import nltk
import random
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from tqdm import tqdm
from hd_utils import monitor
from nltk.corpus import stopwords
from html.parser import HTMLParser
from sklearn.datasets.lfw import Bunch
from html_preprocessing import HTMLPreprocessor

### Plot

In [None]:
# Render matplotlib figures inline in the notebook and use seaborn's
# 'darkgrid' style for all subsequent plots.
%matplotlib inline
sns.set_style('darkgrid')

### Args

In [None]:
# Single container for every tunable parameter of this notebook; it is
# displayed in the final cell.
args = Bunch()

## Read

In [None]:
# Name of the dataset under the `stackoverflow` Kaggle account.
args.data_kaggle_name = 'pythonquestions'
# Upper bound on `len(body)`; questions with longer bodies are dropped below.
args.len_up = 10000

In [None]:
# Fetch the dataset with the Kaggle CLI; IPython substitutes
# `$args.data_kaggle_name` before the shell command runs.
!kaggle datasets download -d stackoverflow/$args.data_kaggle_name

In [None]:
# Location of the downloaded CSV files.
# NOTE(review): assumes the Kaggle CLI stores datasets under
# ~/.kaggle/datasets/<owner>/<name> -- confirm for the installed CLI version.
_base_path = '~/.kaggle/datasets/stackoverflow/{}'.format(args.data_kaggle_name)
questions_path, tags_path = (
    os.path.join(_base_path, csv_name)
    for csv_name in ('Questions.csv', 'Tags.csv')
)

In [None]:
# Load the questions table (the file is decoded as ISO-8859-1); `%time`
# reports how long the read takes. The trailing expression shows (rows, cols).
%time qdf = pd.read_csv(questions_path, encoding='ISO-8859-1')
qdf.shape

In [None]:
# Peek at the raw questions before any cleaning.
qdf.head()

In [None]:
# Drop the metadata columns that are not used below, lower-case the remaining
# column names, and discard questions whose body exceeds `args.len_up`.
qdf = (
    qdf
    .drop(columns=['OwnerUserId', 'CreationDate', 'Score'])
    .rename(columns=str.lower)
)
is_short = qdf.body.map(len) <= args.len_up
qdf = qdf[is_short].copy()
qdf.head()

In [None]:
# Distribution of title lengths; the trailing `;` suppresses the
# repr of the returned Axes object.
qdf.title.map(len).hist(bins=75);

In [None]:
# Distribution of body lengths after the `args.len_up` cut-off.
qdf.body.map(len).hist(bins=75);

## Text

In [None]:
# Step size passed to the `monitor` progress loggers below.
args.log_step = 5000

In [None]:
# Show one randomly drawn raw body next to its preprocessed version as a
# quick sanity check of HTMLPreprocessor (progress bar disabled).
sample = qdf.body.sample().iloc[0]
sample, HTMLPreprocessor(tqdm=False).fit_transform([sample])[0]

In [None]:
# Preprocess every question body with HTMLPreprocessor, reporting progress
# through a dedicated monitor.
# The monitor instance gets its own name on purpose: rebinding `monitor` would
# shadow the factory imported from hd_utils, so the later `@monitor(...)`
# decorator (and any re-run of this cell) would call the instance instead.
bodies_monitor = monitor('[1.6.1] Bodies filtering', args.log_step, len(qdf))
preprocessor = HTMLPreprocessor(bodies_monitor)
bodies = preprocessor.fit_transform(qdf.body)

In [None]:
# One document per question: the title and the preprocessed body joined by
# the ' DCNL ' separator token.
text = [' DCNL '.join(title_body) for title_body in zip(qdf.title, bodies)]
df = pd.DataFrame({'text': text})
df.head()

## Tags

In [None]:
# Load the tags table; 'latin1' is the same codec as the 'ISO-8859-1' used
# for the questions file. The trailing expression shows (rows, cols).
tdf = pd.read_csv(tags_path, encoding='latin1')
tdf.shape

In [None]:
# Peek at the raw tags table.
tdf.head()

In [None]:
# Lower-case the column names to match `qdf`.
tdf.rename({c: c.lower() for c in tdf.columns}, axis=1, inplace=True)
# Keep only tags whose question is still present in `qdf`. The body-length
# filter may have removed any number of questions, so every orphaned id has to
# go -- not just one of them -- and the expression must also work when there
# are no orphans at all.
tdf = tdf[tdf.id.isin(qdf.id)].copy()
tdf.head()

In [None]:
# Number of distinct tags (a missing value, if any, counts as one).
tdf.tag.nunique(dropna=False)

In [None]:
# The 20 most frequent tags with their occurrence counts.
tdf.tag.value_counts().head(20)

In [None]:
@monitor('[1.6.2] Join tags', args.log_step, len(tdf))
def join_tags(df, *, log):
    """Return a copy of `df` with a `tags` column added.

    Each row receives the alphabetically sorted, space-separated tags of the
    corresponding question. Rows of `df` are assumed to be in the same order
    as the rows of the global `qdf` (`df` is built by iterating over `qdf`),
    and the tags are read from the global `tdf`.

    Args:
        df: Frame with one row per question of `qdf`.
        log: Progress logger exposing `istep(i)`; presumably injected by the
            `monitor` decorator, since the caller does not pass it -- verify
            against hd_utils.

    Returns:
        A new DataFrame; the input frame is left untouched.
    """
    df = df.copy()
    tags = [[] for _ in range(len(df))]
    # Map question id -> row *position*. `qdf.index` still carries the labels
    # from before the body-length filter, so it cannot be used to index the
    # position-based `tags` list.
    id2pos = {id_: pos for pos, id_ in enumerate(qdf.id)}
    # Iterate over the two columns directly instead of `iterrows()`, which
    # builds a Series object for every row.
    id_tag_pairs = zip(tdf.id, tdf.tag)
    for i, (id_, tag) in enumerate(tqdm(id_tag_pairs, total=len(tdf))):
        pos = id2pos.get(id_)
        # Tags of questions that were filtered out of `qdf` are skipped.
        if pos is not None:
            tags[pos].append(tag)
        log.istep(i)
    # NOTE(review): a missing tag value (NaN) would make the join below raise;
    # confirm the tag column contains none.
    df['tags'] = [' '.join(sorted(id_tags)) for id_tags in tags]
    return df

In [None]:
# Attach the tags, then release the two source frames, which are no longer
# needed, before showing the resulting shape.
df = join_tags(df)
del qdf, tdf
df.shape

In [None]:
# Peek at the final text/tags table.
df.head()

## Save

In [None]:
args.data_path = 'stackoverflow_data.hdf5'
# The save cells below read `args.save_path`; define it so they do not fail
# with an AttributeError. `data_path` is kept for backward compatibility.
args.save_path = args.data_path

In [None]:
# Write the frame under key 'df', overwriting any existing file (mode='w'),
# in table format ('t') with the maximum compression level; `%time` reports
# how long the write takes.
%time df.to_hdf(args.save_path, 'df', mode='w', format='t', complevel=9)

In [None]:
# Report the on-disk size of the saved file.
!du -sh $args.save_path

## End

In [None]:
# Display every parameter used in this run.
args