# Common

In [None]:
import numpy as np
import pandas as pd
import random
import torch
from torchtext.legacy import data
import os

# Render matplotlib figures inline in the notebook output.
%matplotlib inline 
from IPython.core.interactiveshell import InteractiveShell
from IPython import get_ipython
# Show the value of every top-level expression in a cell, not only the last one.
get_ipython().ast_node_interactivity = 'all'

# Notebook-wide settings; 'seed' is the value passed to set_seed().
CONFIG = dict(seed=23)

def set_seed(seed=42):
    '''Seed every random number generator in use so results are reproducible.

    Seeds Python's ``random`` module, NumPy's global RNG and PyTorch (CPU and
    all CUDA devices), switches cuDNN to deterministic mode and records the
    seed in the ``PYTHONHASHSEED`` environment variable.

    Args:
        seed (int): Seed value applied to all generators.
    '''
    # Python's built-in RNG must be seeded too; NumPy/PyTorch seeding does not cover it.
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every CUDA device, not just the current one (no-op without CUDA).
    torch.cuda.manual_seed_all(seed)
    # When running on the CuDNN backend, two further options must be set
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # Set a fixed value for the hash seed.
    # NOTE: the interpreter reads PYTHONHASHSEED at start-up, so this only
    # affects child processes started afterwards, not the running kernel.
    os.environ['PYTHONHASHSEED'] = str(seed)
    
# Seed the RNGs up front using the notebook-wide seed from CONFIG.
set_seed(CONFIG['seed'])

##### Make mini training data

In [52]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Render matplotlib figures inline in the notebook output.
%matplotlib inline 
from IPython.core.interactiveshell import InteractiveShell
from IPython import get_ipython
# Show the value of every top-level expression in a cell, not only the last one.
get_ipython().ast_node_interactivity = 'all'

def p(t=''):
    """Print ``t`` framed by dashes as a section banner, followed by a blank line."""
    rule = '-------'
    # One write: the banner line plus the trailing empty line.
    print(rule + t + rule, end='\n\n')
    
# Settings for this section; 'seed' is used as the random_state when sampling.
CONFIG = dict(seed=23)

##### READ data

In [53]:
# Load the raw training CSV, derive a binary target `y` (1 when the row sum of
# the six label columns is > 0, else 0), and keep only the text and target.
df = (
    pd.read_csv("../data/raw/jigsaw-toxic-comment-classification-challenge/train.csv")
    .assign(
        y=lambda frame: frame[
            ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
        ].sum(axis=1).gt(0).astype(int)
    )
    .loc[:, ['comment_text', 'y']]
    .rename(columns={'comment_text': 'text'})
)
# Peek at five random rows.
df.sample(5)

Unnamed: 0,text,y
14795,Queens of the stone age \nHere's just a big th...,0
95894,MySpace?\nIs the MySpace page that is linked a...,0
58770,I would defer to KateFan's judgement on this m...,0
131779,== prospider is a fag,1
141119,"""\n\nAnd you did that in 8 lines !!! Anyway, ""...",0


In [54]:
# Sanity check: list the distinct values present in the y column.
list(df['y'].unique())

[0, 1]

In [63]:
def random_balance_subset(df, n=2000, label_col='y', random_state=None):
    """Draw a class-balanced random subset: ``n`` rows from every class.

    Args:
        df (pd.DataFrame): Input frame containing the label column.
        n (int): Number of rows to sample (without replacement) per class.
        label_col (str): Name of the column holding the class label,
            e.g. with values [0, 1].
        random_state (int | None): Seed passed to ``DataFrame.sample``. When
            ``None``, the notebook-wide ``CONFIG['seed']`` is used.

    Returns:
        pd.DataFrame: The sampled rows, one group per class in order of first
        appearance of each label in ``df``; original index values are kept.
        If ``df`` has no rows, an empty frame with the same columns is returned.

    Raises:
        ValueError: If any class has fewer than ``n`` rows.
    """
    labels = df[label_col].unique()
    if len(labels) == 0:
        # Nothing to sample; pd.concat([]) would fail with an unhelpful error.
        return df.iloc[0:0]

    if random_state is None:
        random_state = CONFIG['seed']

    samples = []
    for label in labels:
        rows = df[df[label_col] == label]
        if len(rows) < n:
            # Fail early with the offending class instead of pandas' generic message.
            raise ValueError(
                f"class {label!r} has only {len(rows)} rows; "
                f"cannot sample n={n} without replacement"
            )
        samples.append(rows.sample(n=n, random_state=random_state))
    return pd.concat(samples)
# Sample 2000 rows per class and confirm the resulting class counts.
df_mini = random_balance_subset(df, n=2000)
df_mini['y'].value_counts()
# Rename the target column and preview the result.
df_mini = df_mini.rename(columns={'y': 'label'})
df_mini.head()
# Replace the original row labels with a fresh 0..N-1 index and preview again.
df_mini = df_mini.reset_index(drop=True)
df_mini.head()


0    2000
1    2000
Name: y, dtype: int64

Unnamed: 0,text,label
32504,"Oppose For the sake of this decision, I don't ...",0
39965,REDIRECT Talk:Shabab Al-Bireh Institute,0
128463,Rutherford was a supporter of the Haultain gov...,0
66224,I didn't do it \n\nI didn't add improperly cit...,0
65530,"""Hang on a minute, scobey. I'm Irish. I'd just...",0


Unnamed: 0,text,label
0,"Oppose For the sake of this decision, I don't ...",0
1,REDIRECT Talk:Shabab Al-Bireh Institute,0
2,Rutherford was a supporter of the Haultain gov...,0
3,I didn't do it \n\nI didn't add improperly cit...,0
4,"""Hang on a minute, scobey. I'm Irish. I'd just...",0


In [64]:
# Display the balanced subset (pandas renders a truncated preview for long frames).
df_mini

Unnamed: 0,text,label
0,"Oppose For the sake of this decision, I don't ...",0
1,REDIRECT Talk:Shabab Al-Bireh Institute,0
2,Rutherford was a supporter of the Haultain gov...,0
3,I didn't do it \n\nI didn't add improperly cit...,0
4,"""Hang on a minute, scobey. I'm Irish. I'd just...",0
...,...,...
3995,", 26 April 2015 (UTC)\nnote restored stop fuck...",1
3996,"Hello, you turd. \n\nFirśt, I aṃ ģoinģ ţo ţie ...",1
3997,You stupid nigger \n\nYou stupid nigger You st...,1
3998,fuckbook \n\nit sucks hairy balls,1


In [65]:
# Write the balanced subset to train_mini.csv as UTF-8, without the index column.
df_mini.to_csv('train_mini.csv', index=False, encoding='utf-8')

In [70]:
# Count missing values in the text column of the full dataset.
df['text'].isna().sum()

0