In [None]:
!pip install iterative-stratification -q

In [None]:
import pandas as pd
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
import nltk
from nltk.stem import SnowballStemmer, WordNetLemmatizer
from tqdm import tqdm
import re
from nltk.corpus import stopwords

In [None]:
# Read the training CSV into `df`; the path is relative to the working directory.
df = pd.read_csv(filepath_or_buffer='../input/toxic-comments/train.csv')

In [None]:
# Display the column labels of the loaded frame.
df.columns

In [None]:
def washing_machine(comments):
    """Normalise raw comment strings into space-joined, stemmed tokens.

    Each comment is processed as follows: every character outside
    ``a-zA-Z`` is replaced by a space, the text is lower-cased and split on
    whitespace, English stop words are dropped, and each remaining token is
    run through the Snowball stemmer and then the WordNet lemmatizer.

    Args:
        comments: Iterable of comment strings (the notebook passes the
            ``.values`` array of a DataFrame column). Every element must be
            a ``str``; other values make ``re`` raise ``TypeError``.

    Returns:
        list[str]: One cleaned string per input comment, in input order. A
        comment with no surviving tokens yields an empty string.

    Note:
        Presumably requires the NLTK ``stopwords`` and ``wordnet`` corpora to
        be installed; the lookups are made through ``nltk`` at call time.
    """
    # These helpers do not depend on the individual comment, so build them
    # once. Previously the stop-word set was rebuilt for every single token.
    stemmer = SnowballStemmer('english')
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))
    non_letters = re.compile('[^a-zA-Z]')

    corpus = []
    for raw_comment in tqdm(comments):
        tokens = non_letters.sub(' ', raw_comment).lower().split()
        # Stop words are matched against the raw token, i.e. before stemming.
        cleaned = [
            lemmatizer.lemmatize(stemmer.stem(token))
            for token in tokens
            if token not in stop_words
        ]
        corpus.append(' '.join(cleaned))

    return corpus

In [None]:
# Store the cleaned text next to the raw text; `.values` hands the column to
# washing_machine as a plain array rather than a Series.
df['cleaned_comment_text'] = washing_machine(comments=df['comment_text'].values)

In [None]:
 def create_folds(data, num_splits):
     """Tag every row of ``data`` with a multilabel-stratified fold id.

     A ``kfold`` column is added (or overwritten) on ``data``. Each row
     receives the index of the split in which it appears in the validation
     part. Stratification uses the six toxicity label columns.

     Args:
         data: DataFrame containing ``cleaned_comment_text`` and the label
             columns ``toxic``, ``severe_toxic``, ``obscene``, ``threat``,
             ``insult`` and ``identity_hate``. It is modified in place.
         num_splits: Number of folds to create.

     Returns:
         The same ``data`` object, now carrying the integer ``kfold`` column.
     """
     label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat',
                   'insult', 'identity_hate']

     data['kfold'] = -1
     X = data['cleaned_comment_text']
     y = data[label_cols]
     # Shuffling with a fixed seed (42) is hard-coded here.
     mskf = MultilabelStratifiedKFold(n_splits=num_splits, shuffle=True, random_state=42)

     # split() yields positional row numbers, not index labels, so write the
     # fold id through iloc. Using loc is only right for a default 0..n-1
     # index and breaks on any filtered or re-indexed frame.
     kfold_pos = data.columns.get_loc('kfold')
     for fold, (trn_, val_) in enumerate(mskf.split(X, y)):
         data.iloc[val_, kfold_pos] = fold

     return data

In [None]:
# Build the 5-fold and 10-fold assignments on separate copies so that `df`
# itself never receives a `kfold` column.
df_5 = create_folds(data=df.copy(), num_splits=5)
df_10 = create_folds(data=df.copy(), num_splits=10)

In [None]:
# Preview the first rows of the 5-fold frame.
df_5.head()

In [None]:
# Number of rows assigned to each fold id in the 5-fold frame.
df_5['kfold'].value_counts()

In [None]:
# Number of rows assigned to each fold id in the 10-fold frame.
df_10['kfold'].value_counts()

In [None]:
# Write both fold assignments to the working directory, without the index column.
df_5.to_csv(path_or_buf='5folds.csv', index=False)
df_10.to_csv(path_or_buf='10folds.csv', index=False)