In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
import nltk
from nltk.stem import SnowballStemmer, WordNetLemmatizer
from tqdm import tqdm
import re
from nltk.corpus import stopwords

In [None]:
def create_folds(df, num_splits):
    """Assign a stratified cross-validation fold id to every row of ``df``.

    Rows are split with ``StratifiedKFold`` (shuffled, ``random_state=42``)
    using the ``worker`` column as the stratification label. Each row receives
    the number of the fold in which it is a validation row.

    Args:
        df: DataFrame containing a ``worker`` column. A ``kfold`` column is
            written into this frame in place, so pass a copy if the caller's
            frame must stay untouched.
        num_splits: Number of folds to create.

    Returns:
        A new DataFrame: ``df`` without the ``worker`` column and with an
        integer ``kfold`` column holding values in ``0 .. num_splits - 1``.
    """
    skf = StratifiedKFold(n_splits=num_splits, shuffle=True, random_state=42)

    # skf.split yields *positional* row indices. Collect them in a positional
    # array instead of writing through df.loc, which is label-based and would
    # hit the wrong rows (or raise) whenever the index is not exactly 0..n-1.
    fold_ids = np.full(len(df), -1, dtype=int)
    for fold, (_, val_idx) in enumerate(skf.split(X=df, y=df["worker"])):
        fold_ids[val_idx] = fold

    df["kfold"] = fold_ids
    return df.drop("worker", axis=1)

In [None]:
# Load validation_data.csv from the jigsaw-toxic-severity-rating input folder.
df = pd.read_csv(
    filepath_or_buffer='../input/jigsaw-toxic-severity-rating/validation_data.csv'
)

In [None]:
# Peek at the first two rows of the loaded frame.
df.head(n=2)

# Cleaning the data.

* Stripping non-alphabetic characters (keeping only letters)
* Stemming
* Lemmatizing

To learn more about stemming and lemmatization, see: [All about Stemming and Lemmatization + Cleaning ⭐️](https://www.kaggle.com/kishalmandal/all-about-stemming-and-lemmatization-cleaning)

In [None]:
def washing_machine(comments):
    """Clean a sequence of comment strings.

    For every comment: replace each character outside ``a-z`` / ``A-Z`` with a
    space, lower-case the text, split it on whitespace, drop English stop
    words, Snowball-stem the remaining words, lemmatize the stems with
    WordNet, and join the result back together with single spaces.

    Args:
        comments: Iterable of strings, e.g. ``df['less_toxic'].values``.

    Returns:
        list of str: the cleaned comments, in input order.
    """
    # Loop invariants: build the NLP helpers and the stop-word set once,
    # not once per comment (or, for the set, once per word).
    stemmer = SnowballStemmer('english')
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))

    corpus = []
    for raw_comment in tqdm(comments):
        words = re.sub('[^a-zA-Z]', ' ', raw_comment).lower().split()
        # Stop words are filtered on the lower-cased word, before stemming.
        stems = [stemmer.stem(word) for word in words if word not in stop_words]
        corpus.append(' '.join(lemmatizer.lemmatize(stem) for stem in stems))

    return corpus

In [None]:
# df['cleaned_less_toxic'] = washing_machine(df['less_toxic'].values)
# df['cleaned_more_toxic'] = washing_machine(df['more_toxic'].values)

In [None]:
# Keep only `worker` (the stratification label used by create_folds) and the
# `less_toxic` / `more_toxic` columns; every other column is dropped.
df = df.loc[:, ['worker', 'less_toxic', 'more_toxic']]

In [None]:
# Check the first two rows after the column selection.
df.head(n=2)

# Creating 5 folds and 10 folds

In [None]:
# Load augmented_text.csv from the data-augmentation input folder.
df2 = pd.read_csv(
    filepath_or_buffer='../input/data-augmentation/augmented_text.csv'
)

In [None]:
# Give every augmented row the fold id -1, a value create_folds never assigns
# (its ids start at 0). Presumably this keeps these rows out of every
# validation fold -- confirm against the training code.
df2['kfold'] = -1
# Shuffle the rows with a fixed seed (same value as create_folds) so the saved
# file is identical across runs.
df2 = df2.sample(frac=1, random_state=42)

In [None]:
# Build the 5-fold split on a copy, so `df` itself keeps its `worker` column
# and gains no `kfold` column.
df_5 = create_folds(df=df.copy(), num_splits=5)


In [None]:
# Append the rows of df2 below the fold-assigned rows. ignore_index=True gives
# the result a fresh 0..n-1 index instead of duplicated labels from both frames.
# NOTE: df_5 is reassigned from itself, so re-running this cell on its own
# appends df2 a second time.
df_5 = pd.concat([df_5, df2], ignore_index=True)


In [None]:
# Inspect the last five rows, i.e. the end of the appended df2 part.
df_5.tail(n=5)

In [None]:
# Write the combined frame to 5folds.csv without the index column.
df_5.to_csv(path_or_buf='5folds.csv', index=False)
