In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold

![nlp](https://user-images.githubusercontent.com/74188336/140924469-f7292676-d422-4871-98d3-59ce395e6e07.jpeg)

# Let's import NLTK

In [None]:
import nltk
# Fetch the NLTK 'stopwords' corpus; the cleaning cells further down read it via stopwords.words('english').
nltk.download('stopwords')

# What is stemming?

Stemming is the process of reducing inflected words to their root form, i.e. mapping a group of related words to the same stem, even if the stem itself is not a valid word in the language.

The stem (root) is the part of the word to which you add inflectional (changing/deriving) affixes such as -ed, -ize, -s, de-, mis-. So stemming a word or sentence may result in words that are not actual words. Stems are created by removing the suffixes or prefixes used with a word.



# Example of stemming

![stemming](https://user-images.githubusercontent.com/74188336/140915632-b20d6c02-86c6-41bb-a2fb-6f8abc8d19bd.png)

# Let's See the action of a stemmer

We will be using SnowballStemmer from the NLTK library for stemming.

You can also use other stemmers such as PorterStemmer.

# Import SnowballStemmer

In [None]:
from nltk.stem import SnowballStemmer

### Available Languages.

Stemmers are not available for all languages.
So, let us check which languages are available.

In [None]:
SnowballStemmer.languages

In [None]:
stemmer = SnowballStemmer('english')

### Let us check out how good it is.

We will take the words in the image shown above.


In [None]:
stemmer.stem('playing')

In [None]:
stemmer.stem('plays')

In [None]:
stemmer.stem('played')

### Great!! It performed perfectly

### For stopwords!!!

Stopwords are very common words that carry little meaning on their own.

Those include 'have', 'is', 'are' etc.

But sometimes, stemming the stopwords drives the meaning of a sentence in a completely different direction. Depending on the problem statement, we can either stem the stopwords or leave them unstemmed. For this problem, since the severity of a toxic comment depends on vulgar keywords, we might not be using stopwords at all :)

But still, let us see how a stemmer ignores stopwords. It's pretty simple :)

In [None]:
# A second stemmer built with ignore_stopwords=True, kept under its own name so it can be compared with `stemmer`.
stemmer_ignore_stopwords = SnowballStemmer('english', ignore_stopwords=True)

#### Stemming with the generalized stemmer

In [None]:
stemmer.stem('having')

The output is stemmed perfectly

#### Now let's use the stemmer that ignores stopwords

In [None]:
stemmer_ignore_stopwords.stem('having')

Great our problem is now solved !!! :D

# What is Lemmatization?

Lemmatization, unlike Stemming, reduces the inflected words properly ensuring that the root word belongs to the language. In Lemmatization root word is called Lemma. A lemma (plural lemmas or lemmata) is the canonical form, dictionary form, or citation form of a set of words.

For example, runs, running, ran are all forms of the word run, therefore run is the lemma of all these words. Because lemmatization returns an actual word of the language, it is used where it is necessary to get valid words.


We will be using **WordNetLemmatizer** for Lemmatizing

#### Let's download the 'wordnet' corpora

In [None]:
nltk.download('wordnet')

In [None]:
from nltk.stem import WordNetLemmatizer

In [None]:
lemmatizer = WordNetLemmatizer()

In [None]:
# Irregular plural as a quick check; no `pos` argument is passed, so the lemmatizer's default part of speech is used.
lemmatizer.lemmatize('feet')

### Now let's clean the data by removing unnecessary symbols and stopwords

In [None]:
df = pd.read_csv('../input/toxic-comments/train.csv')

In [None]:
df.head(2)

In [None]:
print(df['comment_text'].values[0])

### Printing shows that there are no unnecessary symbols

In [None]:
df['comment_text'].values[0]

#### But looking at the raw string in the array, we can see there are some escape sequences like '\n'.

#### Also, we won't be needing any digits, nor will we be needing the stopwords.

In [None]:
import re
from nltk.corpus import stopwords
from tqdm import tqdm

# creating a corpus with all the comments
corpus = []
df = df[:100]  # keep only the first 100 comments so this demo runs quickly

# These helpers do not change between comments, so build them once instead of
# once per comment (and, for the stopword set, once per word).
stemmer = SnowballStemmer('english')
lemmatizer = WordNetLemmatizer()
all_stopwords = stopwords.words('english')
stopword_set = set(all_stopwords)

for raw_comment in tqdm(df['comment_text']):
    # Replace every non-letter character (digits, punctuation, '\n', ...) with a space.
    comment = re.sub('[^a-zA-Z]', ' ', raw_comment)
    comment = comment.lower().split()
    # Drop stopwords and stem what is left; the stems are then lemmatized,
    # in the same order as the original pipeline so the output is unchanged.
    comment = [stemmer.stem(word) for word in comment if word not in stopword_set]
    comment = [lemmatizer.lemmatize(word) for word in comment]
    corpus.append(' '.join(comment))

In [None]:
corpus[0]

In [None]:
df['comment_text'].values[0]

### We can see how much the sentence has changed

# But why do we use Stemming and Lemmatizing and remove stopwords?

#### Well to answer that, let us see the number of unique words before and after the process

I have taken only the first 100 samples of the dataset

In [None]:
# Vocabulary size of the raw text: distinct whitespace-separated tokens, with no lower-casing or punctuation stripping.
len(set(np.hstack([sentence.split() for sentence in df['comment_text'].values])))

In [None]:
# The same distinct-token count, taken over the cleaned strings in `corpus`, for comparison.
len(set(np.hstack([sentence.split() for sentence in corpus])))

## Wow Almost Half!!!!

That will be enough to answer the question.

This reduces the number of unique words and hence reduces the dimensionality of the problem.

But **Not Always**. <br>
Some problems require the use of stopwords. For example, excluding the word 'not' from a sentence changes the sentiment of that sentence.

### Let's see a bad case of removing stopwords

In [None]:
# Run the cleaning pipeline on a single sentence to show how removing
# stopwords can change its meaning.
stemmer = SnowballStemmer('english')
lemmatizer = WordNetLemmatizer()
all_stopwords = stopwords.words('english')
stopword_set = set(all_stopwords)  # built once rather than once per token

comment = 'The food is not good'
# Replace non-letters with spaces, lower-case, and split into tokens.
tokens = re.sub('[^a-zA-Z]', ' ', comment).lower().split()
# Drop stopwords, stem the remaining tokens, then lemmatize the stems.
stems = [stemmer.stem(token) for token in tokens if token not in stopword_set]
comment = ' '.join(lemmatizer.lemmatize(stem) for stem in stems)
comment

### Complete opposite! :(
The original sentence was of negative sentiment : **'The food is not good'**

The output : **'food good'**

# Let's clean our data and save it as a Dataset for others to use :)

In [None]:
df = pd.read_csv('../input/toxic-comments/train.csv')

In [None]:
## Washing machine
corpus = []

# These helpers do not change between comments, so build them once up front.
# Re-creating them per comment (and the stopword set per word) is wasted work
# that adds up when the whole training set is processed.
stemmer = SnowballStemmer('english')
lemmatizer = WordNetLemmatizer()
all_stopwords = stopwords.words('english')
stopword_set = set(all_stopwords)

for raw_comment in tqdm(df['comment_text']):
    # Replace every non-letter character (digits, punctuation, '\n', ...) with a space.
    comment = re.sub('[^a-zA-Z]', ' ', raw_comment)
    comment = comment.lower().split()
    # Drop stopwords and stem what is left; the stems are then lemmatized,
    # in the same order as the original pipeline so the output is unchanged.
    comment = [stemmer.stem(word) for word in comment if word not in stopword_set]
    comment = [lemmatizer.lemmatize(word) for word in comment]
    corpus.append(' '.join(comment))

In [None]:
df['cleaned_comment'] = corpus

In [None]:
df.head()

#### Let's create a CSV of cleaned comments

In [None]:
# Write the frame, including the new 'cleaned_comment' column, to the working directory; index=False leaves the row index out of the file.
df.to_csv('cleaned_train.csv', index=False)

### function for generalization

In [None]:
def washing_machine(df):
    """Clean ``df['comment_text']`` and store the result in ``df['cleaned_comment']``.

    For each comment: every non-letter character is replaced by a space, the
    text is lower-cased and split on whitespace, English stopwords are
    dropped, the remaining tokens are stemmed with the Snowball stemmer and
    then passed through the WordNet lemmatizer, and the result is re-joined
    with single spaces.

    The NLTK 'stopwords' and 'wordnet' resources must already be downloaded.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``comment_text`` column of strings. Rows are processed
        in positional order, so the frame may carry any index (for example
        after filtering or shuffling).

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in, with a ``cleaned_comment``
        column added (the input is modified in place).
    """
    # Loop-invariant helpers: build them once, not once per comment/word.
    stemmer = SnowballStemmer('english')
    lemmatizer = WordNetLemmatizer()
    stopword_set = set(stopwords.words('english'))

    corpus = []
    # Iterate over the column's values rather than looking rows up by the
    # labels 0..n-1, which raises KeyError (or silently picks the wrong row)
    # when the index is not the default RangeIndex.
    for raw_comment in tqdm(df['comment_text']):
        words = re.sub('[^a-zA-Z]', ' ', raw_comment).lower().split()
        stems = [stemmer.stem(word) for word in words if word not in stopword_set]
        corpus.append(' '.join(lemmatizer.lemmatize(stem) for stem in stems))

    # Assigning a plain list is positional, so it lines up with the rows above.
    df['cleaned_comment'] = corpus
    return df

In [None]:
# cleaned_df = washing_machine(df)