In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import nltk

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _, names in os.walk('/kaggle/input'):
    full_paths = (os.path.join(root, name) for name in names)
    for path in full_paths:
        print(path)



In [None]:
#Reading the data

data = pd.read_csv("/kaggle/input/sms-spam-collection-dataset/spam.csv",encoding='latin-1')
data.head()

In [None]:
#Removing the columns that are not needed

# Discard the three "Unnamed" columns, then give the two remaining columns
# descriptive names.
unused_columns = ["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"]
data = data.drop(columns=unused_columns).rename(
    columns={"v1": "label", "v2": "body_text"}
)

In [None]:
data.describe()

In [None]:
data.groupby("label").describe()

In [None]:
#The shape of the dataset

print("Input data has {} rows and {} columns".format(len(data), len(data.columns)))

In [None]:
#How many spam/ham are there

# Count the rows of each class first, then report them with the total.
n_spam = int((data['label'] == 'spam').sum())
n_ham = int((data['label'] == 'ham').sum())
print("Out of the total {} rows, {} are spam, {} are ham".format(len(data), n_spam, n_ham))

In [None]:
data.info()

In [None]:
#How much missing data is there

print("Number of null in label: {}".format(data['label'].isnull().sum()))
print("Number of null in text: {}".format(data['body_text'].isnull().sum()))

# **CLEANING THE TEXT**

# Removing punctuation and numbers to avoid confusion.

In [None]:
import string
string.punctuation

In [None]:
def remove_punct_num(text):
    """Return ``text`` with punctuation and digit characters removed.

    A character is dropped when it appears in ``string.punctuation`` or when
    ``str.isdigit()`` is true for it; every other character is kept in its
    original order.
    """
    kept_chars = []
    for ch in text:
        if ch in string.punctuation or ch.isdigit():
            continue
        kept_chars.append(ch)
    return "".join(kept_chars)

# Clean every message; the function is passed directly, no lambda wrapper needed.
data['body_text_clean'] = data['body_text'].apply(remove_punct_num)

data.head()

# Tokenization

Tokenization is a very common task in NLP. It is the task of chopping a character sequence into pieces, called tokens, while at the same time throwing away certain characters, such as punctuation.

In [None]:
import re

In [None]:
def tokenize(text):
    """Split ``text`` into tokens on runs of non-word characters.

    Args:
        text: The string to split.

    Returns:
        list[str]: The pieces of ``text`` between the separators. As with any
        ``re.split`` call, an empty string appears at the start or end of the
        list when ``text`` begins or ends with a separator, and ``[""]`` is
        returned for an empty input.
    """
    # Raw string: "\W" in a normal string literal is an invalid escape
    # sequence, which newer Python versions warn about.
    return re.split(r'\W+', text)

# Lower-case each cleaned message, then split it into tokens.
data['body_text_clean'] = data['body_text_clean'].apply(str.lower).apply(tokenize)

data.head()

# Removing Stop Words

A stop word is a commonly used word (such as “the”, “a”, “an”, “in”) that a search engine has been programmed to ignore, both when indexing entries for searching and when retrieving them as the result of a search query.

In [None]:
stopword = nltk.corpus.stopwords.words('english')

In [None]:
stopword[0:100:10]

In [None]:
def remove_stopwords(tokenized_list):
    """Return the tokens of ``tokenized_list`` that are not in ``stopword``.

    The order and any duplicates of the remaining tokens are preserved.
    """
    kept = []
    for token in tokenized_list:
        if token in stopword:
            continue
        kept.append(token)
    return kept

# Filter the stop words out of every token list.
data['body_text_clean'] = data['body_text_clean'].apply(remove_stopwords)

data.head()

# Stemming and Lemmatization

For grammatical reasons, documents are going to use different forms of a word, such as organize, organizes, and organizing. Additionally, there are families of derivationally related words with similar meanings, such as democracy, democratic, and democratization. In many situations, it seems as if it would be useful for a search for one of these words to return documents that contain another word in the set.

Stemming usually refers to a crude heuristic process that chops off the ends of words in the hope of achieving this goal correctly most of the time, and often includes the removal of derivational affixes. Lemmatization usually refers to doing things properly with the use of a vocabulary and morphological analysis of words, normally aiming to remove inflectional endings only and to return the base or dictionary form of a word, which is known as the lemma. If confronted with the token saw, stemming might return just s, whereas lemmatization would attempt to return either see or saw depending on whether the use of the token was as a verb or a noun. The two may also differ in that stemming most commonly collapses derivationally related words, whereas lemmatization commonly only collapses the different inflectional forms of a lemma.

It comes down to a trade-off between speed and accuracy. Stemming takes less time, but it is a less grammatical approach and hence sometimes gives errors.
Lemmatization takes more time, but is usually more accurate.

# Porter Stemmer

In [None]:
ps = nltk.PorterStemmer()
ps

In [None]:
print(ps.stem('grows'))
print(ps.stem('growing'))
print(ps.stem('grow'))

In [None]:
print(ps.stem('run'))
print(ps.stem('running'))
print(ps.stem('runner'))

In [None]:
print(ps.stem("fast"))
print(ps.stem("fasting"))
print(ps.stem("fastest"))

In [None]:
#Stemming our data

def stemming(input_text):
    """Return a list with ``ps.stem`` applied to each item of ``input_text``."""
    return list(map(ps.stem, input_text))

# Stem every token list and store the result in its own column.
data['body_text_stemmed'] = data['body_text_clean'].apply(stemming)

data.head(10)

# WordNet lemmatizer

In [None]:
wn = nltk.WordNetLemmatizer()
wn

In [None]:
print(ps.stem('meanness'))
print(ps.stem('meaning'))

In [None]:
print(wn.lemmatize('meanness'))
print(wn.lemmatize('meaning'))

In [None]:
print(ps.stem('thinking'))
print(ps.stem('thinker'))

In [None]:
print(wn.lemmatize('thinking'))
print(wn.lemmatize('thinker'))

In [None]:
def lemmatizing(input_text):
    """Return a list with ``wn.lemmatize`` applied to each item of ``input_text``."""
    return list(map(wn.lemmatize, input_text))

# Lemmatize every token list and store the result in its own column.
data['body_text_lemmatized'] = data['body_text_clean'].apply(lemmatizing)

data.head(10)

# Data Vectorization

Let us take our lemmatized text for the next step.

In [None]:
# Take an explicit copy so that later edits to data_vector operate on an
# independent frame rather than on a selection derived from `data`
# (this avoids pandas' SettingWithCopyWarning).
data_vector = data[["label", "body_text_lemmatized"]].copy()

In [None]:
data_vector.head()

In [None]:
len(data_vector)

# **Count Vectorization**

In [None]:
# Join each row's token list back into one space-separated string, so the
# column holds plain text for the vectorizers used afterwards.
# Done column-wise instead of looping over a hard-coded 5572 rows with
# chained indexing (data_vector[col][i] = ...), which breaks if the row
# count differs and is not a reliable way to assign in pandas.
data_vector = data_vector.assign(
    body_text_lemmatized=data_vector["body_text_lemmatized"].apply(" ".join)
)

In [None]:
data_vector.head()

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

count_vect = CountVectorizer()

X_counts = count_vect.fit_transform(data_vector["body_text_lemmatized"])

In [None]:
print(X_counts.shape)

In [None]:
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# replaces it. It returns an array, so convert to a list to print every name.
print(count_vect.get_feature_names_out().tolist())

In [None]:
X_counts_df = pd.DataFrame(X_counts.toarray())
X_counts_df

# N-Gram Vectorization

In [None]:
ngram_vect = CountVectorizer(analyzer='word', ngram_range=(2, 2))

In [None]:
X_counts2 = ngram_vect.fit_transform(data_vector["body_text_lemmatized"])

In [None]:
print(X_counts2.shape)

In [None]:
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# replaces it. It returns an array, so convert to a list to print every name.
print(ngram_vect.get_feature_names_out().tolist())

In [None]:
X_counts_df2 = pd.DataFrame(X_counts2.toarray())
X_counts_df2

# TF-IDF (Term Frequency-Inverse Document Frequency)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vect = TfidfVectorizer()

In [None]:
X_tfidf = tfidf_vect.fit_transform(data_vector["body_text_lemmatized"])

In [None]:
print(X_tfidf.shape)

In [None]:
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# replaces it. It returns an array, so convert to a list to print every name.
print(tfidf_vect.get_feature_names_out().tolist())

In [None]:
idf_df=pd.DataFrame(X_tfidf.toarray())
idf_df