<a href="https://colab.research.google.com/github/spencerduberry/Python_csv/blob/main/Stemming.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [20]:
# prompt: import pandas as pd

import pandas as pd
from collections import (
    Counter,
)  # collections provides specialised data structures; Counter tracks how many times items appear in a sequence
import re  # allows handling of text data based on patterns rather than exact characters
import nltk  # natural language toolkit
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer  # stemming library
from nltk.tokenize import word_tokenize

# Fetch the NLTK resources used below (NLTK reports already-installed
# packages as up to date instead of downloading them again).
nltk.download("stopwords")
nltk.download("punkt")  # Punkt sentence-boundary model used by word_tokenize
# NLTK 3.8.2 and later load the tokenizer from the "punkt_tab" resource
# rather than the pickled "punkt" model; without it word_tokenize raises
# LookupError, so fetch it as well.
nltk.download("punkt_tab")
stop_words = set(stopwords.words("english"))  # a set keeps membership tests cheap
stemmer = PorterStemmer()  # shared Porter stemmer instance used by stem_words

# NOTE(review): latin-1 is presumably needed because the file is not valid
# UTF-8 -- confirm against the dataset.
df = pd.read_csv("spam.csv", encoding="latin-1")

# Binarise the label column: 1 for "spam", 0 for every other value.
# The comparison is vectorised, avoiding a Python-level call per row.
df["v1"] = (df["v1"] == "spam").astype("int64")


def tokenize(text):
    """Split *text* into lowercase, letters-only word tokens.

    The text is lowercased so that differently-cased spellings of a word
    are counted as the same term. Every character other than a-z or
    whitespace is then deleted (not replaced by a space), and what remains
    is split on runs of whitespace.
    """
    letters_only = re.sub(r"[^a-z\s]", "", text.lower())
    return letters_only.split()


def remove_stopwords(text):
    """Return *text* with stop words dropped.

    The text is tokenised with ``word_tokenize``; each token is lowercased
    and checked against the module-level ``stop_words`` set, and the tokens
    that are not in the set are joined back together with single spaces
    (in their original casing).
    """
    kept_tokens = (
        token for token in word_tokenize(text) if token.lower() not in stop_words
    )
    return " ".join(kept_tokens)


def stem_words(text):
    """Return *text* with every token replaced by its stem.

    The text is tokenised with ``word_tokenize``, each token is passed
    through the module-level ``stemmer``, and the stems are joined back
    together with single spaces.
    """
    tokens = word_tokenize(text)
    return " ".join(map(stemmer.stem, tokens))


# Build the cleaned_text column: stop words are removed from each message
# first, then the remaining tokens are stemmed.
df["cleaned_text"] = df["v2"].apply(
    lambda message: stem_words(remove_stopwords(message))
)

print(df.head()[["v1", "v2", "cleaned_text"]])

# Split the cleaned messages by label (1 = spam, 0 = ham).
spam_docs = df.loc[df["v1"] == 1, "cleaned_text"]
ham_docs = df.loc[df["v1"] == 0, "cleaned_text"]

# Merge each class into one space-separated string, then tokenise it.
spam_text = " ".join(spam_docs)
ham_text = " ".join(ham_docs)
spam_tokens = tokenize(spam_text)
ham_tokens = tokenize(ham_text)

# Per-class term frequencies.
spam_term_index = Counter(spam_tokens)
ham_term_index = Counter(ham_tokens)

print("Most common terms in spam messages:")
print(spam_term_index.most_common(20))

print("\nMost common terms in ham messages:")
print(ham_term_index.most_common(20))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


   v1                                                 v2  \
0   0  Go until jurong point, crazy.. Available only ...   
1   0                      Ok lar... Joking wif u oni...   
2   1  Free entry in 2 a wkly comp to win FA Cup fina...   
3   0  U dun say so early hor... U c already then say...   
4   0  Nah I don't think he goes to usf, he lives aro...   

                                        cleaned_text  
0  go jurong point , crazi .. avail bugi n great ...  
1                      ok lar ... joke wif u oni ...  
2  free entri 2 wkli comp win fa cup final tkt 21...  
3        u dun say earli hor ... u c alreadi say ...  
4         nah n't think goe usf , live around though  
Most common terms in spam messages:
[('call', 369), ('free', 221), ('txt', 170), ('u', 167), ('ur', 144), ('text', 140), ('mobil', 136), ('stop', 119), ('claim', 115), ('repli', 112), ('prize', 94), ('get', 88), ('s', 82), ('p', 79), ('tone', 74), ('nokia', 72), ('new', 72), ('servic', 72), ('send', 70), ('a