# **Text Preprocessing with Basic Python**

**1. Input Text and Tokenization**

In [None]:
# Sample paragraph that feeds every preprocessing step in this notebook.
# It is assembled from four lines joined by newline characters.
text = "\n".join((
    "Natural Language Processing (NLP) is a subfield of linguistics, computer science, and",
    "artificial intelligence concerned with the interactions between computers and human language.",
    "It's used to analyze text, allowing machines to understand, interpret, and manipulate human language.",
    "NLP has many real-world applications, including machine translation, sentiment analysis, and chatbots.",
))

# Whitespace tokenization: split() with no separator breaks on any run of
# whitespace (spaces and newlines alike), so punctuation stays attached to words.
text_token = str.split(text)
print("Step 1 - Tokenization", text_token)

Step 1 - Tokenization ['Natural', 'Language', 'Processing', '(NLP)', 'is', 'a', 'subfield', 'of', 'linguistics,', 'computer', 'science,', 'and', 'artificial', 'intelligence', 'concerned', 'with', 'the', 'interactions', 'between', 'computers', 'and', 'human', 'language.', "It's", 'used', 'to', 'analyze', 'text,', 'allowing', 'machines', 'to', 'understand,', 'interpret,', 'and', 'manipulate', 'human', 'language.', 'NLP', 'has', 'many', 'real-world', 'applications,', 'including', 'machine', 'translation,', 'sentiment', 'analysis,', 'and', 'chatbots.']


**2. Lowercasing**

In [None]:
# Fold every token to lowercase so that case variants such as "NLP" and "nlp"
# end up as the same token in the later steps.
token_lower = list(map(str.lower, text_token))
print("Step 2 - Lowercasing", token_lower)

Step 2 - Lowercasing ['natural', 'language', 'processing', '(nlp)', 'is', 'a', 'subfield', 'of', 'linguistics,', 'computer', 'science,', 'and', 'artificial', 'intelligence', 'concerned', 'with', 'the', 'interactions', 'between', 'computers', 'and', 'human', 'language.', "it's", 'used', 'to', 'analyze', 'text,', 'allowing', 'machines', 'to', 'understand,', 'interpret,', 'and', 'manipulate', 'human', 'language.', 'nlp', 'has', 'many', 'real-world', 'applications,', 'including', 'machine', 'translation,', 'sentiment', 'analysis,', 'and', 'chatbots.']


**3. Punctuation Removal**

In [None]:
import string

# Build the deletion table once; it is the same for every token, so there is
# no reason to rebuild it inside the comprehension.
punct_table = str.maketrans('', '', string.punctuation)

# Delete every ASCII punctuation character from each token. Note that this also
# removes punctuation inside a word ("it's" -> "its", "real-world" -> "realworld").
# A token made up only of punctuation (e.g. a stand-alone "-") collapses to ""
# and is dropped so that empty strings do not flow into the later steps.
token_no_punct = [
    cleaned
    for cleaned in (token.translate(punct_table) for token in token_lower)
    if cleaned
]
print("Step 3 - Punctuation removal:", token_no_punct)

Step 3 - Punctuation removal: ['natural', 'language', 'processing', 'nlp', 'is', 'a', 'subfield', 'of', 'linguistics', 'computer', 'science', 'and', 'artificial', 'intelligence', 'concerned', 'with', 'the', 'interactions', 'between', 'computers', 'and', 'human', 'language', 'its', 'used', 'to', 'analyze', 'text', 'allowing', 'machines', 'to', 'understand', 'interpret', 'and', 'manipulate', 'human', 'language', 'nlp', 'has', 'many', 'realworld', 'applications', 'including', 'machine', 'translation', 'sentiment', 'analysis', 'and', 'chatbots']


**4. Stop Word Removal**

In [None]:
# Small hand-picked stop-word list (kept as a list so it stays easy to edit
# and keeps its type for any other cell that uses it).
stop_words = ["the", "a", "an", "in", "on", "at", "for", "to", "of", "and", "is", "are"]

# Membership is tested once per token, so look tokens up in a hashed set
# instead of scanning the list every time.
stop_word_set = frozenset(stop_words)
token_no_stopwords = [token for token in token_no_punct if token not in stop_word_set]

print("Step 4 - Stop Word Removal:", token_no_stopwords)

Step 4 - Stop Word Removal: ['natural', 'language', 'processing', 'nlp', 'subfield', 'linguistics', 'computer', 'science', 'artificial', 'intelligence', 'concerned', 'with', 'interactions', 'between', 'computers', 'human', 'language', 'its', 'used', 'analyze', 'text', 'allowing', 'machines', 'understand', 'interpret', 'manipulate', 'human', 'language', 'nlp', 'has', 'many', 'realworld', 'applications', 'including', 'machine', 'translation', 'sentiment', 'analysis', 'chatbots']


**5. Stemming**

In [None]:
def simple_stem(token, min_stem_len=3):
    """Strip one common English suffix ('ing', 'ed' or 's') from ``token``.

    This is a deliberately naive suffix stripper, not a real stemmer: it
    removes at most one suffix and applies no spelling repair, so results
    such as ``"including" -> "includ"`` are expected.

    Args:
        token: The word to stem.
        min_stem_len: Minimum number of characters that must remain after
            the suffix is removed. If the remainder would be shorter, the
            token is returned unchanged. This keeps short words intact
            ("has", "used", "its") and prevents a token that *is* a suffix
            ("ing", "s") from turning into an empty string. Pass ``0`` to
            disable the length check.

    Returns:
        The token without its suffix, or the token itself when no suffix
        applies or stripping is rejected by one of the guards.
    """
    for suffix in ('ing', 'ed', 's'):
        if not token.endswith(suffix):
            continue
        # A double "s" is part of the word, not a plural marker
        # ("class", "process"), so leave it alone.
        if suffix == 's' and token.endswith('ss'):
            return token
        stem = token[:-len(suffix)]
        # Reject stems that are too short to be meaningful.
        return stem if len(stem) >= min_stem_len else token
    return token

# Run the suffix stripper over every token that survived stop-word removal.
stemmed_tokens = list(map(simple_stem, token_no_stopwords))

print("Step 5 - Stemming", stemmed_tokens)

Step 5 - Stemming ['natural', 'language', 'process', 'nlp', 'subfield', 'linguistic', 'computer', 'science', 'artificial', 'intelligence', 'concern', 'with', 'interaction', 'between', 'computer', 'human', 'language', 'it', 'us', 'analyze', 'text', 'allow', 'machine', 'understand', 'interpret', 'manipulate', 'human', 'language', 'nlp', 'ha', 'many', 'realworld', 'application', 'includ', 'machine', 'translation', 'sentiment', 'analysi', 'chatbot']


**6. Bonus Step: Lemmatization**

In [None]:
# Hand-written lookup table from an inflected word form to its lemma.
# It is declared as (lemma, forms) groups and flattened into a form -> lemma dict,
# so each lemma is spelled out only once.
lemma_dict = {
    form: lemma
    for lemma, forms in (
        ("be", ("is", "are", "was", "were")),
        ("have", ("has", "had")),
        ("do", ("does", "did")),
        ("machine", ("machines",)),
        ("computer", ("computers",)),
        ("interaction", ("interactions",)),
        ("language", ("languages",)),
        ("use", ("used",)),
        ("allow", ("allowing",)),
        ("application", ("applications",)),
        ("include", ("including",)),
    )
    for form in forms
}

# Replace each token with its lemma when the table has one; otherwise keep it.
# The lookup runs on the un-stemmed tokens on purpose: lemma_dict is keyed by
# full word forms ("has", "used", "machines"), and stemming has already cut
# those down ("ha", "us", "machine"), so looking up stemmed tokens never matches
# and the step would silently do nothing.
lemmatized_tokens = [lemma_dict.get(token, token) for token in token_no_stopwords]

print("Bonus Step - Lemmatized Tokens:", lemmatized_tokens)

Bonus Step - Lemmatized Tokens: ['natural', 'language', 'process', 'nlp', 'subfield', 'linguistic', 'computer', 'science', 'artificial', 'intelligence', 'concern', 'with', 'interaction', 'between', 'computer', 'human', 'language', 'it', 'us', 'analyze', 'text', 'allow', 'machine', 'understand', 'interpret', 'manipulate', 'human', 'language', 'nlp', 'ha', 'many', 'realworld', 'application', 'includ', 'machine', 'translation', 'sentiment', 'analysi', 'chatbot']
