# Lab 1: Text Preprocessing Using Basic Python

In [3]:
import string
# Sample paragraph that the tokenization step below splits into words.
# The leading and trailing spaces are part of the value; Python joins the
# adjacent string literals into one string.
sample_text = (
    " Natural Language Processing (NLP) is a subfield of linguistics, "
    "computer science, and artificial intelligence concerned with the "
    "interactions between computers and human language. "
    "It's used to analyze text, allowing machines to understand, "
    "interpret, and manipulate human language. "
    "NLP has many real-world applications, including machine translation, "
    "sentiment analysis, and chatbots. "
    "babies, companies "
)

In [4]:
# Display the raw text before any preprocessing. sample_text begins with a
# space, which is why two spaces follow the colon in the output.
print('Original Text:', sample_text)

Original Text:  Natural Language Processing (NLP) is a subfield of linguistics, computer science, and artificial intelligence concerned with the interactions between computers and human language. It's used to analyze text, allowing machines to understand, interpret, and manipulate human language. NLP has many real-world applications, including machine translation, sentiment analysis, and chatbots. babies, companies 


### 1. Tokenization: Split the text into individual words (tokens).

In [5]:
# Whitespace tokenization: with sep=None, split() breaks on runs of whitespace
# and ignores leading/trailing whitespace, so no empty tokens are produced.
# Punctuation stays attached to the neighbouring word (e.g. 'linguistics,').
tokens = sample_text.split(sep=None)
tokens

['Natural',
 'Language',
 'Processing',
 '(NLP)',
 'is',
 'a',
 'subfield',
 'of',
 'linguistics,',
 'computer',
 'science,',
 'and',
 'artificial',
 'intelligence',
 'concerned',
 'with',
 'the',
 'interactions',
 'between',
 'computers',
 'and',
 'human',
 'language.',
 "It's",
 'used',
 'to',
 'analyze',
 'text,',
 'allowing',
 'machines',
 'to',
 'understand,',
 'interpret,',
 'and',
 'manipulate',
 'human',
 'language.',
 'NLP',
 'has',
 'many',
 'real-world',
 'applications,',
 'including',
 'machine',
 'translation,',
 'sentiment',
 'analysis,',
 'and',
 'chatbots.',
 'babies,',
 'companies']

### 2. Lowercasing: Convert all tokens to lowercase.

In [6]:
# Normalize case by applying str.lower to every token, preserving order.
lowercase_tokens = list(map(str.lower, tokens))
lowercase_tokens

['natural',
 'language',
 'processing',
 '(nlp)',
 'is',
 'a',
 'subfield',
 'of',
 'linguistics,',
 'computer',
 'science,',
 'and',
 'artificial',
 'intelligence',
 'concerned',
 'with',
 'the',
 'interactions',
 'between',
 'computers',
 'and',
 'human',
 'language.',
 "it's",
 'used',
 'to',
 'analyze',
 'text,',
 'allowing',
 'machines',
 'to',
 'understand,',
 'interpret,',
 'and',
 'manipulate',
 'human',
 'language.',
 'nlp',
 'has',
 'many',
 'real-world',
 'applications,',
 'including',
 'machine',
 'translation,',
 'sentiment',
 'analysis,',
 'and',
 'chatbots.',
 'babies,',
 'companies']

### 3. Punctuation Removal: Remove all punctuation marks from the tokens.

In [7]:
# Show the characters that string.punctuation contains; the punctuation-removal
# step below strips exactly these characters from each token.
print(string.punctuation)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [16]:
# Translation table that deletes every character listed in string.punctuation
# (ASCII punctuation only; typographic quotes or dashes would be left in place).
punctuation_table = str.maketrans('', '', string.punctuation)

# Strip punctuation from each token. This also removes word-internal marks,
# so "it's" becomes "its" and "real-world" becomes "realworld".
stripped_tokens = (token.translate(punctuation_table) for token in lowercase_tokens)

# A token made up solely of punctuation (for example a standalone "-")
# collapses to an empty string; drop those so that later steps never
# receive empty tokens.
cleaned_tokens = [token for token in stripped_tokens if token]

cleaned_tokens

['natural',
 'language',
 'processing',
 'nlp',
 'is',
 'a',
 'subfield',
 'of',
 'linguistics',
 'computer',
 'science',
 'and',
 'artificial',
 'intelligence',
 'concerned',
 'with',
 'the',
 'interactions',
 'between',
 'computers',
 'and',
 'human',
 'language',
 'its',
 'used',
 'to',
 'analyze',
 'text',
 'allowing',
 'machines',
 'to',
 'understand',
 'interpret',
 'and',
 'manipulate',
 'human',
 'language',
 'nlp',
 'has',
 'many',
 'realworld',
 'applications',
 'including',
 'machine',
 'translation',
 'sentiment',
 'analysis',
 'and',
 'chatbots',
 'babies',
 'companies']

### 4. Stop Word Removal: Remove common stop words (e.g., "the", "is", "and").

In [9]:
# Small hand-written stop-word list (all lowercase, to match the lowercased
# tokens it is compared against).
stop_words = [
    # articles
    "the", "a", "an",
    # prepositions
    "in", "on", "at", "for", "to", "of",
    # conjunction and forms of "to be"
    "and", "is", "are",
    # further conjunctions
    "but", "if", "so",
]

In [17]:
# Build a set once so each membership test is a hash lookup instead of a
# linear scan of the stop_words list.
stop_word_set = set(stop_words)

# Keep, in their original order, only the tokens that are not stop words.
filtered_tokens = [token for token in cleaned_tokens if token not in stop_word_set]
filtered_tokens

['natural',
 'language',
 'processing',
 'nlp',
 'subfield',
 'linguistics',
 'computer',
 'science',
 'artificial',
 'intelligence',
 'concerned',
 'with',
 'interactions',
 'between',
 'computers',
 'human',
 'language',
 'its',
 'used',
 'analyze',
 'text',
 'allowing',
 'machines',
 'understand',
 'interpret',
 'manipulate',
 'human',
 'language',
 'nlp',
 'has',
 'many',
 'realworld',
 'applications',
 'including',
 'machine',
 'translation',
 'sentiment',
 'analysis',
 'chatbots',
 'babies',
 'companies']

### 5. Stemming: Reduce words to their root form using a simple algorithm.

In [11]:
# Suffix rewrite rules as (suffix, replacement) pairs, ordered by priority.
# The stemming loop applies only the first rule whose suffix matches, so
# longer, more specific suffixes are listed before the shorter ones they end with.
suffix_rules = [
    # Suffixes rewritten to a shorter ending
    ('ational', 'ate'),
    ('tional', 'tion'),
    ('enci', 'ence'),
    ('anci', 'ance'),
    ('izer', 'ize'),
    ('ator', 'ate'),
    ('alli', 'al'),
    ('entli', 'ent'),
    ('eli', 'e'),
    ('ousli', 'ous'),
    ('ization', 'ize'),
    ('ation', 'ate'),
    ('fulness', 'ful'),
    ('ousness', 'ous'),
    ('iveness', 'ive'),
    # Suffixes that are simply removed
    ('ing', ''),
    ('ed', ''),
    ('er', ''),
    ('est', ''),
    ('ly', ''),
    ('ment', ''),
    ('ness', ''),
    ('tion', ''),
    ('sion', ''),
    ('able', ''),
    ('ible', ''),
    ('al', ''),
    ('ful', ''),
    ('ous', ''),
    ('ive', ''),
    ('ic', ''),
    # Plural endings; 'ies' must be tried before the bare 's'
    ('ies', 'y'),
    ('s', ''),
]

In [18]:
stemmed_tokens = []
for word in filtered_tokens:
    stem = word
    # Only words longer than three characters are candidates for stemming.
    if len(word) > 3:
        # The first matching rule wins, following the order of suffix_rules.
        first_match = next(
            ((ending, substitute) for ending, substitute in suffix_rules
             if word.endswith(ending)),
            None,
        )
        if first_match is not None:
            ending, substitute = first_match
            stem = word[:-len(ending)] + substitute
    # Fall back to the unstemmed word if the result is shorter than two characters.
    stemmed_tokens.append(stem if len(stem) >= 2 else word)

stemmed_tokens

['natur',
 'language',
 'process',
 'nlp',
 'subfield',
 'linguistic',
 'comput',
 'science',
 'artifici',
 'intelligence',
 'concern',
 'with',
 'interaction',
 'between',
 'computer',
 'human',
 'language',
 'its',
 'us',
 'analyze',
 'text',
 'allow',
 'machine',
 'understand',
 'interpret',
 'manipulate',
 'human',
 'language',
 'nlp',
 'has',
 'many',
 'realworld',
 'application',
 'includ',
 'machine',
 'translate',
 'senti',
 'analysi',
 'chatbot',
 'baby',
 'company']

### 6. Lemmatization: Map common words to their base form (e.g., "is" -> "be", "are" -> "be").

In [13]:
# Hand-written lookup table mapping an inflected word to its base form (lemma).
lemma_dict = {
    # forms of "to be"
    'am': 'be',
    'is': 'be',
    'are': 'be',
    'was': 'be',
    'were': 'be',
    'been': 'be',
    'being': 'be',
    # irregular comparatives and superlatives
    'better': 'good',
    'best': 'good',
    'worse': 'bad',
    'worst': 'bad',
    # inflected verbs that occur in the sample text
    'allowing': 'allow',
    'including': 'include',
    'processing': 'process',
    'used': 'use',
    'concerned': 'concern',
}

In [19]:
# Lemmatization is applied to filtered_tokens (the stop-word-filtered tokens),
# not to the stemmed output. Tokens without an entry in lemma_dict pass
# through unchanged.
lemmatized_tokens = [lemma_dict.get(word, word) for word in filtered_tokens]

lemmatized_tokens

['natural',
 'language',
 'process',
 'nlp',
 'subfield',
 'linguistics',
 'computer',
 'science',
 'artificial',
 'intelligence',
 'concern',
 'with',
 'interactions',
 'between',
 'computers',
 'human',
 'language',
 'its',
 'use',
 'analyze',
 'text',
 'allow',
 'machines',
 'understand',
 'interpret',
 'manipulate',
 'human',
 'language',
 'nlp',
 'has',
 'many',
 'realworld',
 'applications',
 'include',
 'machine',
 'translation',
 'sentiment',
 'analysis',
 'chatbots',
 'babies',
 'companies']