In [1]:
import nltk
import os
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer


In [2]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\win10\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\win10\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\win10\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [9]:
data_dir = 'E:\\New folder (2)\\bbc\sport'

In [10]:
def tokenize_file(file_path):
    try:
        with open(file_path, 'r') as file:
            content = file.read()
            tokens = word_tokenize(content)
        return tokens
    except Exception as e:
        print(f"Error reading file '{file_path}': {str(e)}")
        return []


In [11]:
# Step 2: Filtering words based on specific affixes
def filter_words(tokens):
    filtered_words = {
        'ed': [],
        'ing': [],
        'es': [],
        's': [],
        'un': [],
        'in': []
    }
    for word in tokens:
        if word.endswith('ed'):
            filtered_words['ed'].append(word)
        elif word.endswith('ing'):
            filtered_words['ing'].append(word)
        elif word.endswith('es'):
            filtered_words['es'].append(word)
        elif word.endswith('s'):
            filtered_words['s'].append(word)
        elif word.startswith('un'):
            filtered_words['un'].append(word)
        elif word.startswith('in'):
            filtered_words['in'].append(word)
    return filtered_words

In [12]:
# Step 3: Removing stop words
def remove_stopwords(tokens):
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word.lower() not in stop_words]
    return filtered_tokens

# Step 4: Stemming words
def stem_words(tokens):
    stemmer = PorterStemmer()
    stemmed_tokens = [stemmer.stem(word) for word in tokens]
    return stemmed_tokens

# Step 5: Lemmatizing words
def lemmatize_words(tokens):
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return lemmatized_tokens

# Step 6: Finding and replacing common abbreviations
def replace_abbreviations(tokens):
    abbreviation_mapping = {
        'i.e.': 'that is',
        'e.g.': 'for example',
        'u.s.': 'United States'
    }
    replaced_tokens = [abbreviation_mapping[word.lower()] if word.lower() in abbreviation_mapping else word for word in tokens]
    abbreviated_words = [word for word in tokens if word.lower() in abbreviation_mapping]
    return replaced_tokens, abbreviated_words

# Step 7: Finding words with special symbols using regular expressions
def find_special_symbols(tokens):
    special_symbols = re.compile(r'[^\w\s]')
    special_words = [word for word in tokens if special_symbols.search(word)]
    return special_words

# Main pipeline function
def process_file(file_path):
    # Tokenization
    tokens = tokenize_file(file_path)
    print(f"File: {os.path.basename(file_path)} - Number of tokens: {len(tokens)}")

    # Filtering words based on specific affixes
    filtered_words = filter_words(tokens)
    for affix, word_list in filtered_words.items():
        print(f"Words ending with '{affix}': {word_list}")

    # Removing stop words
    num_words_before = len(tokens)
    tokens = remove_stopwords(tokens)
    num_words_after = len(tokens)
    print(f"Number of words before stop word removal: {num_words_before}, after: {num_words_after}")

    # Stemming words
    stemmed_tokens = stem_words(tokens)
    print(f"Number of words before stemming: {len(tokens)}, after: {len(stemmed_tokens)}")

    # Lemmatizing words
    lemmatized_tokens = lemmatize_words(tokens)
    print(f"Number of words before lemmatizing: {len(tokens)}, after: {len(lemmatized_tokens)}")

    # Finding and replacing common abbreviations
    replaced_tokens, abbreviated_words = replace_abbreviations(tokens)
    print(f"Abbreviated words: {abbreviated_words}")

    # Finding words with special symbols using regular expressions
    special_words = find_special_symbols(tokens)
    print(f"Words with special symbols: {special_words}")

# Process each file in the dataset
for filename in os.listdir(data_dir):
    if filename.endswith('.txt'):
        file_path = os.path.join(data_dir, filename)
        process_file(file_path)

File: 001.txt - Number of tokens: 235
Words ending with 'ed': ['smashed', 'struggled', 'contested', 're-focused']
Words ending with 'ing': ['hunting', 'setting', 'training', 'trailing', 'preparing', 'moving', 'training']
Words ending with 'es': ['hurdles', 'comes', 'hurdles', 'hurdles']
Words ending with 's': ['is', "'s", 'Championships', 'has', 'this', 'seconds', 'AAAs', 'as', 'As', 'as', 'is', 'has', 'years', 'has', 'success', 'owns', 'this', "'s", 'has', 'previous', 'seasons', 'has', 'attentions', 'pays', 'dividends', 'Indoors']
Words ending with 'un': []
Words ending with 'in': ['in', 'international', 'in', 'in', 'in']
Number of words before stop word removal: 235, after: 150
Number of words before stemming: 150, after: 150
Number of words before lemmatizing: 150, after: 150
Abbreviated words: []
Words with special symbols: ["'s", '.', '25-year-old', ',', '7.96', '.', '``', ',', "''", '.', '``', '.', '``', '.', "''", '.', ',', 'Scotland-born', 'fifth-fastest', '.', "'s", ',', '.', 

File: 021.txt - Number of tokens: 294
Words ending with 'ed': ['suspended', 'issued', 'failed', 'banned', 'finished', 'edged', 'wanted', 'maintained', 'accused', 'threatened', 'feigned', 'missed', 'included', 'failed']
Words ending with 'es': ['races', 'athletes']
Words ending with 's': ["'s", 'athletics', 'has', 'Athletics', 'cross', 'competitions', 'this', 'directors', 'was', 'this', "'s", 'Sports', 'Communications', "'s", 'Cross', 'Championships', 'was', 'as', 'this', 'agents', 'us', 'has', "'s", 'reports', 'cross', 'championships', 'trials', 'was']
Words ending with 'un': ['until']
Words ending with 'in': ['in', 'international', 'in', 'in', 'in', 'in', 'in', 'in', 'injury', 'in', 'in', 'in', 'in', 'injury']
Number of words before stop word removal: 294, after: 186
Number of words before stemming: 186, after: 186
Number of words before lemmatizing: 186, after: 186
Abbreviated words: []
Words with special symbols: ["'s", 'two-time', 'runner-up', '.', '(', ')', '.', '``', ',', "''", '

File: 036.txt - Number of tokens: 249
Words ending with 'ed': ['opened', 'challenged', 'ended', 'released', 'repeated', 'vowed', 'cleared', 'revealed']
Words ending with 'ing': ['doping', 'doping', 'performance-enhancing', 'hoping', 'stating', 'closing', 'Anti-Doping', 'coming', 'issuing']
Words ending with 'es': ['Jones', 'Jones', 'Jacques', 'Laboratories', 'Jones', 'Games', 'Jones', 'moves']
Words ending with 's': ['begins', 'claims', 'has', 'has', 'claims', 'says', 'is', 'medals', 'drugs', 'Olympics', 'was', 'medals', "'s", 'decisions', 'years', 'Olympics', 'years', 'allegations', 'Nichols', "'s", 'allegations', 'as', 'process', 'is', 'is', 'has', 'statements']
Words ending with 'un': ['under']
Words ending with 'in': ['investigation', 'into', 'into', 'innocent', 'interpretation', 'innocence', 'indictment', 'inconsistent']
Number of words before stop word removal: 249, after: 152
Number of words before stemming: 152, after: 152
Number of words before lemmatizing: 152, after: 152
Abb

File: 050.txt - Number of tokens: 289
Words ending with 'ed': ['charged', 'rejected', 'banned', 'charged', 'banned', 'charged', 'issued', 'discovered', 'decided', 'missed']
Words ending with 'es': ['charges', 'series', 'responses', 'substances', 'rules', 'athletes', 'charges', 'athletes', 'times', 'Games']
Words ending with 's': ['awaits', "'s", 'Kostas', 'Kenteris', 'Athletics', 'Federations', 'drugs', 'tests', 'Athens', 'explanations', 'Christos', 'Tzekos', 'has', 'Kenteris', 'Athens', 'Olympics', 'has', 'Tzekos', 'is', 'federations', 'drugs', 'tests', 'Kenteris', 'tests', 'Olympics', 'Athens']
Words ending with 'un': ['until']
Words ending with 'in': ['in', 'in', 'in', 'in', 'inform', 'in', 'in']
Number of words before stop word removal: 289, after: 168
Number of words before stemming: 168, after: 168
Number of words before lemmatizing: 168, after: 168
Abbreviated words: []
Words with special symbols: ["'s", '(', ')', '.', ',', '.', ':', '``', "'re", '.', "''", ',', '.', ',', '.', '

File: 062.txt - Number of tokens: 252
Words ending with 'ed': ['US-based', 'selected', 'assured', 'confirmed', 'ranked', 'led', 'entered']
Words ending with 'ing': ['competing', 'winning', 'ranking', 'occupying', 'coming', 'being', 'evening', 'running', 'holding']
Words ending with 'es': ['athletes', 'James', 'nine-times']
Words ending with 's': ["'s", "'s", 'hopefuls', 'this', "'s", "'s", 'Championships', 'Europeans', 'his', 'trials', 'his', 'has', 'tremendous', 'weeks', 'is', 'runners', 'this', 'seems', 'Championships', 'is', 'Cross', 'Championships', 'crisis', 'Striders', "'s", 'as', 'as']
Words ending with 'un': ['unlikely']
Words ending with 'in': ['in', 'in', 'in', 'in', 'in', 'in', 'in', 'in', 'in', 'in', 'in', 'in', 'in', 'in', 'injury', 'in', 'in', 'individual']
Number of words before stop word removal: 252, after: 158
Number of words before stemming: 158, after: 158
Number of words before lemmatizing: 158, after: 158
Abbreviated words: []
Words with special symbols: ["'s", "'

File: 072.txt - Number of tokens: 633
Words ending with 'ed': ['managed', 'admitted', 'wanted', 'distracted', 'pressurised', 'managed', 'managed', 'talked', 'need', 'used', 'concerned', 'coped']
Words ending with 'ing': ['Running', 'training', 'sporting', 'taking', 'running', 'running', 'entertaining', 'running', 'running', 'evening', 'shattering', 'evening', 'eating', 'exercising', 'missing', 'training', 'heading', 'exercising', 'running', 'cushioning', 'gearing', 'running', 'jarring', 'feeling', 'writing', 'training', 'raising', 'rowing']
Words ending with 'es': ['duties', 'Holmes', 'athletes', 'lives', 'times', 'sites', 'miles', 'Hospices']
Words ending with 's': ['Olympics', 'was', 'as', "'s", "'s", 'was', 'Fredericks', 'is', 'was', 'intentions', 'was', 'has', 'was', "'s", 'sorts', 'things', 'days', 'was', 'was', 'presentations', 'was', 'was', 'was', 'presentations', "'s", 'was', 'runs', 'is', 'as', 'outdoors', "'s", 'Olympics', 'is', 'was', 'was', 'was', 'This', 'proceeds', 'his',

File: 091.txt - Number of tokens: 320
Words ending with 'ed': ['pushed', 'postponed', 'pushed', 'expected', 'charged', 'asked', 'dismissed', 'gathered', 'arrested', 'expected', 'accused', 'United', 'banned', 'banned', 'discovered', 'filed']
Words ending with 'ing': ['hearing', 'distributing', 'hearing', 'hearing', 'during', 'Anti-Doping', 'being', 'during', 'serving', 'testing', 'following', 'performance-enhancing']
Words ending with 'es': ['James', 'athletes', 'leagues', 'offices', 'agrees', 'States', 'Jones']
Words ending with 's': ['has', 'was', 'has', 'is', 'steroids', "'s", 'clients', 'Bonds', 'stars', 'focus', 'raids', "'s", "'s", 'agents', 'statements', 'interviews', 'raids', 'has', 'basis', 'materials', "'s", 'Chambers', 'is', 'has', 'his', 'allegations', 'drugs']
Words ending with 'un': ['until']
Words ending with 'in': ['in', 'include', 'inquiry', 'into', 'in', 'in', 'in', 'investigation', 'in', 'in']
Number of words before stop word removal: 320, after: 205
Number of words b

File: 104.txt - Number of tokens: 706
Words ending with 'ed': ['volleyed', 'scored', 'watched', 'scored', 'connected', 'smashed', 'stunned', 'limited', 'well-organised', 'blocked', 'halted', 'reached', 'pegged', 'needed', 'fashioned', 'fed', 'forced', 'denied', 'deflected', 'prompted', 'pressed', 'saved', 'headed', 'appeared', 'netted', 'Used', 'Used']
Words ending with 'ing': ['taunting', 'amazing', 'sending', 'putting', 'hushing', 'opening', 'stunning', 'attacking', 'threatening', 'promising', 'living', 'bring', 'remaining', 'picking', 'sidefooting', 'remaining']
Words ending with 'es': ['Blues', 'Morientes', 'Blues', 'losses', 'Blues', 'chances', 'scares', 'Blues', 'minutes', 'Blues', 'minutes', 'minutes', 'minutes', 'minutes', 'minutes', 'Morientes']
Words ending with 's': ['mins', 'seconds', 'boss', 'was', 'fans', 'as', 'his', "'s", 'was', 'his', 'as', 'was', "'s", 'his', 'his', 'lips', 'fans', 'was', 'Reds', 'cross', 'as', 'previous', 'was', "'s", "'s", "'s", 'was', 'Reds', 'Neve

File: 113.txt - Number of tokens: 282
Words ending with 'ed': ['revealed', 'expected', 'revealed', 'asked', 'replied', 'impressed', 'added', 'talented', 'left-footed']
Words ending with 'ing': ['playing', 'going', 'playing', 'seeing', 'Downing', 'Downing', 'doing', 'scoring', 'Downing', 'making']
Words ending with 'es': ['Wes', 'yes', 'injuries', 'deserves']
Words ending with 's': ['Wright-Phillips', 'has', 'Wright-Phillips', 'Netherlands', 'Wright-Phillips', 'his', 'as', 'Wright-Phillips', 'Wright-Phillips', "'s", 'defenders', "'s", 'this', "'s", 'Wright-Phillips', 'his', 'has', 'players', 'impress', 'Wright-Phillips', 'was', 'plans', 'is', 'goals', 'is', 'is', 'players']
Words ending with 'un': []
Words ending with 'in': ['in', 'injury-hit', 'in', 'in', 'in']
Number of words before stop word removal: 282, after: 170
Number of words before stemming: 170, after: 170
Number of words before lemmatizing: 170, after: 170
Abbreviated words: []
Words with special symbols: ['Wright-Phillips',

File: 123.txt - Number of tokens: 246
Words ending with 'ed': ['deepened', 'Injured', 'pulled', 'pulled', 'bruised', 'replaced', 'United', 'played', 'called', 'enforced', 'forced', 'assessed', 'decided', 'summoned', 'pulled']
Words ending with 'ing': ['following', 'King', 'leaving', 'following', 'hamstring', 'King', 'having']
Words ending with 'es': ['worries', 'Wes', 'appearances']
Words ending with 's': ["'s", 'crisis', 'grows', "'s", "'s", "'s", "'s", 'has', 'as', 'has', 'is', "'s", 'was', 'as', 'has', 'looks', 'his', 'was', 'his', 'was', "'s", 'has']
Words ending with 'un': []
Words ending with 'in': ['in', 'in', 'in', 'into', 'injury', 'injury', 'injury']
Number of words before stop word removal: 246, after: 151
Number of words before stemming: 151, after: 151
Number of words before lemmatizing: 151, after: 151
Abbreviated words: []
Words with special symbols: ["'s", "'s", "'s", '.', "'s", ',', 'Sven-Goran', "'s", '.', ',', '.', '.', ',', ',', '.', ',', '.', "'s", '.', '25-year-ol

File: 140.txt - Number of tokens: 445
Words ending with 'ed': ['ridiculed', 'branded', 'expected', 'poised', 'alleged', 'alleged', 'dismissed', 'lied', 'witnessed', 'greeted', 'apologised', 'witnessed', 'locked', 'allowed', 'Indeed', 'added', 'happened', 'concerned', 'failed', 'expected', 'alleged', 'managed', 'agreed', 'added', 'conceded', 'pleased', 'deserved', 'pleased']
Words ending with 'ing': ['during', 'nothing', 'during', 'dressing', 'receiving', 'failing', 'thing', 'weakening', 'talking', 'talking', 'something']
Words ending with 'es': ['Blues', 'themselves']
Words ending with 's': ['has', "'s", 'as', "'s", 'loss', "'s", 'was', 'boss', 'Anders', 'is', 'line-ups', 'this', 'has', 'its', 'says', 'officials', "'s", 'was', 'his', 'assistants', 'is', 'Londoners', 'press', 'Thomas', 'as', 'as', 'is', 'press', 'is', 'incidents', 'his', "'s", 'was', "'s", 'was', 'is', 'is', 'is', 'was', 'his', "'s", 'his', 'was', 'news', 'is', 'has', 'goals', 'this', 'players']
Words ending with 'un': 

File: 157.txt - Number of tokens: 369
Words ending with 'ed': ['dejected', 'claimed', 'turned', 'faced', 'need', 'tried', 'failed', 'conceded', 'helped', 'conceded']
Words ending with 'ing': ['adding', 'crying', 'scoring', 'Everything']
Words ending with 'es': ['gives', 'gives', 'goes', 'sometimes', 'chances']
Words ending with 's': ['as', 'was', 'Champions', "'s", 'Gunners', 'is', 'players', 'us', 'was', 'goals', 'his', 'as', 'is', 'is', 'this', 'nights', 'This', 'was', 'players', 'as', 'as', "'s", 'was', 'players', 'goals', 'us', 'Jens', 'was', 'As', 'always', 'goals', 'us', 'goals', 'happens', 'is']
Words ending with 'un': []
Words ending with 'in': ['in', 'in', 'in', 'in', 'in', 'in', 'in']
Number of words before stop word removal: 369, after: 225
Number of words before stemming: 225, after: 225
Number of words before lemmatizing: 225, after: 225
Abbreviated words: []
Words with special symbols: ['3-1', '``', "''", '.', "'s", ',', ':', '``', '.', '``', 'dressing-room', '.', '.', '`

Number of words before stop word removal: 154, after: 92
Number of words before stemming: 92, after: 92
Number of words before lemmatizing: 92, after: 92
Abbreviated words: []
Words with special symbols: [',', '.', ',', ',', '``', "''", '.', "'s", '3-1', '.', '``', ',', "''", '.', ':', '``', '.', '``', '.', "''", '.']
File: 176.txt - Number of tokens: 365
Words ending with 'ed': ['praised', 'worked', 'moved', 'battled', 'added', 'helped', 'sympathised']
Words ending with 'ing': ['king', 'struggling', 'evening', 'something', 'finding']
Words ending with 'es': ['Les', 'believes', 'articles']
Words ending with 's': ['hails', 'boss', 'has', 'his', 'this', 'his', "'s", 'is', 'knows', 'is', 'has', 'is', 'Leeds', 'has', 'fitness', 'goals', 'trails', 'is', 'has', "'s", 'goals', 'boss', 'afterwards', 'His', 'is', 'is', 'is', 'as', 'as']
Words ending with 'un': []
Words ending with 'in': ['in', 'in', 'in', 'into', 'in', 'in']
Number of words before stop word removal: 365, after: 213
Number of wo

File: 190.txt - Number of tokens: 287
Words ending with 'ed': ['humbled', 'changed', 'divided', 'hosted', 'crowned']
Words ending with 'ing': ['embarrassing']
Words ending with 'es': ['failures', 'Seychelles', 'Seychelles', 'Seychelles', 'matches', 'Seychelles', 'Seychelles', 'advances', 'comprises', 'Seychelles']
Words ending with 's': ['Mauritius', 'seeds', 'minnows', 'Mauritius', 'his', 'has', 'this', 'participants', 'teams', 'days', 'champions', 'Mauritius', 'opponents', 'Mauritius', 'winners', "'s", 'hosts', 'champions', 'winners', 'holders', 'mini-tournaments', 'winners', 'Mauritius', 'Winners', 'Winners', 'Winners']
Words ending with 'un': []
Words ending with 'in': ['in', 'in', 'in', 'in', 'into', 'in', 'in', 'in', 'in', 'in', 'in', 'in', 'in', 'in']
Number of words before stop word removal: 287, after: 185
Number of words before stemming: 185, after: 185
Number of words before lemmatizing: 185, after: 185
Abbreviated words: []
Words with special symbols: ['.', '2-0', '.', 'new

File: 207.txt - Number of tokens: 467
Words ending with 'ed': ['denied', 'resigned', 'undermined', 'talked', 'caused', 'added', 'countered', 'denied', 'based', 'started', 'surprised', 'caused', 'worked', 'organised', 'disciplined', 'happened']
Words ending with 'ing': ['sporting', 'thing', 'timing', 'coming', 'timing', 'sporting', 'having', 'sporting', 'working', 'something', 'doing', 'forward-thinking', 'being', 'conceding']
Words ending with 'es': ['denies', 'Jacques', 'personalities', 'games', 'Jacques', 'comes', 'sees', 'principles']
Words ending with 's': ['has', "'s", 'is', 'is', 'is', 'his', 'problems', 'has', 'this', 'is', 'was', 'problems', 'has', 'was', 'is', 'talks', 'less', 'hours', "'s", 'problems', 'reports', 'is', 'is', 'Spurs', "'s", 'is', 'this', 'Spurs', 'Stevens', 'problems', 'Stevens', 'was', 'whereas', 'was', 'goals', 'ideals', 'seems', 'has']
Words ending with 'un': ['unable']
Words ending with 'in': ['in', 'into', 'in', 'in', 'in', 'in']
Number of words before st

File: 220.txt - Number of tokens: 158
Words ending with 'ed': ['interested', 'delighted', 'subjected', 'happened']
Words ending with 'ing': ['beating', 'being', 'going', 'thing', 'pulling', 'taking']
Words ending with 'es': ['sides']
Words ending with 's': ['was', 'was', 'clubs', 'was', "'s", 'us', 'goals', "'s", 'was', 'players', 'was', 'fans', 'this', 'is', "'s"]
Words ending with 'un': []
Words ending with 'in': ['in', 'in', 'in']
Number of words before stop word removal: 158, after: 86
Number of words before stemming: 86, after: 86
Number of words before lemmatizing: 86, after: 86
Abbreviated words: []
Words with special symbols: ['two-goal', '.', '4-0', ':', '``', '.', '``', '.', '``', "'s", '.', "'m", "'s", ',', '.', "''", ',', ':', '``', '.', '``', "'s", "'ve", "n't", '.', "''"]
File: 221.txt - Number of tokens: 251
Words ending with 'ed': ['considered', 'failed', 'involved']
Words ending with 'ing': ['winning', 'Evening', 'looking', 'playing', 'winning', 'staying', 'trying', 'l

Number of words before lemmatizing: 527, after: 527
Abbreviated words: []
Words with special symbols: ['.', "'s", 'long-running', '``', "''", '.', ',', ':', '``', '.', "''", '.', 'Ferguson-Wenger', ',', ',', '.', '2-0', ',', ',', "'s", '.', '.', '``', ',', ',', ',', "''", '.', '``', "'what", '?', "'", '``', '.', "'s", ',', "n't", ',', "'s", '.', "''", "'s", ',', '.', ':', '``', "'ve", '.', '``', ',', '.', ',', '.', '``', "n't", "n't", '.', '.', '``', '.', '.', "''", ',', ',', '.', '``', ',', "''", '.', '``', "n't", '(', ')', '.', '``', '(', ')', '.', '``', '.', '.', '``', '.', "n't", '.', '.', '...', '.', "''", '``', "''", ',', ':', '``', "n't", '.', '.', "'bringing", "'", '.', '``', ',', '.', "''", 'vice-chairman', '.', ':', '``', '.', '.', '``', '.', '``', '.', "''", ',', 'on-going', '.', ',', "'s", '.', '``', "''", 'Â£15,000', '``', "''", '.', "'s", ',', ',', '``', '(', ')', "''", ',', '.', '``', ',', "''", '.', '``', ',', "n't", '.', ',', "n't", '.', '``', "'s", '.', '.', '``', ','

File: 251.txt - Number of tokens: 681
Words ending with 'ed': ['completed', 'United', 'sealed', 'delighted', 'Geneva-based', 'revealed', 'added', 'United', 'United', 'succeeded', 'worked', 'United', 'revealed', 'asked', 'undisclosed', 'agreed', 'joined', 'refused', 'involved', 'used', 'stepped', 'rejected', 'failed', 'revealed', 'spared']
Words ending with 'ing': ['stepping', 'everything', 'training', 'going', 'remaining', 'firefighting', 'running', 'Outgoing', 'reducing', 'ongoing', 'incoming', 'outgoing', 'going', 'stretching', 'following']
Words ending with 'es': ['Bates', 'Bates', 'times', 'Bates', 'Bates', 'ensures', 'Bates', 'Bates', 'Bates', 'Bates', 'finances', 'figures', 'cheques', 'figures', 'Bates', 'Bates']
Words ending with 's': ['seals', 'has', 'his', 'Leeds', 'has', 'Leeds', 'as', 'has', 'belongs', 'fans', 'his', 'as', 'Sports', 'his', 'is', 'Leeds', "'s", 'is', 'creditors', 'is', 'has', 'Leeds', 'This', 'proposals', 'Leeds', 'Leeds', 'has', 'problems', 'Leeds', 'problem

File: 268.txt - Number of tokens: 681
Words ending with 'ed': ['proved', 'lashed', 'scored', 'blasted', 'placed', 'caused', 'missed', 'headed', 'blasted', 'cleared', 'played', 'shrugged', 'planted', 'approached', 'teased', 'failed', 'closed', 'slipped', 'played', 'checked', 'interested', 'returned', 'needed', 'sniffed', 'needed', 'backed', 'stretched', 'escaped', 'palmed', 'used', 'used']
Words ending with 'ing': ['marking', 'saving', 'playing', 'having', 'handling', 'cling', 'growing', 'struggling']
Words ending with 'es': ['rescues', 'chances', 'bodies', 'stages', 'minutes', 'minutes', 'minutes', 'inches', 'minutes', 'minutes', 'reflexes', 'minutes', 'Pires']
Words ending with 's': ['his', 'months', 'points', 'his', 'was', 'yards', 'as', 'Primus', 'as', "'s", 'was', 'his', 'hearts', 'his', 'as', 'press', 'pass', 'was', 'anxious', 'seconds', 'as', 'his', 'yards', 'As', 'players', 'as', 'was', "'s", 'pass', "'s", 'was', 'weeks', "'s", 'as', 'scraps', 'his', 'as', 'ferocious', 'his', "'

Number of words before stop word removal: 467, after: 271
Number of words before stemming: 271, after: 271
Number of words before lemmatizing: 271, after: 271
Abbreviated words: []
Words with special symbols: ['Fly-half', "'s", '18-17', '.', '.', '``', "'m", ',', "''", '.', '``', "'s", '.', "'s", "n't", '.', "''", '``', "''", ',', '.', '``', "n't", ',', "''", '.', '``', '.', "'s", ',', '.', "''", '``', "''", '.', ',', '.', '``', '?', "''", '.', '``', '.', '``', "'s", '.', ',', "n't", '.', '``', ',', '.', '``', '.', ',', "'ve", '.', "''", 'half-time', '17-6', '.', "'s", 'second-half', '.', '``', 'half-time', ',', "''", '.', '``', '.', '.', '``', "'s", '.', ',', '.', "''"]
File: 285.txt - Number of tokens: 429
Words ending with 'ed': ['ruled', 'injured', 'joined', 'returned', 'followed', 'fractured', 'recalled', 'suffered', 'ruled', 'involved', 'excelled', 'called', 'played', 'altered', 'endured', 'limped', 'boosted']
Words ending with 'ing': ['breaking', 'opening', 'remaining', 'trainin

File: 300.txt - Number of tokens: 327
Words ending with 'ed': ['added', 'sidelined', 'confirmed', 'travelled', 'inexperienced', 'uncapped', 'drafted', 'started', 'limped', 'missed', 'expected', 'Uncapped', 'named', 'strengthened']
Words ending with 'ing': ['starting', 'training', 'kicking', 'during', 'training']
Words ending with 'es': ['Wales', 'penalties', 'rates']
Words ending with 's': ['has', "'s", 'Nations', 'weeks', 'his', 'his', 'sights', 'as', 'Sharks', 'his', 'favours', 'is', 'was', 'tryscorers', 'days', 'has', "'s", 'was', "'s", 'is', 'his', "'s", 'obvious', 'as', 'has', 'his', 'claims', 'points', "'s", "'s", 'players']
Words ending with 'un': []
Words ending with 'in': ['in', 'injury', 'international', 'in', 'into', 'in', 'injury', 'in', 'inside', 'in-form', 'in']
Number of words before stop word removal: 327, after: 213
Number of words before stemming: 213, after: 213
Number of words before lemmatizing: 213, after: 213
Abbreviated words: []
Words with special symbols: ["'s

File: 312.txt - Number of tokens: 829
Words ending with 'ed': ['secured', 'sealed', 'added', 'maintained', 'scored', 'fielded', 'danced', 'appeared', 'switched', 'retained', 'slotted', 'threatened', 'failed', 'attempted', 'charged', 'snaffled', 'hared', 'bounced', 'regained', 'jumped', 'ended', 'off-loaded', 'showed', 'landed', 'threatened', 'sparked', 'finished', 'angled', 'suffered']
Words ending with 'ing': ['Starting', 'opening', 'wing', 'onrushing', 'lurking', 'kicking', 'sporting', 'stuttering', 'turning', 'claiming', 'lobbing', 'breathing', 'streaking', 'padding', 'adding', 'converting', 'sending', 'icing', 'combining', 'hamstring', 'looking']
Words ending with 'es': ['Wales', 'Wales', 'Tries', 'Jones', 'Wales', 'minutes', 'Jones', 'duties', 'Wales', 'scores', 'Wales', 'Jones', 'Wales', 'themselves', 'Jones', 'Wales', 'tries', 'minutes', 'minutes', 'Jones', 'Wales', 'Wales', 'Jones', 'Davies', 'Jones']
Words ending with 's': ['Nations', 'years', 'Thomas', 'Williams', 'visitors',

File: 326.txt - Number of tokens: 338
Words ending with 'ed': ['rubbished', 'need', 'proved', 'need', 'demoted']
Words ending with 'ing': ['striving', 'disappointing', 'fighting', 'outstanding', 'kicking', 'working', 'doing']
Words ending with 'es': ['Wales', 'does', 'themselves', 'believes', 'performances', 'Wales', 'Wales', 'games']
Words ending with 's': ['answers', 'critics', 'has', 'suggestions', 'champions', 'Nations', 'champions', 'players', 'points', 'this', 'was', 'this', 'words', 'his', 'has', 'is', 'has', 'is']
Words ending with 'un': ['under-perform']
Words ending with 'in': ['in', 'in', 'in', 'in', 'in', 'in', 'international', 'in', 'in']
Number of words before stop word removal: 338, after: 186
Number of words before stemming: 186, after: 186
Number of words before lemmatizing: 186, after: 186
Abbreviated words: []
Words with special symbols: ['.', '11-9', '.', ':', '``', '.', "n't", '.', '``', "'ve", '.', '.', "''", '.', 'full-back', 'new-look', '.', '``', ',', "''", '.'

File: 338.txt - Number of tokens: 532
Words ending with 'ed': ['declared', 'infected', 'suffered', 'infected', 'added', 'hoped', 'forced', 'injured', 'confirmed', 'missed', 'disappointed', 'responded', 'missed', 'added', 'improved']
Words ending with 'ing': ['joining', 'playing', 'playing', 'playing', 'during', 'outing', 'starting', 'coming', 'frustrating', 'winning', 'coming', 'goalkicking']
Words ending with 'es': ['Jones', 'chances', 'appearances', 'penalties', 'circumstances', 'Wales', 'does']
Words ending with 's': ['joins', 'Lewis', 'has', "'s", 'Nations', 'has', 'his', 'as', 'fails', 'his', 'Chris', 'is', 'was', 'antibiotics', 'his', 'as', 'has', 'antibiotics', 'has', 'Stevens', 'has', 'starts', 'his', 'has', 'this', 'Stevens', 'boys', 'years', 'calls', 'Stevens', 'his', 'as', 'has', 'his', "'s", 'is', 'his', 'this', 'admits', 'his', 'partnerships', 'line-outs', 'us', 'has', 'was', 'us']
Words ending with 'un': []
Words ending with 'in': ['infection', 'in', 'in', 'infection', 'i

File: 353.txt - Number of tokens: 291
Words ending with 'ed': ['worsened', 'ruled', 'missed', 'added', 'trained']
Words ending with 'ing': ['opening', 'damaging', 'during', 'ruling', 'damaging', 'suffering', 'training', 'coming', 'awaiting']
Words ending with 'es': ['matches', 'sidelines', 'games', 'James', 'knees', 'opportunities', 'negatives', 'Tykes']
Words ending with 's': ['Scots', "'s", 'crisis', 'has', 'Nations', 'news', 'miss', 'Borders', 'has', 'joins', 'miss', 'his', 'ligaments', 'Paris', 'his', 'was', 'has', 'ligaments', "'s", 'has', 'ligaments', 'this', "'s", 'sessions', 'players', 'represents', 'Williams', 'Williams', 'Cross', 'Leeds', 'is', 'his']
Words ending with 'un': []
Words ending with 'in': ['injury', 'injury', 'in', 'in', 'in', 'in']
Number of words before stop word removal: 291, after: 191
Number of words before stemming: 191, after: 191
Number of words before lemmatizing: 191, after: 191
Abbreviated words: []
Words with special symbols: ["'s", '.', '.', '.', '('

File: 367.txt - Number of tokens: 461
Words ending with 'ed': ['announced', 'captained', 'revered', 'led', 'ruled', 'captained', 'honoured']
Words ending with 'ing': ['leading', 'playing', 'playing', 'captaining', 'guiding', 'intimidating', 'aiming']
Words ending with 'es': ['announces', 'times', 'does', 'victories', 'titles']
Words ending with 's': ['has', 'is', 'his', 'caps', 'months', "'s", "'s", 'his', 'was', 'Tigers', 'is', 'this', 'is', 'is', 'reasons', 'always', 'fans', 'his', 'achievements', 'is', 'Lions', 'Nations', 'success', 'as', 'Tigers', 'his', 'marvellous', 'was', 'rows', 'rucks', 'mauls', 'was', "'s", 'His', 'his', 'is', 'expects', 'This', "'s", 'his', 'exploits', 'was', "'s", 'is']
Words ending with 'un': ['union', 'union']
Words ending with 'in': ['in', 'in', 'in', 'in', 'in', 'in', 'integral', 'in', 'in', 'in', 'in']
Number of words before stop word removal: 461, after: 268
Number of words before stemming: 268, after: 268
Number of words before lemmatizing: 268, afte

File: 377.txt - Number of tokens: 734
Words ending with 'ed': ['scored', 'claimed', 'poached', 'helped', 'extended', 'entered', 'led', 'posed', 'pressed', 'played', 'adjudged', 'continued', 'showed', 'led', 'charged', 'pounced', 'replied', 'slotted', 'sustained', 'produced', 'culminated', 'remained', 'Sustained', 'rewarded', 'responded', 'suffered', 'sin-binned', 'punished', 'frustrated', 'wasted', 'enabled', 'extended', 'closed', 'produced', 'deserved']
Words ending with 'ing': ['searching', 'opting', 'denying', 'slotting', 'killing', 'notching', 'remaining', 'closing']
Words ending with 'es': ['penalties', 'penalties', 'penalties', 'minutes', 'minutes', 'Hayes']
Words ending with 's': ["'s", 'points', 'as', "'s", "'s", 'as', 'famous', 'was', 'Africans', 'Denis', 'was', "'s", 'pass', 'was', 'Honiss', 'press', 'points', "'s", 'points', 'Springboks', 'as', 'his', 'Honiss', 'his', 'players', 'infringements', 'Africans', 'seconds', "'s", 'Springboks', "'s", 'Africans', 'Springboks', "'s",

File: 389.txt - Number of tokens: 253
Words ending with 'ed': ['based', 'called', 'toured', 'forced', 'prolonged', 'witnessed', 'need', 'need']
Words ending with 'ing': ['eyeing', 'concentrating', 'playing']
Words ending with 'es': ['hopes', 'takes', 'does']
Words ending with 's': ['Lions', 'harbours', 'Lions', 'Lions', 'has', 'his', 'Nations', 'this', 'Lions', 'was', 'years', 'his', 'Tests', 'quarters', 'players', 'feels', 'has']
Words ending with 'un': []
Words ending with 'in': ['in', 'in', 'injury', 'international', 'in', 'in', 'international']
Number of words before stop word removal: 253, after: 158
Number of words before stemming: 158, after: 158
Number of words before lemmatizing: 158, after: 158
Abbreviated words: []
Words with special symbols: ['.', '.', ',', ',', ':', '``', "n't", '.', '``', ',', "''", '.', '.', '32-year-old', 'shelf-life', '.', ':', '``', "'ll", '.', '``', ',', "n't", ',', '.', '``', "'m", ',', "'m", '.', '``', ',', '.', ',', "n't", ',', '?', "''"]
File: 39

File: 403.txt - Number of tokens: 665
Words ending with 'ed': ['heralded', 'dumped', 'ruled', 'inspired', 'dogged', 'clinched', 'struggled', 'lifted', 'dedicated']
Words ending with 'ing': ['enduring', 'enduring', 'teething', 'bedding', 'blossoming', 'kicking', 'leading', 'exciting', 'buzzing', 'shedding', 'joining', 'lacking', 'reigning', 'winning']
Words ending with 'es': ['Wales', 'Wales', 'Wales', 'matches', 'losses', 'Wales', 'matches', 'games', 'Wales', 'Jones', 'Wales', 'times', 'Wales', 'Les']
Words ending with 's': ['Celts', 'Nations', 'has', 'this', 'guns', 'weeks', 'knows', 'as', 'ridiculous', "'s", 'months', 'as', 'champions', 'winners', 'was', 'has', 'stars', 'is', 'problems', 'his', 'Nations', 'was', 'success', 'Edwards', 'Williams', 'fans', 'tenterhooks', 'dawns', 'expectations', 'is', 'Thomas', 'Williams', 'runners', 'Williams', 'has', 'years', 'Nations', 'years', 'As', 'this', 'winners', 'Nations', 'scalps', 'is', 'is', 'materials', 'champions', 'meetings', 'romantics'

Number of words before lemmatizing: 539, after: 539
Abbreviated words: []
Words with special symbols: ['17-18', 'scrum-half', '.', '17-6', 'half-time', '.', ',', '.', ',', '.', ',', '.', "'s", ',', '.', '.', '.', ',', '.', ',', '.', 'tit-for-tat', ',', ',', 'line-out', '.', ',', '.', "'s", '.', '10-3', '.', 'fly-half', 'full-back', ',', ',', '.', 'longer-range', ',', '.', 'off-loaded', "'s", '.', '.', ',', ',', '.', '.', '-', '-', 'scrum-half', "'s", '.', 'long-range', '.', ',', '.', ',', "'s", ',', '.', ',', 'drop-goal', '.', ',', '.', '(', ')', ';', ',', ',', ',', ';', ',', ';', ',', ',', ';', ',', ';', ',', ',', '.', ',', ',', ',', ',', ',', ',', '.', ';', ',', ',', ',', ';', ',', ';', ',', ',', ';', '(', ')', ',', ',', ',', ',', '.', ',', ',', ',', ',', ',', ',', 'J-P', '.', "O'Brien", '(', ')']
File: 415.txt - Number of tokens: 472
Words ending with 'ed': ['targeted', 'missed', 'played', 'wasted', 'missed', 'added', 'blamed', 'proved', 'introduced', 'added', 'need']
Words ending w

File: 432.txt - Number of tokens: 302
Words ending with 'ed': ['criticised', 'disputed', 'denied', 'added', 'moved', 'added']
Words ending with 'ing': ['Reacting', 'insulting', 'insulting', 'insisting', 'talking', 'speaking', 'following', 'offering', 'playing', 'going', 'exciting', 'exciting', 'bringing']
Words ending with 'es': ['does']
Words ending with 's': ['hits', 'has', 'Phillips', 'is', 'is', 'was', 'Phillips', 'flowers', "'s", 'Phillips', 'was', 'was', 'humorous', 'grounds', 'was', 'this', "'s", 'Opens', 'sets', "'s", "'s", 'minds', 'Williams', 'is', "'s", 'tennis', 'is', "'s", 'tennis', 'is', 'as', 'spectators', 'is']
Words ending with 'un': []
Words ending with 'in': ['interview', 'in', 'in', 'in']
Number of words before stop word removal: 302, after: 196
Number of words before stemming: 196, after: 196
Number of words before lemmatizing: 196, after: 196
Abbreviated words: []
Words with special symbols: ['.', ',', ':', '``', '.', '``', ',', ',', "n't", '.', "'s", '.', "''", '

File: 450.txt - Number of tokens: 357
Words ending with 'ed': ['brushed', 'strolled', 'shocked', 'seed', 'rattled', 'recovered', 'raced', 'loosened', 'lifted', 'played', 'short-priced', 'ranked', 'forced', 'played', 'wanted', 'need']
Words ending with 'ing': ['defending', 'getting', 'dropping', 'qualifying', 'qualifying']
Words ending with 'es': ['overcomes', 'games', 'times', 'times', 'times', 'takes']
Words ending with 's': ["'s", 'was', 'as', 'was', 'was', 'swings', 'blocks', 'was', 'has', 'years', 'keeps', 'this', 'as', 'his', 'is', 'was', 'his', 'this', 'was', 'success', 'was', 'nervous', "'s", 'this', 'is', 'points', "'s"]
Words ending with 'un': []
Words ending with 'in': ['in', 'in', 'in', 'in', 'into', 'in', 'in', 'in', 'in', 'in']
Number of words before stop word removal: 357, after: 219
Number of words before stemming: 219, after: 219
Number of words before lemmatizing: 219, after: 219
Abbreviated words: []
Words with special symbols: ["'s", '.', '7-5', '6-0', '.', '.', '.',

File: 469.txt - Number of tokens: 530
Words ending with 'ed': ['described', 'focused', 'lived', 'nicknamed', 'admitted', 'obsessed', 'obsessed', 'played', 'Aged', 'frustrated']
Words ending with 'ing': ['missing', 'beating', 'dreaming', 'winning', 'something', 'waiting', 'opening', 'going', 'playing', 'losing']
Words ending with 'es': ['times', 'singles', 'singles', 'chances']
Words ending with 's': ['Davis', 'Carlos', "'s", 'Davis', 'as', 'his', "'s", "'s", 'hosts', 'nights', 'this', 'has', "'s", 'Davis', 'years', 'Davis', 'is', 'was', 'nervous', 'is', 'this', 'helps', 'goals', "'s", 'is', 'was', 'his', 'has', 'us', 'this', 'was', 'his', 'years', 'this', "'s", 'was', 'years', 'days', 'Davis', 'afterwards', 'wants', 'his', 'tennis', 'skills', 'guys', 'courts', 'things', "'s", 'events', 'guys', 'this', 'was', 'his', "'s", "'s", 'was', 'clay-courters', 'is', 'us', 'this', 'business', 'us', "'s", 'as', 'as']
Words ending with 'un': ['unassailable', 'unrealistic']
Words ending with 'in': [

Number of words before stop word removal: 328, after: 189
Number of words before stemming: 189, after: 189
Number of words before lemmatizing: 189, after: 189
Abbreviated words: []
Words with special symbols: ['18-month', '.', '.', '``', 're-hire', ',', "''", '.', '``', "'s", '.', "''", ',', '.', ',', '.', ',', '.', ',', ',', '.', 'first-round', '.', '.', '.', '``', ',', "''", '.', '``', '.', '``', ',', '.', "''"]
File: 482.txt - Number of tokens: 204
Words ending with 'ed': ['ended', 'added', 'worked', 'linked']
Words ending with 'ing': ['confirming', 'reading', 'looking', 'Speaking', 'being']
Words ending with 'es': ['qualities', 'bases']
Words ending with 's': ['talks', 'is', 'Davis', 'as', 'his', 'his', 'admits', 'talks', 'terms', 'wants', 'is', "'s", "'s", 'his', 'is', 'wants', 'his', 'wants', 'wants', "'s", 'has', "'s", 'was', 'is']
Words ending with 'un': []
Words ending with 'in': ['in', 'in', 'in', 'in']
Number of words before stop word removal: 204, after: 126
Number of words

File: 493.txt - Number of tokens: 736
Words ending with 'ed': ['epitomised', 'marked', 'United', 'ended', 'rejected', 'required', 'admitted', 'played', 'moulded']
Words ending with 'ing': ['rallying', 'asking', 'spending', 'teaching', 'glowing', 'taking', 'finding', 'something', 'seeing', 'going', 'going', 'going', 'going', 'training', 'nothing', 'thing', 'something', 'nothing']
Words ending with 'es': ['hopes', 'qualities', 'hopes', 'services', 'goes', 'coaches', 'States', 'does']
Words ending with 's': ['Connors', 'tennis', 'guts', 'questions', 'Connors', "'s", 'tennis', 'months', 'years', 'Tennis', 'days', 'was', 'is', 'hits', "'s", 'Connors', 'was', 'kids', 'Connors', 'is', 'his', 'his', 'has', 'Connors', 'is', 'pains', 'his', 'his', 'has', 'Connors', "'s", 'talks', 'puts', 'days', 'was', 'This', 'is', 'workers', "'s", 'this', 'is', 'his', 'tennis', 'officials', 'discussions', 'Tennis', 'years', "'s", 'Connors', 'is', 'his', 'dealings', "'s", 'players', "'s", "'s", "'s", 'has', 'as

File: 511.txt - Number of tokens: 263
Words ending with 'ed': ['seed', 'seed', 'battled', 'seed', 'twisted', 'admitted', 'rushed', 'revealed', 'stayed']
Words ending with 'ing': ['defending', 'feeling', 'being', 'frustrating', 'anything', 'fighting', 'fighting']
Words ending with 'es': ['chances', 'Sometimes']
Words ending with 's': ["'s", 'Haas', 'was', 'was', 'his', 'points', 'Haas', "'s", 'Haas', 'backhands', "'s", 'his', "'s"]
Words ending with 'un': []
Words ending with 'in': ['in', 'in', 'in', 'in', 'in', 'in', 'into', 'in']
Number of words before stop word removal: 263, after: 162
Number of words before stemming: 162, after: 162
Number of words before lemmatizing: 162, after: 162
Abbreviated words: []
Words with special symbols: ['.', "'s", ',', ',', '7-6', '(', '7-3', ')', '6-3', '.', '``', ',', "''", '.', '``', '.', "''", '6-7', '(', '3-7', ')', '6-3', '6-3', ',', '.', 'first-set', 'tie-break', '.', '4-2', '.', '``', "'s", 'top-five', "n't", ',', "''", '.', '``', '.', "''", "'

In [13]:
import os
import re
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Define the file path for the BBC news dataset
data_dir = r'E:\New folder (2)\bbc\sport'  # Update with your dataset directory

# Tokenization using NLTK word tokenizer
def tokenize_file(file_path):
    with open(file_path, 'r') as file:
        content = file.read()
        tokens = word_tokenize(content)
    return tokens

# Removing stop words
def remove_stopwords(tokens):
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word.lower() not in stop_words]
    return filtered_tokens

# Stemming words
def stem_words(tokens):
    stemmer = PorterStemmer()
    stemmed_tokens = [stemmer.stem(word) for word in tokens]
    return stemmed_tokens

# Lemmatizing words
def lemmatize_words(tokens):
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return lemmatized_tokens

# Process each file in the dataset
results = []

for filename in os.listdir(data_dir):
    if filename.endswith('.txt'):
        file_path = os.path.join(data_dir, filename)
        tokens_before = tokenize_file(file_path)
        num_words_before = len(tokens_before)

        # Removing stop words
        tokens_after_stopwords = remove_stopwords(tokens_before)
        num_words_after_stopwords = len(tokens_after_stopwords)

        # Stemming words
        stemmed_tokens = stem_words(tokens_after_stopwords)
        num_words_after_stemming = len(stemmed_tokens)

        # Lemmatizing words
        lemmatized_tokens = lemmatize_words(tokens_after_stopwords)
        num_words_after_lemmatizing = len(lemmatized_tokens)

        # Store the results
        results.append({
            'File': filename,
            'Before (Tokenization)': num_words_before,
            'After (Stopwords)': num_words_after_stopwords,
            'After (Stemming)': num_words_after_stemming,
            'After (Lemmatizing)': num_words_after_lemmatizing
        })

# Convert the results to a DataFrame
results_df = pd.DataFrame(results)

# Print the DataFrame
print(results_df)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\win10\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\win10\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\win10\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


        File  Before (Tokenization)  After (Stopwords)  After (Stemming)  \
0    001.txt                    235                150               150   
1    002.txt                    158                106               106   
2    003.txt                    435                252               252   
3    004.txt                    226                139               139   
4    005.txt                    192                131               131   
..       ...                    ...                ...               ...   
506  507.txt                    275                186               186   
507  508.txt                    153                112               112   
508  509.txt                    252                167               167   
509  510.txt                    354                212               212   
510  511.txt                    263                162               162   

     After (Lemmatizing)  
0                    150  
1                    106  
2     

In [15]:
import os
import re
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Define the file path for the BBC news dataset
data_dir = r'E:\New folder (2)\bbc\sport'  # Update with your dataset directory

# Tokenization using NLTK word tokenizer
def tokenize_file(file_path):
    with open(file_path, 'r') as file:
        content = file.read()
        tokens = word_tokenize(content)
    return tokens

# Removing stop words
def remove_stopwords(tokens):
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word.lower() not in stop_words]
    return filtered_tokens

# Stemming words
def stem_words(tokens):
    stemmer = PorterStemmer()
    stemmed_tokens = [stemmer.stem(word) for word in tokens]
    return stemmed_tokens

# Lemmatizing words
def lemmatize_words(tokens):
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return lemmatized_tokens

# Process each file in the dataset
results = []

for filename in os.listdir(data_dir):
    if filename.endswith('.txt'):
        file_path = os.path.join(data_dir, filename)
        tokens_before = tokenize_file(file_path)
        num_words_before = len(tokens_before)

        # Removing stop words
        tokens_after_stopwords = remove_stopwords(tokens_before)
        num_words_after_stopwords = len(tokens_after_stopwords)

        # Stemming words
        stemmed_tokens = stem_words(tokens_after_stopwords)
        num_words_after_stemming = len(stemmed_tokens)

        # Lemmatizing words
        lemmatized_tokens = lemmatize_words(tokens_after_stopwords)
        num_words_after_lemmatizing = len(lemmatized_tokens)

        # Store the results
        results.append({
            'File': filename,
            'Before (Tokenization)': num_words_before,
            'After (Stopwords)': num_words_after_stopwords,
            'After (Stemming)': num_words_after_stemming,
            'After (Lemmatizing)': num_words_after_lemmatizing
        })

# Convert the results to a DataFrame
results_df = pd.DataFrame(results)

# Print the DataFrame
print(results_df)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\win10\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\win10\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\win10\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


        File  Before (Tokenization)  After (Stopwords)  After (Stemming)  \
0    001.txt                    235                150               150   
1    002.txt                    158                106               106   
2    003.txt                    435                252               252   
3    004.txt                    226                139               139   
4    005.txt                    192                131               131   
..       ...                    ...                ...               ...   
506  507.txt                    275                186               186   
507  508.txt                    153                112               112   
508  509.txt                    252                167               167   
509  510.txt                    354                212               212   
510  511.txt                    263                162               162   

     After (Lemmatizing)  
0                    150  
1                    106  
2     

In [3]:
import os
import re
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Define the file path for the BBC news dataset
data_dir = r'E:\New folder (2)\bbc\sport'  # Update with your dataset directory

# Tokenization using NLTK word tokenizer
def tokenize_file(file_path):
    with open(file_path, 'r') as file:
        content = file.read()
        tokens = word_tokenize(content)
    return tokens

# Removing stop words
def remove_stopwords(tokens):
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word.lower() not in stop_words]
    return filtered_tokens

# Noise removal
def remove_noise(tokens):
    # Remove punctuation and special characters
    cleaned_tokens = [word for word in tokens if word.isalpha()]
    return cleaned_tokens

# Process abbreviations
def process_abbreviations(tokens):
    abbreviation_mapping = {
        'i.e.': 'that is',
        'e.g.': 'for example',
        'u.s.': 'United States'
    }
    processed_tokens = [abbreviation_mapping[word.lower()] if word.lower() in abbreviation_mapping else word for word in tokens]
    return processed_tokens

# Remove numbers
def remove_numbers(tokens):
    cleaned_tokens = [word for word in tokens if not word.isdigit()]
    return cleaned_tokens

# Process each file in the dataset
results = []

for filename in os.listdir(data_dir):
    if filename.endswith('.txt'):
        file_path = os.path.join(data_dir, filename)
        tokens_before = tokenize_file(file_path)
        num_words_before = len(tokens_before)

        # Removing stop words
        tokens_after_stopwords = remove_stopwords(tokens_before)
        num_words_after_stopwords = len(tokens_after_stopwords)

        # Noise removal
        tokens_after_noise_removal = remove_noise(tokens_after_stopwords)
        num_words_after_noise_removal = len(tokens_after_noise_removal)

        # Process abbreviations
        tokens_after_abbreviations = process_abbreviations(tokens_after_noise_removal)

        # Remove numbers
        tokens_after_number_removal = remove_numbers(tokens_after_abbreviations)
        num_words_after_number_removal = len(tokens_after_number_removal)

        # Store the results
        results.append({
            'File': filename,
            'Before (Tokenization)': num_words_before,
            'After (Stopwords)': num_words_after_stopwords,
            'After (Noise Removal)': num_words_after_noise_removal,
            'After (Abbreviations)': len(tokens_after_abbreviations),
            'After (Number Removal)': num_words_after_number_removal
        })

# Convert the results to a DataFrame
results_df = pd.DataFrame(results)

# Print the DataFrame
print(results_df)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\win10\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\win10\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\win10\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


        File  Before (Tokenization)  After (Stopwords)  After (Noise Removal)  \
0    001.txt                    235                150                    116   
1    002.txt                    158                106                     81   
2    003.txt                    435                252                    182   
3    004.txt                    226                139                    101   
4    005.txt                    192                131                     87   
..       ...                    ...                ...                    ...   
506  507.txt                    275                186                    129   
507  508.txt                    153                112                     84   
508  509.txt                    252                167                    106   
509  510.txt                    354                212                    169   
510  511.txt                    263                162                    102   

     After (Abbreviations) 

In [4]:
import os
import re
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Define the file path for the BBC news dataset
data_dir = r'E:\New folder (2)\bbc\sport'  # Update with your dataset directory

# Tokenization using NLTK word tokenizer
def tokenize_file(file_path):
    with open(file_path, 'r') as file:
        content = file.read()
        tokens = word_tokenize(content)
    return tokens

# Removing stop words
def remove_stopwords(tokens):
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word.lower() not in stop_words]
    removed_stopwords = [word for word in tokens if word.lower() in stop_words]
    return filtered_tokens, removed_stopwords

# Noise removal
def remove_noise(tokens):
    # Remove punctuation and special characters
    cleaned_tokens = [word for word in tokens if word.isalpha()]
    removed_noise = [word for word in tokens if not word.isalpha()]
    return cleaned_tokens, removed_noise

# Process abbreviations
def process_abbreviations(tokens):
    abbreviation_mapping = {
        'i.e.': 'that is',
        'e.g.': 'for example',
        'u.s.': 'United States'
    }
    processed_tokens = [abbreviation_mapping[word.lower()] if word.lower() in abbreviation_mapping else word for word in tokens]
    return processed_tokens

# Remove numbers
def remove_numbers(tokens):
    cleaned_tokens = [word for word in tokens if not word.isdigit()]
    removed_numbers = [word for word in tokens if word.isdigit()]
    return cleaned_tokens, removed_numbers

# Process each file in the dataset
results = []

for filename in os.listdir(data_dir):
    if filename.endswith('.txt'):
        file_path = os.path.join(data_dir, filename)
        tokens_before = tokenize_file(file_path)
        num_words_before = len(tokens_before)

        # Removing stop words
        tokens_after_stopwords, removed_stopwords = remove_stopwords(tokens_before)
        num_words_after_stopwords = len(tokens_after_stopwords)

        # Noise removal
        tokens_after_noise_removal, removed_noise = remove_noise(tokens_after_stopwords)
        num_words_after_noise_removal = len(tokens_after_noise_removal)

        # Process abbreviations
        tokens_after_abbreviations = process_abbreviations(tokens_after_noise_removal)

        # Remove numbers
        tokens_after_number_removal, removed_numbers = remove_numbers(tokens_after_abbreviations)
        num_words_after_number_removal = len(tokens_after_number_removal)

        # Store the results
        results.append({
            'File': filename,
            'Before (Tokenization)': num_words_before,
            'After (Stopwords)': num_words_after_stopwords,
            'After (Noise Removal)': num_words_after_noise_removal,
            'After (Abbreviations)': len(tokens_after_abbreviations),
            'After (Number Removal)': num_words_after_number_removal,
            'Removed Stopwords': removed_stopwords,
            'Removed Noise': removed_noise,
            'Removed Numbers': removed_numbers
        })

# Convert the results to a DataFrame
results_df = pd.DataFrame(results)

# Print the DataFrame
print(results_df)

# Print the words removed from every section
for index, row in results_df.iterrows():
    print(f"File: {row['File']}")
    print(f"Removed Stopwords: {row['Removed Stopwords']}")
    print(f"Removed Noise: {row['Removed Noise']}")
    print(f"Removed Numbers: {row['Removed Numbers']}")
    print("\n")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\win10\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\win10\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\win10\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


        File  Before (Tokenization)  After (Stopwords)  After (Noise Removal)  \
0    001.txt                    235                150                    116   
1    002.txt                    158                106                     81   
2    003.txt                    435                252                    182   
3    004.txt                    226                139                    101   
4    005.txt                    192                131                     87   
..       ...                    ...                ...                    ...   
506  507.txt                    275                186                    129   
507  508.txt                    153                112                     84   
508  509.txt                    252                167                    106   
509  510.txt                    354                212                    169   
510  511.txt                    263                162                    102   

     After (Abbreviations) 

In [6]:
import os
import re
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

nltk.download('punkt')
nltk.download('stopwords')

# Define the file path for the BBC news dataset
data_dir = r'E:\New folder (2)\bbc\sport'  # Update with your dataset directory

# Tokenization using NLTK word tokenizer
def tokenize_file(file_path):
    with open(file_path, 'r') as file:
        content = file.read()
        tokens = word_tokenize(content)
    return tokens

# Removing stop words
def remove_stopwords(tokens):
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word.lower() not in stop_words and not word.isdigit()]
    removed_stopwords = [word for word in tokens if word.lower() in stop_words or word.isdigit()]
    return filtered_tokens, removed_stopwords

# Noise removal
def remove_noise(tokens):
    # Remove punctuation and special characters
    cleaned_tokens = [word for word in tokens if word.isalpha()]
    removed_noise = [word for word in tokens if not word.isalpha()]
    return cleaned_tokens, removed_noise

# Process each file in the dataset
results = []

for filename in os.listdir(data_dir):
    if filename.endswith('.txt'):
        file_path = os.path.join(data_dir, filename)
        tokens_before = tokenize_file(file_path)
        num_words_before = len(tokens_before)

        # Removing stop words
        tokens_after_stopwords, removed_stopwords = remove_stopwords(tokens_before)
        num_words_after_stopwords = len(tokens_after_stopwords)

        # Noise removal
        tokens_after_noise_removal, removed_noise = remove_noise(tokens_after_stopwords)
        num_words_after_noise_removal = len(tokens_after_noise_removal)

        # Store the results
        results.append({
            'File': filename,
            'Before (Tokenization)': num_words_before,
            'After (Stopwords)': num_words_after_stopwords,
            'After (Noise Removal)': num_words_after_noise_removal,
            'Removed Stopwords': removed_stopwords,
            'Removed Noise': removed_noise
        })

# Convert the results to a DataFrame
results_df = pd.DataFrame(results)

# Print the DataFrame
print(results_df)

# Print the words removed from every section
for index, row in results_df.iterrows():
    print(f"File: {row['File']}")
    print(f"Removed Stopwords: {row['Removed Stopwords']}")
    print(f"Removed Noise: {row['Removed Noise']}")
    print("\n")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\win10\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\win10\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


        File  Before (Tokenization)  After (Stopwords)  After (Noise Removal)  \
0    001.txt                    235                150                    116   
1    002.txt                    158                104                     81   
2    003.txt                    435                252                    182   
3    004.txt                    226                138                    101   
4    005.txt                    192                130                     87   
..       ...                    ...                ...                    ...   
506  507.txt                    275                183                    129   
507  508.txt                    153                112                     84   
508  509.txt                    252                167                    106   
509  510.txt                    354                211                    169   
510  511.txt                    263                161                    102   

                           

In [7]:
import os
import re
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

nltk.download('punkt')
nltk.download('stopwords')

# Define the file path for the BBC news dataset
data_dir = r'E:\New folder (2)\bbc\sport'  

# Tokenization using NLTK word tokenizer
def tokenize_file(file_path):
    with open(file_path, 'r') as file:
        content = file.read()
        tokens = word_tokenize(content)
    return tokens

# Removing stop words
def remove_stopwords(tokens):
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word.lower() not in stop_words]
    removed_stopwords = [word for word in tokens if word.lower() in stop_words]
    return filtered_tokens, removed_stopwords

# Noise removal
def remove_noise(tokens):
    # Remove punctuation and special characters
    cleaned_tokens = [word for word in tokens if word.isalpha()]
    removed_noise = [word for word in tokens if not word.isalpha()]
    return cleaned_tokens, removed_noise

# Process each file in the dataset
results = []

for filename in os.listdir(data_dir):
    if filename.endswith('.txt'):
        file_path = os.path.join(data_dir, filename)
        tokens_before = tokenize_file(file_path)
        num_words_before = len(tokens_before)

        # Removing stop words
        tokens_after_stopwords, removed_stopwords = remove_stopwords(tokens_before)
        num_words_after_stopwords = len(tokens_after_stopwords)

        # Noise removal
        tokens_after_noise_removal, removed_noise = remove_noise(tokens_after_stopwords)
        num_words_after_noise_removal = len(tokens_after_noise_removal)

        # Store the results
        results.append({
            'File': filename,
            'Before (Tokenization)': tokens_before,
            'After (Stopwords)': tokens_after_stopwords,
            'After (Noise Removal)': tokens_after_noise_removal,
            'Removed Stopwords': removed_stopwords,
            'Removed Noise': removed_noise
        })

# Convert the results to a DataFrame
results_df = pd.DataFrame(results)

# Print the DataFrame
print(results_df)

# Print the words removed from every section
for index, row in results_df.iterrows():
    print(f"File: {row['File']}")
    print(f"Removed Stopwords: {row['Removed Stopwords']}")
    print(f"Removed Noise: {row['Removed Noise']}")
    print("\n")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\win10\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\win10\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


        File                              Before (Tokenization)  \
0    001.txt  [Claxton, hunting, first, major, medal, Britis...   
1    002.txt  [O'Sullivan, could, run, in, Worlds, Sonia, O'...   
2    003.txt  [Greene, sets, sights, on, world, title, Mauri...   
3    004.txt  [IAAF, launches, fight, against, drugs, The, I...   
4    005.txt  [Dibaba, breaks, 5,000m, world, record, Ethiop...   
..       ...                                                ...   
506  507.txt  [Big, guns, ease, through, in, San, Jose, Top-...   
507  508.txt  [Almagro, continues, Spanish, surge, Unseeded,...   
508  509.txt  [Melzer, shocks, Agassi, in, San, Jose, Second...   
509  510.txt  [Mirza, makes, Indian, tennis, history, Teenag...   
510  511.txt  [Roddick, to, face, Saulnier, in, final, Andy,...   

                                     After (Stopwords)  \
0    [Claxton, hunting, first, major, medal, Britis...   
1    [O'Sullivan, could, run, Worlds, Sonia, O'Sull...   
2    [Greene, sets, s

In [11]:
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')

# Get the list of stopwords
stop_words = set(stopwords.words('english'))

# Print the stopwords
print(stop_words)


{'did', 'before', 'o', 'you', 'because', "you'd", 'off', 'not', "won't", 'its', 'these', 'up', 'shan', 'be', 'just', "you've", 'to', 'theirs', 'then', 'no', 'will', "wasn't", "isn't", "weren't", 'about', 've', "hadn't", "hasn't", 'our', 'she', 'nor', 'this', 'whom', 'same', 'doing', 'further', 'needn', 'from', 'there', 'it', 'him', 'once', 'too', 'what', 'on', 'over', 'an', 'does', 'your', "should've", "mightn't", 'doesn', 'am', 's', 'until', 'when', 'both', 'during', "mustn't", 'couldn', "don't", 'was', 'shouldn', 'if', 'into', 'as', 're', 'while', 'mightn', 't', 'below', 'weren', 'with', 'had', 'own', 'themselves', 'can', 'where', 'any', 'd', 'in', 'their', 'them', "needn't", 'that', 'the', 'more', "she's", 'myself', "couldn't", 'wouldn', 'yourself', 'between', 'has', 'how', 'hers', 'who', 'each', "that'll", 'herself', 'few', 'should', 'yours', 'why', 'his', 'itself', 'by', 'i', 'having', 'is', 'ours', 'they', 'haven', 'but', 'here', 'only', 'my', 'm', 'very', 'yourselves', "you're",

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\win10\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
print(removed_stopwords)

['to', 'in', 'will', 'in', 'the', 'of', 'the', 'in', 'on', 'The', 'and', 'the', 'I', 'was', 'in', 'the', 'I', 'was', 'a', 'in', 'the', 'to', 'a', 'over', 'who', 'his', 'in', 'the', 'won', 'the', 'of', 'the', 'before', 'being', 'at', 'the', 'of', 'the', 'But', 'he', 'and', 'then', 'again', 'to', 'It', 'when', 'you', 'have', 'against', 'a', 'and', 'do', 'do', 'with', 'them', 'I', 'a', 'few', 'and', 'he', 'will', 'into', 'the', 'for', 'the', 'after', 'his', 'through', 'to', 'the', 'It', 'a', 'of', 'and', 'a', 'of', 'in', 'my', 'he', 'I', 'did', 'I', 'to', 'a', 'and', 'now', 'I', 'am', 'here', 'I', 'I', 'on', 'the', 'I', 'and', 'a', 'and', 'I', 'be', 'up', 'there']


In [10]:
print(removed_noise)

['.', "'s", ',', ',', '7-6', '(', '7-3', ')', '6-3', '.', '``', ',', "''", '.', '``', '.', "''", '6-7', '(', '3-7', ')', '6-3', '6-3', ',', '.', 'first-set', 'tie-break', '.', '4-2', '.', '``', "'s", 'top-five', "n't", ',', "''", '.', '``', '.', "''", "'s", '50', '.', '``', "'s", ',', "''", '.', '``', "n't", '.', "'ve", '.', '``', "'m", '.', "'ll", "'ll", '.', "''"]
