In [151]:
# Importing the libraries
import os
import warnings
from collections import Counter

import nltk
import numpy as np
import pandas as pd
from nltk.data import load
from nltk.tokenize import regexp_tokenize

# Suppress every Python warning for the rest of the session.
# NOTE(review): this also hides pandas deprecation warnings raised by later cells.
warnings.filterwarnings('ignore')

In [152]:
# Remove the cap on displayed columns so wide frames render in full.
pd.set_option('display.max_columns', None)

In [153]:
def save_to_csv(dataset, file_path, file_name):
    """Function to save the dataset in csv format.

    Parameters:
    -----------
    dataset: pandas dataframe
        The dataset to be saved in the csv format; written without
        its index.
    file_path: String
        The directory where the dataset is to be stored. A trailing
        path separator is optional.
    file_name: String
        The name of the saved file.
    """
    # os.path.join adds a separator only when file_path lacks one, so
    # '../out' and '../out/' resolve to the same destination file.
    destination = os.path.join(file_path, file_name)
    dataset.to_csv(destination, index=False)

In [154]:
def create_tokens(sentence):
    '''Tokenize a sentence with regexp_tokenize, keeping only runs of
    word characters (letters, digits and underscore).

    Parameters:
    -----------
    sentence: string
        The sentence which is to be tokenized.

    Returns:
    --------
    tokens: list
        List of all the tokens in the sentence.
    '''
    word_pattern = r'\w+'
    return regexp_tokenize(sentence, pattern=word_pattern)

In [155]:
def create_tag_dict(tokens):
    '''Function to count how often each POS tag occurs in a sentence.

    Parameters:
    -----------
    tokens: list
        Contains the tokens whose tag and count is to be formed.

    Returns:
    --------
    tag_list: list
        Distinct tags in the sentence, in order of first appearance.
    tag_count: list
        List containing the count of the corresponding element
        in tag_list.
    '''
    # Counter tallies in one pass (list.count per distinct tag rescans the
    # whole list) and keeps first-seen order, so the output no longer
    # depends on set iteration order.
    counts = Counter(tag for _, tag in nltk.pos_tag(tokens))
    return list(counts.keys()), list(counts.values())

In [156]:
def create_pos_tag(dataset, taglist):
    '''Function to create a dataset with columns as pos tag.

    Parameters:
    -----------
    dataset: pandas dataframe
        The dataset whose sentences are to be converted into pos tags.
        Must provide a 'news' column (sentence text) and a 'label' column.
    taglist: list
        List containing the available tag names.

    Returns:
    --------
    pos_dataset: pandas dataset
        One row per row of dataset (renumbered from 0) with one integer
        count column per tag: every tag in taglist plus any other tag
        the tagger produced, sorted by name, 0 where a tag does not
        occur. The 'label' values of dataset follow as the last column.
    '''
    # Collect one {tag: count} record per sentence and build the frame once.
    # DataFrame.append was removed in pandas 2.x, and growing a frame row by
    # row copies the whole table on every iteration.
    records = []
    for sentence in dataset['news']:
        tag_list, tag_count = create_tag_dict(create_tokens(sentence))
        records.append(dict(zip(tag_list, tag_count)))

    if records:
        observed = set().union(*records)
        columns = sorted(observed.union(taglist))
    else:
        # No rows at all: keep taglist in the order it was given.
        columns = list(taglist)

    pos_dataset = pd.DataFrame(records, columns=columns).fillna(0).astype(int)
    # Copy labels by position: index-aligned assignment would produce NaN
    # labels whenever dataset does not carry a default 0..n-1 index.
    pos_dataset['label'] = dataset['label'].to_numpy()
    return pos_dataset

In [157]:
# Load the train / validation / test splits from disk.
train_data, valid_data, test_data = map(
    pd.read_csv,
    ('../datasets/train.csv', '../datasets/valid.csv', '../datasets/test.csv'),
)

In [158]:
# Peek at five random training rows (no random_state given, so the rows differ per run).
train_data.sample(5)

Unnamed: 0,label,news
820,True,Says an array of statistics show that conditio...
1849,True,Two-thirds of the people who start out in mini...
4388,True,Texans spend $2.5 billion gambling in our neig...
2797,False,Says Mike Dovilla supports a plan that could a...
1254,False,Says Hillary Clinton wants to have open borders.


In [159]:
# Peek at five random validation rows (no random_state given, so the rows differ per run).
valid_data.sample(5)

Unnamed: 0,label,news
235,True,Marcy Kaptur voted against a ban which would h...
709,True,"Over the last 10 years, incomes for the top 1 ..."
553,True,"Energy nominee Steven Chu has called coal ""his..."
65,False,Says that when Democrats controlled Congress a...
1157,False,Says Barack Obama is a Muslim.


In [160]:
# Peek at five random test rows (no random_state given, so the rows differ per run).
test_data.sample(5)

Unnamed: 0,label,news
754,True,Congress used earmarks for more than 200 years.
639,False,The U.S. Supreme Court struck down Wisconsins ...
917,True,Florida is enjoying its lowest crime rate in 3...
444,True,"While 9,000 state employees were added to the ..."
799,True,"To this day, (the Cuban government) is a regim..."


In [161]:
import nltk
# Fetch the 'tagsets_json' NLTK data package (a no-op when it is already up to date).
nltk.download('tagsets_json')
from nltk.help import upenn_tagset
# Print the description and example words of every tag in the UPenn tag set.
upenn_tagset()

$: dollar
    $ -$ --$ A$ C$ HK$ M$ NZ$ S$ U.S.$ US$
'': closing quotation mark
    ' ''
(: opening parenthesis
    ( [ {
): closing parenthesis
    ) ] }
,: comma
    ,
--: dash
    --
.: sentence terminator
    . ! ?
:: colon or ellipsis
    : ; ...
CC: conjunction, coordinating
    & 'n and both but either et for less minus neither nor or plus so
    therefore times v. versus vs. whether yet
CD: numeral, cardinal
    mid-1890 nine-thirty forty-two one-tenth ten million 0.5 one forty-
    seven 1987 twenty '79 zero two 78-degrees eighty-four IX '60s .025
    fifteen 271,124 dozen quintillion DM2,000 ...
DT: determiner
    all an another any both del each either every half la many much nary
    neither no some such that the them these this those
EX: existential there
    there
FW: foreign word
    gemeinschaft hund ich jeux habeas Haementeria Herr K'ang-si vous
    lutihaw alai je jour objets salutaris fille quibusdam pas trop Monte
    terram fiche oui corporis ...
IN: preposition or

[nltk_data] Downloading package tagsets_json to
[nltk_data]     C:\Users\Selvaram\AppData\Roaming\nltk_data...
[nltk_data]   Package tagsets_json is already up-to-date!


#### Creating POS tag dataset

In [162]:
# NOTE(review): this cell repeats the tag-set help call made in the cell before the
# "Creating POS tag dataset" header.
import nltk
# Fetch the 'tagsets_json' NLTK data package (a no-op when it is already up to date).
nltk.download('tagsets_json')
from nltk.help import upenn_tagset
upenn_tagset()  # This prints the tag descriptions


$: dollar
    $ -$ --$ A$ C$ HK$ M$ NZ$ S$ U.S.$ US$
'': closing quotation mark
    ' ''
(: opening parenthesis
    ( [ {
): closing parenthesis
    ) ] }
,: comma
    ,
--: dash
    --
.: sentence terminator
    . ! ?
:: colon or ellipsis
    : ; ...
CC: conjunction, coordinating
    & 'n and both but either et for less minus neither nor or plus so
    therefore times v. versus vs. whether yet
CD: numeral, cardinal
    mid-1890 nine-thirty forty-two one-tenth ten million 0.5 one forty-
    seven 1987 twenty '79 zero two 78-degrees eighty-four IX '60s .025
    fifteen 271,124 dozen quintillion DM2,000 ...
DT: determiner
    all an another any both del each either every half la many much nary
    neither no some such that the them these this those
EX: existential there
    there
FW: foreign word
    gemeinschaft hund ich jeux habeas Haementeria Herr K'ang-si vous
    lutihaw alai je jour objets salutaris fille quibusdam pas trop Monte
    terram fiche oui corporis ...
IN: preposition or

[nltk_data] Downloading package tagsets_json to
[nltk_data]     C:\Users\Selvaram\AppData\Roaming\nltk_data...
[nltk_data]   Package tagsets_json is already up-to-date!


In [163]:
import nltk
nltk.download('tagsets_json')
import json
from nltk.data import find

# Read the UPenn tag-set description file from the 'tagsets_json' package.
# JSON text is UTF-8; naming the encoding avoids decoding the file with the
# platform default (e.g. cp1252 on Windows).
with open(find('help/tagsets_json/PY3_json/upenn_tagset.json'), encoding='utf-8') as f:
    tagdict = json.load(f)

# All tag names of the tag set — presumably the `taglist` argument handed to
# create_pos_tag (confirm at the call site).
taglist = list(tagdict.keys())



[nltk_data] Downloading package tagsets_json to
[nltk_data]     C:\Users\Selvaram\AppData\Roaming\nltk_data...
[nltk_data]   Package tagsets_json is already up-to-date!


In [164]:
import nltk
from nltk.tokenize import regexp_tokenize
import pandas as pd

# Download required NLTK resources
# NOTE(review): a later cell downloads 'averaged_perceptron_tagger_eng' instead,
# marked as required for newer NLTK versions — confirm which name the installed NLTK needs.
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')  # If you use word_tokenize elsewhere (this cell tokenizes with regexp_tokenize)

# Tokenizer function
def create_tokens(sentence):
    """Return the runs of word characters found in ``sentence`` as a list."""
    word_pattern = r'\w+'
    tokens = regexp_tokenize(sentence, pattern=word_pattern)
    return tokens

# POS tag dictionary builder
def create_tag_dict(tokens):
    """Count the POS tags that nltk.pos_tag assigns to ``tokens``.

    Parameters:
    -----------
    tokens: list
        Tokens of one sentence.

    Returns:
    --------
    tag_set: list
        Distinct tags, in order of first appearance.
    tag_count: dict
        Maps each tag in tag_set to its number of occurrences.
    """
    # Counter tallies in one pass (list.count per distinct tag rescans the
    # list each time) and keeps first-seen order, so the result does not
    # depend on set iteration order.
    counts = Counter(tag for _, tag in nltk.pos_tag(tokens))
    return list(counts), dict(counts)

# POS tagging for entire dataset
def create_pos_tag(dataset, taglist):
    """Build a per-sentence POS-tag count table for ``dataset``.

    Parameters:
    -----------
    dataset: pandas dataframe
        Must provide a 'news' column holding the text to tag.
    taglist: list
        Tag names that become the count columns; tags outside this
        list are not kept.

    Returns:
    --------
    pos_dataset: pandas dataframe
        One row per input row (renumbered from 0) with an integer count
        per tag, 0 where the tag does not occur. Tag columns are in
        sorted order for non-empty input; an empty input keeps the
        order of taglist.
    """
    # Gather one {tag: count} dict per sentence and build the frame once;
    # concatenating a one-row frame per iteration copies the whole table
    # every time.
    records = []
    for sentence in dataset['news']:
        _, tag_count = create_tag_dict(create_tokens(sentence))
        records.append(tag_count)
    columns = sorted(taglist) if records else list(taglist)
    return pd.DataFrame(records, columns=columns).fillna(0).astype(int)


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Selvaram\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Selvaram\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


#### Checking if the labels in the new dataset are correct 

In [165]:
# Class balance of the test split: number of rows per label value.
test_data['label'].value_counts()

label
True     714
False    553
Name: count, dtype: int64

In [166]:
import nltk
from nltk.tokenize import regexp_tokenize
import pandas as pd

# Download required NLTK resources
# NOTE(review): a later cell downloads 'averaged_perceptron_tagger_eng' instead,
# marked as required for newer NLTK versions — confirm which name the installed NLTK needs.
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')  # Optional, if using word_tokenize elsewhere (this cell tokenizes with regexp_tokenize)

# Tokenizer function
def create_tokens(sentence):
    """Return the runs of word characters found in ``sentence`` as a list."""
    word_pattern = r'\w+'
    tokens = regexp_tokenize(sentence, pattern=word_pattern)
    return tokens

# POS tag dictionary builder
def create_tag_dict(tokens):
    """Count the POS tags that nltk.pos_tag assigns to ``tokens``.

    Parameters:
    -----------
    tokens: list
        Tokens of one sentence.

    Returns:
    --------
    tag_set: list
        Distinct tags, in order of first appearance.
    tag_count: dict
        Maps each tag in tag_set to its number of occurrences.
    """
    # Counter tallies in one pass (list.count per distinct tag rescans the
    # list each time) and keeps first-seen order, so the result does not
    # depend on set iteration order.
    counts = Counter(tag for _, tag in nltk.pos_tag(tokens))
    return list(counts), dict(counts)

# POS tagging for entire dataset
def create_pos_tag(dataset, taglist):
    """Build a per-sentence POS-tag count table for ``dataset``.

    Parameters:
    -----------
    dataset: pandas dataframe
        Must provide a 'news' column (text to tag) and a 'label' column.
    taglist: list
        Tag names that become the count columns; tags outside this
        list are not kept.

    Returns:
    --------
    pos_dataset: pandas dataframe
        One row per input row (renumbered from 0) with an integer count
        per tag, 0 where the tag does not occur, followed by the input's
        'label' values. Tag columns are in sorted order for non-empty
        input; an empty input keeps the order of taglist.
    """
    # Gather one {tag: count} dict per sentence and build the frame once;
    # concatenating a one-row frame per iteration copies the whole table
    # every time.
    records = []
    for sentence in dataset['news']:
        _, tag_count = create_tag_dict(create_tokens(sentence))
        records.append(tag_count)
    columns = sorted(taglist) if records else list(taglist)
    pos_dataset = pd.DataFrame(records, columns=columns).fillna(0).astype(int)
    pos_dataset['label'] = dataset['label'].values  # positional copy, independent of the index
    return pos_dataset


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Selvaram\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Selvaram\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [167]:
# Class balance of the training split: number of rows per label value.
train_data['label'].value_counts()

label
True     5752
False    4488
Name: count, dtype: int64

In [168]:
import nltk
from nltk.tokenize import regexp_tokenize
import pandas as pd

# Download required NLTK resources
# NOTE(review): a later cell downloads 'averaged_perceptron_tagger_eng' instead,
# marked as required for newer NLTK versions — confirm which name the installed NLTK needs.
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')  # Optional, if using word_tokenize elsewhere (this cell tokenizes with regexp_tokenize)

# Tokenizer function
def create_tokens(sentence):
    """Return the runs of word characters found in ``sentence`` as a list."""
    word_pattern = r'\w+'
    tokens = regexp_tokenize(sentence, pattern=word_pattern)
    return tokens

# POS tag dictionary builder
def create_tag_dict(tokens):
    """Count the POS tags that nltk.pos_tag assigns to ``tokens``.

    Parameters:
    -----------
    tokens: list
        Tokens of one sentence.

    Returns:
    --------
    tag_set: list
        Distinct tags, in order of first appearance.
    tag_count: dict
        Maps each tag in tag_set to its number of occurrences.
    """
    # Counter tallies in one pass (list.count per distinct tag rescans the
    # list each time) and keeps first-seen order, so the result does not
    # depend on set iteration order.
    counts = Counter(tag for _, tag in nltk.pos_tag(tokens))
    return list(counts), dict(counts)

# POS tagging for entire dataset
def create_pos_tag(dataset, taglist):
    """Build a per-sentence POS-tag count table for ``dataset``.

    Parameters:
    -----------
    dataset: pandas dataframe
        Must provide a 'news' column (text to tag) and a 'label' column.
    taglist: list
        Tag names that become the count columns; tags outside this
        list are not kept.

    Returns:
    --------
    pos_dataset: pandas dataframe
        One row per input row (renumbered from 0) with an integer count
        per tag, 0 where the tag does not occur, followed by the input's
        'label' values. Tag columns are in sorted order for non-empty
        input; an empty input keeps the order of taglist.
    """
    # Gather one {tag: count} dict per sentence and build the frame once;
    # concatenating a one-row frame per iteration copies the whole table
    # every time.
    records = []
    for sentence in dataset['news']:
        _, tag_count = create_tag_dict(create_tokens(sentence))
        records.append(tag_count)
    columns = sorted(taglist) if records else list(taglist)
    pos_dataset = pd.DataFrame(records, columns=columns).fillna(0).astype(int)
    pos_dataset['label'] = dataset['label'].values  # positional copy, independent of the index
    return pos_dataset


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Selvaram\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Selvaram\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [169]:
import nltk
from nltk.tokenize import regexp_tokenize
import pandas as pd

# Download required NLTK resources
# NOTE(review): a later cell downloads 'averaged_perceptron_tagger_eng' instead,
# marked as required for newer NLTK versions — confirm which name the installed NLTK needs.
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')  # Optional, if using word_tokenize elsewhere (this cell tokenizes with regexp_tokenize)

# Tokenizer function
def create_tokens(sentence):
    """Return the runs of word characters found in ``sentence`` as a list."""
    word_pattern = r'\w+'
    tokens = regexp_tokenize(sentence, pattern=word_pattern)
    return tokens

# POS tag dictionary builder
def create_tag_dict(tokens):
    """Count the POS tags that nltk.pos_tag assigns to ``tokens``.

    Parameters:
    -----------
    tokens: list
        Tokens of one sentence.

    Returns:
    --------
    tag_set: list
        Distinct tags, in order of first appearance.
    tag_count: dict
        Maps each tag in tag_set to its number of occurrences.
    """
    # Counter tallies in one pass (list.count per distinct tag rescans the
    # list each time) and keeps first-seen order, so the result does not
    # depend on set iteration order.
    counts = Counter(tag for _, tag in nltk.pos_tag(tokens))
    return list(counts), dict(counts)

# POS tagging for entire dataset
def create_pos_tag(dataset, taglist):
    """Build a per-sentence POS-tag count table for ``dataset``.

    Parameters:
    -----------
    dataset: pandas dataframe
        Must provide a 'news' column (text to tag) and a 'label' column.
    taglist: list
        Tag names that become the count columns; tags outside this
        list are not kept.

    Returns:
    --------
    pos_dataset: pandas dataframe
        One row per input row (renumbered from 0) with an integer count
        per tag, 0 where the tag does not occur, followed by the input's
        'label' values. Tag columns are in sorted order for non-empty
        input; an empty input keeps the order of taglist.
    """
    # Gather one {tag: count} dict per sentence and build the frame once;
    # concatenating a one-row frame per iteration copies the whole table
    # every time.
    records = []
    for sentence in dataset['news']:
        _, tag_count = create_tag_dict(create_tokens(sentence))
        records.append(tag_count)
    columns = sorted(taglist) if records else list(taglist)
    pos_dataset = pd.DataFrame(records, columns=columns).fillna(0).astype(int)
    pos_dataset['label'] = dataset['label'].values  # positional copy, independent of the index
    return pos_dataset


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Selvaram\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Selvaram\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [170]:
# Class balance of the validation split: number of rows per label value.
valid_data['label'].value_counts()

label
True     668
False    616
Name: count, dtype: int64

In [171]:
import nltk
from nltk.tokenize import regexp_tokenize
import pandas as pd

# Download required NLTK resources
# NOTE(review): a later cell downloads 'averaged_perceptron_tagger_eng' instead,
# marked as required for newer NLTK versions — confirm which name the installed NLTK needs.
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')  # Optional, if using word_tokenize elsewhere (this cell tokenizes with regexp_tokenize)

# Tokenizer function
def create_tokens(sentence):
    """Return the runs of word characters found in ``sentence`` as a list."""
    word_pattern = r'\w+'
    tokens = regexp_tokenize(sentence, pattern=word_pattern)
    return tokens

# POS tag dictionary builder
def create_tag_dict(tokens):
    """Count the POS tags that nltk.pos_tag assigns to ``tokens``.

    Parameters:
    -----------
    tokens: list
        Tokens of one sentence.

    Returns:
    --------
    tag_set: list
        Distinct tags, in order of first appearance.
    tag_count: dict
        Maps each tag in tag_set to its number of occurrences.
    """
    # Counter tallies in one pass (list.count per distinct tag rescans the
    # list each time) and keeps first-seen order, so the result does not
    # depend on set iteration order.
    counts = Counter(tag for _, tag in nltk.pos_tag(tokens))
    return list(counts), dict(counts)

# POS tagging for entire dataset
def create_pos_tag(dataset, taglist):
    """Build a per-sentence POS-tag count table for ``dataset``.

    Parameters:
    -----------
    dataset: pandas dataframe
        Must provide a 'news' column (text to tag) and a 'label' column.
    taglist: list
        Tag names that become the count columns; tags outside this
        list are not kept.

    Returns:
    --------
    pos_dataset: pandas dataframe
        One row per input row (renumbered from 0) with an integer count
        per tag, 0 where the tag does not occur, followed by the input's
        'label' values. Tag columns are in sorted order for non-empty
        input; an empty input keeps the order of taglist.
    """
    # Gather one {tag: count} dict per sentence and build the frame once;
    # concatenating a one-row frame per iteration copies the whole table
    # every time.
    records = []
    for sentence in dataset['news']:
        _, tag_count = create_tag_dict(create_tokens(sentence))
        records.append(tag_count)
    columns = sorted(taglist) if records else list(taglist)
    pos_dataset = pd.DataFrame(records, columns=columns).fillna(0).astype(int)
    pos_dataset['label'] = dataset['label'].values  # positional copy, independent of the index
    return pos_dataset

    


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Selvaram\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Selvaram\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [172]:
import nltk
from nltk.tokenize import regexp_tokenize
import pandas as pd

# Download required NLTK resources
# NOTE(review): a later cell downloads 'averaged_perceptron_tagger_eng' instead,
# marked as required for newer NLTK versions — confirm which name the installed NLTK needs.
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')  # Optional, if using word_tokenize elsewhere (this cell tokenizes with regexp_tokenize)

# Tokenizer function
def create_tokens(sentence):
    """Return the runs of word characters found in ``sentence`` as a list."""
    word_pattern = r'\w+'
    tokens = regexp_tokenize(sentence, pattern=word_pattern)
    return tokens

# POS tag dictionary builder
def create_tag_dict(tokens):
    """Count the POS tags that nltk.pos_tag assigns to ``tokens``.

    Parameters:
    -----------
    tokens: list
        Tokens of one sentence.

    Returns:
    --------
    tag_set: list
        Distinct tags, in order of first appearance.
    tag_count: dict
        Maps each tag in tag_set to its number of occurrences.
    """
    # Counter tallies in one pass (list.count per distinct tag rescans the
    # list each time) and keeps first-seen order, so the result does not
    # depend on set iteration order.
    counts = Counter(tag for _, tag in nltk.pos_tag(tokens))
    return list(counts), dict(counts)

# POS tagging for entire dataset
def create_pos_tag(dataset, taglist):
    """Build a per-sentence POS-tag count table for ``dataset``.

    Parameters:
    -----------
    dataset: pandas dataframe
        Must provide a 'news' column (text to tag) and a 'label' column.
    taglist: list
        Tag names that become the count columns; tags outside this
        list are not kept.

    Returns:
    --------
    pos_dataset: pandas dataframe
        One row per input row (renumbered from 0) with an integer count
        per tag, 0 where the tag does not occur, followed by the input's
        'label' values. Tag columns are in sorted order for non-empty
        input; an empty input keeps the order of taglist.
    """
    # Gather one {tag: count} dict per sentence and build the frame once;
    # concatenating a one-row frame per iteration copies the whole table
    # every time.
    records = []
    for sentence in dataset['news']:
        _, tag_count = create_tag_dict(create_tokens(sentence))
        records.append(tag_count)
    columns = sorted(taglist) if records else list(taglist)
    pos_dataset = pd.DataFrame(records, columns=columns).fillna(0).astype(int)
    pos_dataset['label'] = dataset['label'].values  # positional copy, independent of the index
    return pos_dataset


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Selvaram\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Selvaram\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [173]:
def create_tag_dict(tokens):
    """Count the POS tags that nltk.pos_tag assigns to ``tokens``.

    Parameters:
    -----------
    tokens: list
        Tokens of one sentence.

    Returns:
    --------
    tag_set: list
        Distinct tags, in order of first appearance.
    tag_count: dict
        Maps each tag in tag_set to its number of occurrences.
    """
    # Counter tallies in one pass (list.count per distinct tag rescans the
    # list each time) and keeps first-seen order, so the result does not
    # depend on set iteration order.
    counts = Counter(tag for _, tag in nltk.pos_tag(tokens))
    return list(counts), dict(counts)



In [174]:
import nltk
from nltk.tokenize import regexp_tokenize
import pandas as pd

# Download required NLTK resources
# NOTE(review): the next cell downloads 'averaged_perceptron_tagger_eng' instead,
# marked as required for newer NLTK versions — confirm which name the installed NLTK needs.
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')  # Optional, if using word_tokenize elsewhere (this cell tokenizes with regexp_tokenize)

# Tokenizer function
def create_tokens(sentence):
    """Return the runs of word characters found in ``sentence`` as a list."""
    word_pattern = r'\w+'
    tokens = regexp_tokenize(sentence, pattern=word_pattern)
    return tokens

# POS tag dictionary builder using built-in tagger
def create_tag_dict(tokens):
    """Count the POS tags that nltk.pos_tag assigns to ``tokens``.

    Parameters:
    -----------
    tokens: list
        Tokens of one sentence.

    Returns:
    --------
    tag_set: list
        Distinct tags, in order of first appearance.
    tag_count: dict
        Maps each tag in tag_set to its number of occurrences.
    """
    # Counter tallies in one pass (list.count per distinct tag rescans the
    # list each time) and keeps first-seen order, so the result does not
    # depend on set iteration order.
    counts = Counter(tag for _, tag in nltk.pos_tag(tokens))
    return list(counts), dict(counts)

# POS tagging for entire dataset
def create_pos_tag(dataset, taglist):
    """Build a per-sentence POS-tag count table for ``dataset``.

    Parameters:
    -----------
    dataset: pandas dataframe
        Must provide a 'news' column (text to tag) and a 'label' column.
    taglist: list
        Tag names that become the count columns; tags outside this
        list are not kept.

    Returns:
    --------
    pos_dataset: pandas dataframe
        One row per input row (renumbered from 0) with an integer count
        per tag, 0 where the tag does not occur, followed by the input's
        'label' values. Tag columns are in sorted order for non-empty
        input; an empty input keeps the order of taglist.
    """
    # Gather one {tag: count} dict per sentence and build the frame once;
    # concatenating a one-row frame per iteration copies the whole table
    # every time.
    records = []
    for sentence in dataset['news']:
        _, tag_count = create_tag_dict(create_tokens(sentence))
        records.append(tag_count)
    columns = sorted(taglist) if records else list(taglist)
    pos_dataset = pd.DataFrame(records, columns=columns).fillna(0).astype(int)
    pos_dataset['label'] = dataset['label'].values  # positional copy, independent of the index
    return pos_dataset


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Selvaram\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Selvaram\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [175]:
import nltk
from nltk.tokenize import regexp_tokenize
import pandas as pd

# Download required NLTK resources
nltk.download('averaged_perceptron_tagger_eng')  # Required for newer NLTK versions
# punkt is not needed by the functions in this cell, which tokenize with regexp_tokenize.
nltk.download('punkt')  # Optional, if using word_tokenize elsewhere

# Tokenizer function
def create_tokens(sentence):
    """Return the runs of word characters found in ``sentence`` as a list."""
    word_pattern = r'\w+'
    tokens = regexp_tokenize(sentence, pattern=word_pattern)
    return tokens

# POS tag dictionary builder using built-in tagger
def create_tag_dict(tokens):
    """Count the POS tags that nltk.pos_tag assigns to ``tokens``.

    Parameters:
    -----------
    tokens: list
        Tokens of one sentence.

    Returns:
    --------
    tag_set: list
        Distinct tags, in order of first appearance.
    tag_count: dict
        Maps each tag in tag_set to its number of occurrences.
    """
    # lang='eng' is passed explicitly to match the downloaded
    # 'averaged_perceptron_tagger_eng' model.
    tagged = nltk.pos_tag(tokens, lang='eng')
    # Counter tallies in one pass (list.count per distinct tag rescans the
    # list each time) and keeps first-seen order, so the result does not
    # depend on set iteration order.
    counts = Counter(tag for _, tag in tagged)
    return list(counts), dict(counts)

# POS tagging for entire dataset
def create_pos_tag(dataset, taglist):
    """Build a per-sentence POS-tag count table for ``dataset``.

    Parameters:
    -----------
    dataset: pandas dataframe
        Must provide a 'news' column (text to tag) and a 'label' column.
    taglist: list
        Tag names that become the count columns; tags outside this
        list are not kept.

    Returns:
    --------
    pos_dataset: pandas dataframe
        One row per input row (renumbered from 0) with an integer count
        per tag, 0 where the tag does not occur, followed by the input's
        'label' values. Tag columns are in sorted order for non-empty
        input; an empty input keeps the order of taglist.
    """
    # Gather one {tag: count} dict per sentence and build the frame once;
    # concatenating a one-row frame per iteration copies the whole table
    # every time.
    records = []
    for sentence in dataset['news']:
        _, tag_count = create_tag_dict(create_tokens(sentence))
        records.append(tag_count)
    columns = sorted(taglist) if records else list(taglist)
    pos_dataset = pd.DataFrame(records, columns=columns).fillna(0).astype(int)
    pos_dataset['label'] = dataset['label'].values  # positional copy, independent of the index
    return pos_dataset



[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\Selvaram\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Selvaram\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


#### Saving the POS tag datasets to disk

In [176]:
import nltk
from nltk.tokenize import regexp_tokenize
import pandas as pd

# Download required NLTK resources
nltk.download('averaged_perceptron_tagger_eng')  # Required for newer NLTK versions
# punkt is not needed by the functions in this cell, which tokenize with regexp_tokenize.
nltk.download('punkt')  # Optional, if using word_tokenize elsewhere

# Tokenizer function
def create_tokens(sentence):
    """Return the runs of word characters found in ``sentence`` as a list."""
    word_pattern = r'\w+'
    tokens = regexp_tokenize(sentence, pattern=word_pattern)
    return tokens

# POS tag dictionary builder
def create_tag_dict(tokens):
    """Count the POS tags that nltk.pos_tag assigns to ``tokens``.

    Parameters:
    -----------
    tokens: list
        Tokens of one sentence.

    Returns:
    --------
    tag_set: list
        Distinct tags, in order of first appearance.
    tag_count: dict
        Maps each tag in tag_set to its number of occurrences.
    """
    # lang='eng' is passed explicitly to match the downloaded
    # 'averaged_perceptron_tagger_eng' model.
    tagged = nltk.pos_tag(tokens, lang='eng')
    # Counter tallies in one pass (list.count per distinct tag rescans the
    # list each time) and keeps first-seen order, so the result does not
    # depend on set iteration order.
    counts = Counter(tag for _, tag in tagged)
    return list(counts), dict(counts)

# POS tagging for entire dataset
def create_pos_tag(dataset, taglist):
    """Build a per-sentence POS-tag count table for ``dataset``.

    Parameters:
    -----------
    dataset: pandas dataframe
        Must provide a 'news' column (text to tag) and a 'label' column.
    taglist: list
        Tag names that become the count columns; tags outside this
        list are not kept.

    Returns:
    --------
    pos_dataset: pandas dataframe
        One row per input row (renumbered from 0) with an integer count
        per tag, 0 where the tag does not occur, followed by the input's
        'label' values. Tag columns are in sorted order for non-empty
        input; an empty input keeps the order of taglist.
    """
    # Gather one {tag: count} dict per sentence and build the frame once;
    # concatenating a one-row frame per iteration copies the whole table
    # every time.
    records = []
    for sentence in dataset['news']:
        _, tag_count = create_tag_dict(create_tokens(sentence))
        records.append(tag_count)
    columns = sorted(taglist) if records else list(taglist)
    pos_dataset = pd.DataFrame(records, columns=columns).fillna(0).astype(int)
    pos_dataset['label'] = dataset['label'].values  # positional copy, independent of the index
    return pos_dataset

# CSV saving function
def save_to_csv(dataset, file_path, file_name):
    """Write ``dataset`` as CSV (without its index) to ``file_name`` in ``file_path``.

    Parameters:
    -----------
    dataset: pandas dataframe
        The dataset to be saved.
    file_path: String
        Directory in which the file is created. A trailing path
        separator is optional.
    file_name: String
        The name of the saved file.
    """
    # os.path.join adds a separator only when file_path lacks one, so
    # '../out' and '../out/' resolve to the same destination file.
    destination = os.path.join(file_path, file_name)
    dataset.to_csv(destination, index=False)
import nltk
from nltk.tokenize import regexp_tokenize
import pandas as pd

# Download required NLTK resources
# NOTE(review): these two downloads already ran earlier in this same cell.
nltk.download('averaged_perceptron_tagger_eng')  # Required for newer NLTK versions
nltk.download('punkt')  # Optional, if using word_tokenize elsewhere

# Tokenizer function
def create_tokens(sentence):
    """Return the runs of word characters found in ``sentence`` as a list."""
    word_pattern = r'\w+'
    tokens = regexp_tokenize(sentence, pattern=word_pattern)
    return tokens

# POS tag dictionary builder
def create_tag_dict(tokens):
    """Count the POS tags that nltk.pos_tag assigns to ``tokens``.

    Parameters:
    -----------
    tokens: list
        Tokens of one sentence.

    Returns:
    --------
    tag_set: list
        Distinct tags, in order of first appearance.
    tag_count: dict
        Maps each tag in tag_set to its number of occurrences.
    """
    # lang='eng' is passed explicitly to match the downloaded
    # 'averaged_perceptron_tagger_eng' model.
    tagged = nltk.pos_tag(tokens, lang='eng')
    # Counter tallies in one pass (list.count per distinct tag rescans the
    # list each time) and keeps first-seen order, so the result does not
    # depend on set iteration order.
    counts = Counter(tag for _, tag in tagged)
    return list(counts), dict(counts)

# POS tagging for entire dataset
def create_pos_tag(dataset, taglist):
    """Build a per-sentence POS-tag count table for ``dataset``.

    Parameters:
    -----------
    dataset: pandas dataframe
        Must provide a 'news' column (text to tag) and a 'label' column.
    taglist: list
        Tag names that become the count columns; tags outside this
        list are not kept.

    Returns:
    --------
    pos_dataset: pandas dataframe
        One row per input row (renumbered from 0) with an integer count
        per tag, 0 where the tag does not occur, followed by the input's
        'label' values. Tag columns are in sorted order for non-empty
        input; an empty input keeps the order of taglist.
    """
    # Gather one {tag: count} dict per sentence and build the frame once;
    # concatenating a one-row frame per iteration copies the whole table
    # every time.
    records = []
    for sentence in dataset['news']:
        _, tag_count = create_tag_dict(create_tokens(sentence))
        records.append(tag_count)
    columns = sorted(taglist) if records else list(taglist)
    pos_dataset = pd.DataFrame(records, columns=columns).fillna(0).astype(int)
    pos_dataset['label'] = dataset['label'].values  # positional copy, independent of the index
    return pos_dataset

# CSV saving function
def save_to_csv(dataset, file_path, file_name):
    """Write ``dataset`` as CSV (without its index) to ``file_name`` in ``file_path``.

    Parameters:
    -----------
    dataset: pandas dataframe
        The dataset to be saved.
    file_path: String
        Directory in which the file is created. A trailing path
        separator is optional.
    file_name: String
        The name of the saved file.
    """
    # os.path.join adds a separator only when file_path lacks one, so
    # '../out' and '../out/' resolve to the same destination file.
    destination = os.path.join(file_path, file_name)
    dataset.to_csv(destination, index=False)


[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\Selvaram\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Selvaram\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\Selvaram\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Selvaram\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
