In [1]:
# Importing the datasets
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

import nltk
from nltk.data import load
from nltk.tokenize import regexp_tokenize

In [2]:
pd.set_option('max_columns', None)

In [3]:
def save_to_csv(dataset, file_path, file_name):
    """Function to save the dataset in csv format.
    Parameters:
    -----------
    dataset: pandas dataframe
        The dataset to be saved in the csv format.
    file_path: String
        The path where the dataset is to be stored.
    file_name: String
        The name of the saved file.
    """
    complete_file_path_with_name = file_path + file_name
    dataset.to_csv(complete_file_path_with_name, index=False)

In [4]:
def create_tokens(sentence):
    '''Function to create tokens from the sentences using
    regexp_tokenize that only extracts alphanumeric characters.
    
    Parameters:
    -----------
    sentence: string
        The sentence which is to be tokenized.
        
    Returns:
    --------
    tokens: list
        List of all the tokens in the sentence.
    '''
    tokens = regexp_tokenize(sentence, pattern=r'\w+')
    return tokens

In [5]:
def create_tag_dict(tokens):
    '''Function to create a dictionary with the tag as key
    and count of tag in sentence as value.
    
    Parameters:
    -----------
    tokens: list
        Contains the tokens whose tag and count is to be formed.
        
    Returns:
    --------
    tag_list: list
        List of tags in the sentence
    tag_count: list
        List containing the value of corresponding element
        in tag_list
    '''
    tag_tuple = nltk.pos_tag(tokens)
    tags = [a[1] for a in tag_tuple]
    tag_set = list(set(tags))
    # Creating tag_dict where key is the tag and value is the count
    tag_dict = {ele:tags.count(ele) for ele in tag_set}
    return list(tag_dict.keys()), list(tag_dict.values())

In [6]:
def create_pos_tag(dataset, taglist):
    '''Function to create a dataset with columns as pos tag.
    
    Parameters:
    -----------
    dataset: pandas dataframe
        The dataset whose sentences are to be converted
        into pos tags.
    taglist: list
        List containing the available tag names.
        
    Returns:
    --------
    pos_dataset: pandas dataset
        Dataset with the columns as the taglist and the count of
        the sentences for each row of dataset.
    '''
    pos_dataset = pd.DataFrame(columns=taglist)
    
    # Accessing each row in the dataset
    for _, row in dataset.iterrows():
        # Tokenize the sentence of each row
        tokens = create_tokens(row['news'])
        # Create tag_list and tag_count of the sentence
        tag_list, tag_count = create_tag_dict(tokens)
        df1 = pd.DataFrame([tag_count], columns=tag_list)
        pos_dataset = pos_dataset.append(df1, sort=True)
        
    # Adding the label of the dataset to pos_dataset
    pos_dataset['label'] = dataset['label']
    # Filling NaNs with 0
    pos_dataset.fillna(0, inplace=True)
    # Resetting the index of the pos_dataset
    return pos_dataset.reset_index(drop=True)

In [7]:
# Importing the dataset
train_data = pd.read_csv('./datasets/train.csv')
valid_data = pd.read_csv('./datasets/valid.csv')
test_data = pd.read_csv('./datasets/test.csv')

In [8]:
train_data.sample(5)

Unnamed: 0,label,news
6355,True,A statewide poll showing 76 percent support an...
9235,True,The Great Lakes Compact has a loophole. And th...
4354,False,The average age of planes in the Qatar Airways...
6137,True,The rate of uninsured Americans (is) 8.8 percent.
231,True,Says David Perdue wants to abolish the U.S. De...


In [9]:
valid_data.sample(5)

Unnamed: 0,label,news
1082,True,"Right now, one third of all illegal aliens are..."
185,False,"Last year, Beaverton School District had the h..."
291,False,Test scores had gone up steadily for 40 years ...
842,True,Nearly 60 percent of women who use birth contr...
1085,False,National studies are already showing the negat...


In [10]:
test_data.sample(5)

Unnamed: 0,label,news
908,False,I never favored cap and trade.
1044,True,"I wrote to Secretary Paulson, I wrote to Feder..."
435,False,U.S. Rep. Jim Langevin didn't want a border fe...
281,False,Mitch McConnell voted with Harry Reid to infri...
340,True,Almost every state has offered an insurance pl...


In [11]:
# Importing all the tagsets from nltk
tagdict = load('./help/tagsets/upenn_tagset.pickle')
taglist = list(tagdict.keys())
print('Number of tags: {}'.format(len(taglist)))
taglist

Number of tags: 45


['LS',
 'TO',
 'VBN',
 "''",
 'WP',
 'UH',
 'VBG',
 'JJ',
 'VBZ',
 '--',
 'VBP',
 'NN',
 'DT',
 'PRP',
 ':',
 'WP$',
 'NNPS',
 'PRP$',
 'WDT',
 '(',
 ')',
 '.',
 ',',
 '``',
 '$',
 'RB',
 'RBR',
 'RBS',
 'VBD',
 'IN',
 'FW',
 'RP',
 'JJR',
 'JJS',
 'PDT',
 'MD',
 'VB',
 'WRB',
 'NNP',
 'EX',
 'NNS',
 'SYM',
 'CC',
 'CD',
 'POS']

#### Creating POS tag dataset

In [12]:
train_data_pos = create_pos_tag(train_data, taglist)
train_data_pos.sample(5)

Unnamed: 0,$,'',(,),",",--,.,:,CC,CD,DT,EX,FW,IN,JJ,JJR,JJS,LS,MD,NN,NNP,NNPS,NNS,PDT,POS,PRP,PRP$,RB,RBR,RBS,RP,SYM,TO,UH,VB,VBD,VBG,VBN,VBP,VBZ,WDT,WP,WP$,WRB,``,label
9545,0,0,0,0,0,0,0,0,1,0,2,0,0,0,1,0,0,0,0,2,2,0,0,0,0,1,0,0,0,0,0,0,2,0,1,2,0,0,0,1,0,0,0,0,0,False
1547,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,2,0,0,1,0,0,0,0,0,False
5600,0,0,0,0,0,0,0,0,2,0,3,0,0,1,0,0,0,0,0,5,0,0,3,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,False
4940,0,0,0,0,0,0,0,0,0,0,1,0,0,3,1,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,False
2133,0,0,0,0,0,0,0,0,0,0,1,0,0,3,2,0,0,0,0,4,4,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,2,1,0,1,0,0,0,0,False


In [13]:
valid_data_pos = create_pos_tag(valid_data, taglist)
valid_data_pos.sample(5)

Unnamed: 0,$,'',(,),",",--,.,:,CC,CD,DT,EX,FW,IN,JJ,JJR,JJS,LS,MD,NN,NNP,NNPS,NNS,PDT,POS,PRP,PRP$,RB,RBR,RBS,RP,SYM,TO,UH,VB,VBD,VBG,VBN,VBP,VBZ,WDT,WP,WP$,WRB,``,label
1085,0,0,0,0,0,0,0,0,2,0,5,0,0,5,2,0,0,0,0,7,1,0,5,0,0,0,2,1,0,0,0,0,0,0,0,0,4,0,2,2,0,0,0,0,0,False
629,0,0,0,0,0,0,0,0,0,1,1,0,0,1,1,0,0,0,0,1,2,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,False
1080,0,0,0,0,0,0,0,0,1,0,4,1,0,5,2,0,0,0,0,3,1,0,2,1,0,1,0,3,0,0,0,0,0,0,0,1,1,0,1,1,0,0,0,0,0,False
513,0,0,0,0,0,0,0,0,1,0,2,0,0,3,1,0,0,0,1,5,4,0,2,0,0,1,1,0,0,0,1,0,0,0,2,0,0,0,2,0,0,0,0,0,0,False
940,0,0,0,0,0,0,0,0,0,0,3,0,0,2,1,0,0,0,0,3,1,0,1,0,0,1,0,0,0,0,0,0,1,0,1,0,1,0,1,2,0,0,0,1,0,False


In [14]:
test_data_pos = create_pos_tag(test_data, taglist)
test_data_pos.sample(5)

Unnamed: 0,$,'',(,),",",--,.,:,CC,CD,DT,EX,FW,IN,JJ,JJR,JJS,LS,MD,NN,NNP,NNPS,NNS,PDT,POS,PRP,PRP$,RB,RBR,RBS,RP,SYM,TO,UH,VB,VBD,VBG,VBN,VBP,VBZ,WDT,WP,WP$,WRB,``,label
1109,0,0,0,0,0,0,0,0,0,0,3,0,0,3,4,0,1,0,0,2,3,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,2,0,0,0,0,0,True
492,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,2,3,0,3,0,0,1,1,0,0,0,0,0,1,0,0,0,1,1,0,2,0,0,0,0,0,True
1077,0,0,0,0,0,0,0,0,1,5,1,1,0,4,0,0,0,0,0,2,1,1,2,0,0,0,0,3,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,True
769,0,0,0,0,0,0,0,0,0,0,1,0,0,4,0,1,0,0,0,2,5,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,True
192,0,0,0,0,0,0,0,0,0,0,0,0,0,2,1,0,0,0,0,1,3,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,True


#### Saving the POS tag datasets in the disk 

In [15]:
save_path = './datasets/'
save_to_csv(train_data_pos, save_path, 'train_pos.csv')
save_to_csv(valid_data_pos, save_path, 'valid_pos.csv')
save_to_csv(test_data_pos, save_path, 'test_pos.csv')