In [1]:
# Importing the libraries
import warnings

import nltk
import numpy as np
import pandas as pd
from nltk.data import load
from nltk.tokenize import regexp_tokenize

# Silence every warning category for the rest of the session.
warnings.filterwarnings(action='ignore')

In [2]:
# Show every column when displaying wide dataframes. The fully qualified
# key is used because the bare 'max_columns' pattern matches more than one
# option in recent pandas releases, which makes set_option raise OptionError.
pd.set_option('display.max_columns', None)

In [3]:
def save_to_csv(dataset, file_path, file_name):
    """Function to save the dataset in csv format.

    The index of the dataframe is not written to the file.

    Parameters:
    -----------
    dataset: pandas dataframe
        The dataset to be saved in the csv format.
    file_path: String
        The directory where the dataset is to be stored. A trailing
        path separator is optional.
    file_name: String
        The name of the saved file.
    """
    import os

    # os.path.join inserts the separator only when it is missing, so both
    # './datasets' and './datasets/' resolve to the same target file.
    complete_file_path_with_name = os.path.join(file_path, file_name)
    dataset.to_csv(complete_file_path_with_name, index=False)

In [4]:
def create_tokens(sentence):
    '''Split a sentence into word tokens with regexp_tokenize.

    Every maximal run of word characters (letters, digits and the
    underscore, i.e. the regex class \\w) becomes one token; everything
    else, such as punctuation and whitespace, is dropped.

    Parameters:
    -----------
    sentence: string
        The sentence which is to be tokenized.

    Returns:
    --------
    tokens: list
        List of all the tokens in the sentence, in order of appearance.
    '''
    word_pattern = r'\w+'
    return regexp_tokenize(sentence, pattern=word_pattern)

In [5]:
def create_tag_dict(tokens):
    '''Function to count how often each pos tag occurs in the tokens.

    The tokens are tagged with nltk.pos_tag and the tags are counted in
    a single pass.

    Parameters:
    -----------
    tokens: list
        Contains the tokens whose tag and count is to be formed.

    Returns:
    --------
    tag_list: list
        List of the distinct tags in the sentence, in order of first
        appearance.
    tag_count: list
        List containing the count of the corresponding element
        in tag_list.
    '''
    from collections import Counter

    # Counter tallies every tag in one pass (instead of one list.count
    # scan per distinct tag) and keeps first-seen order, so the result
    # is the same from run to run.
    tag_counter = Counter(tag for _, tag in nltk.pos_tag(tokens))
    return list(tag_counter.keys()), list(tag_counter.values())

In [6]:
def create_pos_tag(dataset, taglist):
    '''Function to create a dataset with columns as pos tag.

    Each sentence in the 'news' column is tokenized with create_tokens
    and its tags are counted with create_tag_dict. The result has one
    row per input row, holding the count of every tag (0 when the tag
    does not occur in the sentence).

    Parameters:
    -----------
    dataset: pandas dataframe
        The dataset whose sentences are to be converted
        into pos tags. Must contain the columns 'news' and 'label'.
    taglist: list
        List containing the available tag names.

    Returns:
    --------
    pos_dataset: pandas dataset
        Dataset with one integer column per tag (the tags of taglist
        plus any other tag found in the sentences, sorted by name),
        followed by a 'label' column. Rows are indexed 0..n-1 in the
        order of the input rows, and the labels are copied unchanged.
    '''
    # Collect one {tag: count} record per sentence and build the frame in
    # a single call. Growing a dataframe row by row is quadratic, and
    # DataFrame.append no longer exists in pandas 2.0.
    records = []
    for sentence in dataset['news']:
        # Tokenize the sentence of each row
        tokens = create_tokens(sentence)
        # Create tag_list and tag_count of the sentence
        tag_list, tag_count = create_tag_dict(tokens)
        records.append(dict(zip(tag_list, tag_count)))

    # Columns are every known tag plus any tag that was actually seen,
    # sorted by name.
    columns = sorted(set(taglist).union(*records))
    pos_dataset = pd.DataFrame(records, columns=columns)
    # Tags missing from a sentence show up as NaN; they mean a count of 0
    pos_dataset = pos_dataset.fillna(0).astype(int)

    # Adding the label of the dataset to pos_dataset. The labels are
    # assigned by position: pos_dataset has a fresh 0..n-1 index, so
    # aligning on the index of dataset would mismatch (and lose) labels
    # whenever dataset does not carry a default index.
    pos_dataset['label'] = dataset['label'].to_numpy()

    return pos_dataset

In [7]:
# Loading the train, validation and test splits from disk
train_data, valid_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        './datasets/train.csv',
        './datasets/valid.csv',
        './datasets/test.csv',
    )
)

In [8]:
# Preview five random rows of the training split
train_data.sample(n=5)

Unnamed: 0,label,news
7219,False,Ronald Reagans signature on the 1986 amnesty a...
7022,False,$3 billion over the next five years will be ta...
8322,False,Obamacare isnt helping anyone.
8897,True,While 38 percent of (firearms) dealers that we...
685,True,Says Arizona congressional candidate Ray Strau...


In [9]:
# Preview five random rows of the validation split
valid_data.sample(n=5)

Unnamed: 0,label,news
277,False,High-speed rail would have cost Florida taxpay...
1190,True,John McCain stood up to the president and soun...
1031,False,Boyce gave (a lobbyists) wife a sensitive job ...
523,False,Says his much-discussed boots are made in Wisc...
663,True,Says he will protect your guaranteed benefits ...


In [10]:
# Preview five random rows of the test split
test_data.sample(n=5)

Unnamed: 0,label,news
772,False,Already in Wisconsin we have seen fewer people...
944,False,Says the case of a Texas judge who refused to ...
859,True,Says they said it was impossible to balance a ...
834,False,"Because of voter fraud, Republican candidates ..."
810,True,"By some estimates, as few as 2 percent of the ..."


In [11]:
# Load the pickled upenn tagset through nltk.data.load; its keys are
# the tag names used as feature columns.
tagdict = load('./help/tagsets/upenn_tagset.pickle')
taglist = [*tagdict.keys()]
num_tags = len(taglist)
print('Number of tags: {}'.format(num_tags))
taglist

Number of tags: 45


['LS',
 'TO',
 'VBN',
 "''",
 'WP',
 'UH',
 'VBG',
 'JJ',
 'VBZ',
 '--',
 'VBP',
 'NN',
 'DT',
 'PRP',
 ':',
 'WP$',
 'NNPS',
 'PRP$',
 'WDT',
 '(',
 ')',
 '.',
 ',',
 '``',
 '$',
 'RB',
 'RBR',
 'RBS',
 'VBD',
 'IN',
 'FW',
 'RP',
 'JJR',
 'JJS',
 'PDT',
 'MD',
 'VB',
 'WRB',
 'NNP',
 'EX',
 'NNS',
 'SYM',
 'CC',
 'CD',
 'POS']

#### Creating POS tag dataset

In [12]:
# Build the POS tag counts for the training split and preview five rows
train_data_pos = create_pos_tag(dataset=train_data, taglist=taglist)
train_data_pos.sample(n=5)

Unnamed: 0,$,'',(,),",",--,.,:,CC,CD,DT,EX,FW,IN,JJ,JJR,JJS,LS,MD,NN,NNP,NNPS,NNS,PDT,POS,PRP,PRP$,RB,RBR,RBS,RP,SYM,TO,UH,VB,VBD,VBG,VBN,VBP,VBZ,WDT,WP,WP$,WRB,``,label
8990,0,0,0,0,0,0,0,0,3,0,5,0,0,5,0,0,0,0,1,6,2,0,2,0,0,2,0,1,0,0,0,0,1,0,2,0,2,0,1,0,0,0,0,0,0,True
4005,0,0,0,0,0,0,0,0,0,0,3,0,0,1,1,1,0,0,0,3,0,0,3,0,0,0,0,1,0,0,0,0,1,0,1,0,1,0,1,0,0,0,0,0,0,True
7238,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,1,2,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,False
5738,0,0,0,0,0,0,0,0,0,3,1,0,0,0,0,0,0,0,0,1,4,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,False
8798,0,0,0,0,0,0,0,0,0,0,0,0,0,4,0,1,0,0,0,5,4,0,2,0,0,0,0,0,1,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,True


In [13]:
# Build the POS tag counts for the validation split and preview five rows
valid_data_pos = create_pos_tag(dataset=valid_data, taglist=taglist)
valid_data_pos.sample(n=5)

Unnamed: 0,$,'',(,),",",--,.,:,CC,CD,DT,EX,FW,IN,JJ,JJR,JJS,LS,MD,NN,NNP,NNPS,NNS,PDT,POS,PRP,PRP$,RB,RBR,RBS,RP,SYM,TO,UH,VB,VBD,VBG,VBN,VBP,VBZ,WDT,WP,WP$,WRB,``,label
1061,0,0,0,0,0,0,0,0,2,1,0,0,0,4,2,0,0,0,0,3,2,0,6,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,False
688,0,0,0,0,0,0,0,0,0,0,4,0,0,5,2,0,0,0,0,4,0,0,3,0,0,1,0,1,0,0,0,0,0,0,0,0,0,2,1,1,0,0,0,0,0,True
1246,0,0,0,0,0,0,0,0,0,0,2,0,0,2,0,0,1,0,0,4,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,False
83,0,0,0,0,0,0,0,0,0,1,2,0,0,3,2,0,0,0,0,5,2,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,False
276,0,0,0,0,0,0,0,0,0,2,0,0,0,3,3,0,0,0,0,2,2,0,3,0,0,1,0,0,1,0,0,0,3,0,2,1,1,0,1,0,0,0,0,0,0,True


In [14]:
# Build the POS tag counts for the test split and preview five rows
test_data_pos = create_pos_tag(dataset=test_data, taglist=taglist)
test_data_pos.sample(n=5)

Unnamed: 0,$,'',(,),",",--,.,:,CC,CD,DT,EX,FW,IN,JJ,JJR,JJS,LS,MD,NN,NNP,NNPS,NNS,PDT,POS,PRP,PRP$,RB,RBR,RBS,RP,SYM,TO,UH,VB,VBD,VBG,VBN,VBP,VBZ,WDT,WP,WP$,WRB,``,label
1091,0,0,0,0,0,0,0,0,0,0,2,0,0,2,1,0,0,0,0,4,2,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,2,0,0,0,0,0,False
751,0,0,0,0,0,0,0,0,0,0,3,0,0,2,1,0,0,0,0,2,4,0,2,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,1,1,0,0,0,0,0,False
632,0,0,0,0,0,0,0,0,0,0,0,0,0,1,3,0,0,0,0,3,1,0,2,0,0,0,0,0,0,0,0,0,2,0,1,1,0,0,1,0,0,1,0,0,0,True
1003,0,0,0,0,0,0,0,0,0,3,2,0,0,3,1,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,True
565,0,0,0,0,0,0,0,0,0,0,2,0,0,2,0,0,0,0,0,2,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,False


#### Checking that the labels in the new datasets match the originals

In [15]:
# Label distribution of the original test split
test_data.loc[:, 'label'].value_counts()

True     714
False    553
Name: label, dtype: int64

In [16]:
# Label distribution of the POS tag test split
test_data_pos.loc[:, 'label'].value_counts()

True     714
False    553
Name: label, dtype: int64

In [17]:
# Label distribution of the original training split
train_data.loc[:, 'label'].value_counts()

True     5752
False    4488
Name: label, dtype: int64

In [18]:
# Label distribution of the POS tag training split
train_data_pos.loc[:, 'label'].value_counts()

True     5752
False    4488
Name: label, dtype: int64

In [19]:
# Label distribution of the POS tag validation split
valid_data_pos.loc[:, 'label'].value_counts()

True     668
False    616
Name: label, dtype: int64

In [20]:
# Label distribution of the original validation split
valid_data.loc[:, 'label'].value_counts()

True     668
False    616
Name: label, dtype: int64

In [21]:
# Row-by-row comparison of the training labels before and after tagging
train_labels_match = train_data['label'] == train_data_pos['label']
train_labels_match.all()

True

In [22]:
# Row-by-row comparison of the test labels before and after tagging
test_labels_match = test_data['label'] == test_data_pos['label']
test_labels_match.all()

True

In [23]:
# Row-by-row comparison of the validation labels before and after tagging
valid_labels_match = valid_data['label'] == valid_data_pos['label']
valid_labels_match.all()

True

#### Saving the POS tag datasets to disk

In [24]:
# Write each POS tag split next to the original datasets
save_path = './datasets/'
for pos_frame, csv_name in (
    (train_data_pos, 'train_pos.csv'),
    (valid_data_pos, 'valid_pos.csv'),
    (test_data_pos, 'test_pos.csv'),
):
    save_to_csv(pos_frame, save_path, csv_name)