### <center>Hansard Data</center>

+ In this notebook we download the Hansard data of parliamentary speeches from Zenodo.org and extract the speeches from the time period on which we will perform the analysis.
+ We then clean the speech text with standard text pre-processing and save the processed data as pickle files.

In [17]:
import os
import re
import string
import warnings
from warnings import filterwarnings

import contractions
import demoji
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import wget

filterwarnings(action='ignore', category=DeprecationWarning)

from utils.utils import save_pickle_file

%matplotlib inline

In [3]:
## NLTK resources used by preprocess(): the English stopword list and the
## WordNet data that WordNetLemmatizer relies on.
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Fetch the two corpora (the recorded output shows NLTK reporting a package that is already up to date).
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\joshi\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\joshi\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
## Download the raw Hansard files into ./data/raw, where download_read_data() looks for them

data_url = "https://zenodo.org/record/4843485/files/hansard-speeches-v310.csv.zip"
post_url = "https://zenodo.org/record/4843485/files/parliamentary_posts.json"

raw_data_dir = os.path.join('.', 'data', 'raw')
os.makedirs(raw_data_dir, exist_ok=True)


def fetch_if_missing(url, out_dir):
    """ Download a file into a directory unless it is already there
     url - address of the file to download
     out_dir - existing directory the file is saved into
     Returns the path of the local file """

    local_path = os.path.join(out_dir, url.rsplit('/', 1)[-1])
    # wget.download never overwrites an existing file: re-running the cell would
    # fetch the archive again and store it under a " (1)"-suffixed name, so the
    # download is skipped when the file is already on disk.
    if os.path.exists(local_path):
        return local_path
    return wget.download(url, out=out_dir)


hansard_data = fetch_if_missing(data_url, raw_data_dir)
parliamentary_post = fetch_if_missing(post_url, raw_data_dir)

Reading the data and performing pre-processing

In [9]:
def download_read_data(path='./data/raw/hansard-speeches-v310.csv.zip'):
    """ The function reads the Hansard speeches CSV and returns the dataframe.
     Despite its name it does not download anything: the file must already be on disk.
     path - location of the CSV file (a zipped CSV is accepted by pandas as well);
            defaults to the copy stored under ./data/raw
     Returns a dataframe in which every column is read as a string, except
     'date', which is parsed into datetimes. """

    # Forward slashes resolve on Windows as well as on POSIX systems; the
    # previous backslash-only path could only be opened on Windows.
    df = pd.read_csv(path,
                    dtype=str,
                    parse_dates=['date'])
    return df

## Note: Hansard files were downloaded using wget
# !wget "https://zenodo.org/record/4843485/files/hansard-speeches-v310.csv.zip"
# !wget "https://zenodo.org/record/4843485/files/parliamentary_posts.json"

def extract_data_datewise(df, date_from, date_to):
    """ Return the rows of the dataframe whose 'date' lies inside a date window
     (both end dates are included)
     df - Dataframe with a 'date' column
     date_from - first date to keep
     date_to - last date to keep"""

    window_start, window_end = pd.to_datetime(date_from), pd.to_datetime(date_to)
    # between() is inclusive on both sides by default
    in_window = df['date'].between(window_start, window_end)
    return df[in_window]


def drop_columns(df, drop_cols):
    """ Remove unwanted columns, strip newline characters and discard procedural rows
     df - Dataframe (it must still have a 'speech_class' column after the drop)
     drop_cols - list of columns to be dropped """

    trimmed = df.drop(columns=drop_cols)
    # Newline characters are deleted outright (not replaced by a space)
    without_newlines = trimmed.replace('\n', '', regex=True)
    # Keep every row that is not a Procedural speech
    not_procedural = without_newlines['speech_class'] != "Procedural"
    return without_newlines[not_procedural]

def preprocess(text_col):
    """The function will apply NLP preprocessing over a pandas series of strings and return the cleaned series.
       The steps, applied to every text in this order, are: converting text to lowercase, removing emojis,
       expanding contractions, removing punctuation, removing numbers (anything that is not an ASCII letter),
       removing stopwords, lemmatization and removing words shorter than three characters.
       text_col - pandas series of strings"""

    additional_stopwords = ["thing","give","try","look","therefore","go","hon","use","health"]

    # Everything below is built once instead of once per text (or per word):
    # - a set gives constant-time stopword lookups; 'not' is deliberately left out of it,
    #   so it stays in the text. The set is not called `stopwords` to avoid shadowing
    #   the name imported from nltk.corpus.
    # - one translation table deletes every punctuation character
    # - one lemmatizer instance is shared by all words
    stop_words = (set(nltk.corpus.stopwords.words('english')) | set(additional_stopwords)) - {'not'}
    punctuation_table = str.maketrans('', '', string.punctuation)
    lemmatizer = WordNetLemmatizer()

    def clean(text):
        # convert to lowercase (splitting and re-joining also collapses runs of whitespace)
        text = ' '.join(w.lower() for w in text.split())
        # remove emojis
        text = demoji.replace(text, "")
        # expand contractions
        text = ' '.join(contractions.fix(word) for word in text.split())
        # remove punctuation
        text = text.translate(punctuation_table)
        # remove numbers
        words = re.sub("[^a-zA-Z]+", " ", text).split()
        # remove stopwords
        words = [w for w in words if w not in stop_words]
        # lemmatization
        words = [lemmatizer.lemmatize(w) for w in words]
        # remove short words
        return ' '.join(w for w in words if len(w) >= 3)

    return text_col.apply(clean)

def extract_process_data(df, date_from, date_to):
    """ The function extracts the speeches of a date window and adds their pre-processed text
     df - Dataframe of Hansard speeches
     date_from - starting date
     date_to - end date
     Returns a new dataframe with the unused columns dropped and an extra
     'speech_processed' column; the input dataframe is not modified. """

    drop_cols = ['id', 'display_as', 'party', 'constituency', 'mnis_id', 'time', 'colnum', 'oral_heading', 'year', 'hansard_membership_id',
            'speakerid', 'person_id', 'speakername', 'url']

    # Extracting data for a year for analysis
    df_extract = extract_data_datewise(df, date_from, date_to)
    # Dropping columns. The explicit copy detaches the result from the filtered
    # slice, so adding a column below does not raise SettingWithCopyWarning.
    df_extract = drop_columns(df_extract, drop_cols).copy()

    # Pre processing the speech of the MP's
    df_extract['speech_processed'] = preprocess(df_extract['speech'].astype(str))

    return df_extract

In [11]:
# Reading the Hansard data from disk (download_read_data only reads the file; it does not fetch anything)
df = download_read_data()
df.head()

Unnamed: 0,id,speech,display_as,party,constituency,mnis_id,date,time,colnum,speech_class,major_heading,minor_heading,oral_heading,year,hansard_membership_id,speakerid,person_id,speakername,url
0,uk.org.publicwhip/debate/1979-05-09a.1.1,"The House being met; and, it being the first d...",Unknown,,,,1979-05-09,,1,Procedural,Preamble,,,1979,,,,Unknown,
1,uk.org.publicwhip/debate/1979-05-09a.1.2,Several of the Members repaired to their Seats.,Unknown,,,,1979-05-09,,1,Procedural,Preamble,,,1979,,,,Unknown,
2,uk.org.publicwhip/debate/1979-05-09a.1.3,"JOHN PARKER, ESQUIRE, took the Chair, pursuant...",Unknown,,,,1979-05-09,,1,Procedural,Preamble,,,1979,,,,Unknown,
3,uk.org.publicwhip/debate/1979-05-09a.1.4,Message to attend the Lords Commissioners;,Unknown,,,,1979-05-09,,1,Procedural,Preamble,,,1979,,,,Unknown,
4,uk.org.publicwhip/debate/1979-05-09a.1.5,The House went; and a Commission having been r...,Unknown,,,,1979-05-09,,1,Procedural,Preamble,,,1979,,,,Unknown,


In [12]:
## Extracting data for the year 2020-2021 for analysis
## NOTE(review): extract_data_datewise keeps both end dates, so speeches from 2020-04-29
## are also part of the 2019-2020 extract - confirm that the one-day overlap is intended.
df_2021 = extract_process_data(df, '2020-04-29', '2021-04-29')
df_2021.head()

Unnamed: 0,speech,date,speech_class,major_heading,minor_heading,speech_processed
2635210,What assessment her Department has made of the...,2020-04-29,Speech,International Development,Covid-19: Developing Countries,assessment department made effect covid pandem...
2635211,I would first like to put on record my congrat...,2020-04-29,Speech,International Development,Covid-19: Developing Countries,would first like put record congratulation hou...
2635212,The United Nations has warned that the world i...,2020-04-29,Speech,International Development,Covid-19: Developing Countries,united nation warned world risk widespread fam...
2635213,Coronavirus is a global crisis that knows no b...,2020-04-29,Speech,International Development,Covid-19: Developing Countries,coronavirus global crisis know border profound...
2635214,"During the coronavirus pandemic, it is imperat...",2020-04-29,Speech,International Development,Covid-19: Developing Countries,coronavirus pandemic imperative country commun...


In [13]:
# Counting the missing values in each column of the 2020-2021 extract
df_2021.isna().sum()

speech                  0
date                    0
speech_class            0
major_heading           0
minor_heading       33710
speech_processed        0
dtype: int64

In [14]:
## Extracting data for the year 2019-2020 for analysis
## NOTE(review): the end date 2020-04-29 is included here and is also the first day
## of the 2020-2021 extract - confirm that the one-day overlap is intended.
df_2020 = extract_process_data(df, '2019-04-29', '2020-04-29')
df_2020.head()

Unnamed: 0,speech,date,speech_class,major_heading,minor_heading,speech_processed
2581635,What progress has been made on the Timpson rev...,2019-04-29,Speech,EDUCATION,School Exclusions: Timpson Review,progress made timpson review school exclusion
2581636,What progress has been made on the Timpson rev...,2019-04-29,Speech,EDUCATION,School Exclusions: Timpson Review,progress made timpson review school exclusion
2581637,What progress has been made on the Timpson rev...,2019-04-29,Speech,EDUCATION,School Exclusions: Timpson Review,progress made timpson review school exclusion
2581638,I am very grateful to Edward Timpson for the t...,2019-04-29,Speech,EDUCATION,School Exclusions: Timpson Review,grateful edward timpson thorough work leading ...
2581639,The all-party parliamentary group on knife cri...,2019-04-29,Speech,EDUCATION,School Exclusions: Timpson Review,allparty parliamentary group knife crime chair...


In [18]:
## Saving both processed extracts as pickle files under ./data
## (save_pickle_file is imported from utils.utils; its implementation is not shown in this notebook)
filename_2021 = './data/hansard-speeches-processed-post2021.pkl'
filename_2020 = './data/hansard-speeches-processed-post2020.pkl'

save_pickle_file(df_2021, filename_2021)
save_pickle_file(df_2020, filename_2020)