This notebook follows the Exploratory Data Analysis (EDA) steps, where the data has been cleaned, analysed, and saved into an HDF5 file (eda_processed_data.h5). The next step in the process is vectorisation, where the textual data will be transformed into a numerical format suitable for machine learning models.

# Import Necessary Libraries

In [9]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import h5py
import os
import pickle

# Load the EDA Processed Data

In [10]:
# Load the cleaned DataFrame from the EDA step
df = pd.read_hdf('../data/processed/eda_processed_data.h5', key='df')

# Display basic information about the DataFrame to ensure it's loaded correctly
print(df.info())



<class 'pandas.core.frame.DataFrame'>
Index: 72095 entries, 0 to 72133
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   title            72095 non-null  object 
 1   text             72095 non-null  object 
 2   label            72095 non-null  int64  
 3   combined_text    72095 non-null  object 
 4   cleaned_text     72095 non-null  object 
 5   text_length      72095 non-null  int64  
 6   log_text_length  72095 non-null  float64
dtypes: float64(1), int64(2), object(4)
memory usage: 4.4+ MB
None


In [11]:
df.head()

Unnamed: 0,title,text,label,combined_text,cleaned_text,text_length,log_text_length
0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,law enforcement high alert following threats c...,3311,8.105308
1,No Title,Did they post their votes for Hillary already?,1,No Title Did they post their votes for Hillary...,title post vote hillary already,31,3.465736
2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,unbelievable obamas attorney general says char...,277,5.627621
3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0,"Bobby Jindal, raised Hindu, uses story of Chri...",bobby jindal raised hindu us story christian c...,5611,8.632662
4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1,SATAN 2: Russia unvelis an image of its terrif...,satan russia unvelis image terrifying new supe...,1388,7.236339


# Vectorisation Using TF-IDF

In [12]:
# Load the tokeniser
with open('../data/processed/tokenizer.pkl', 'rb') as f:
    tokenizer = pickle.load(f)

# Get the word counts (how many times each word appears)
word_counts = tokenizer.word_counts

# Sort words by frequency
sorted_word_counts = sorted(word_counts.items(), key=lambda x: x[1], reverse=True)

# Calculate the cumulative coverage
cumulative_coverage = []
cumulative_count = 0
total_word_count = sum(word_counts.values())

for word, count in sorted_word_counts:
    cumulative_count += count
    cumulative_coverage.append(cumulative_count / total_word_count)

# Calculate num_words for 95% coverage
coverage_95 = next(i for i, coverage in enumerate(cumulative_coverage) if coverage >= 0.95) + 1

print(f"Number of words covering 95% of the dataset: {coverage_95}")


Number of words covering 95% of the dataset: 24016


In [13]:
# Initialise the TF-IDF vectoriser
vectorizer = TfidfVectorizer(max_features=coverage_95)

# Fit the vectoriser on the cleaned text data and transform the text into numerical vectors
X = vectorizer.fit_transform(df['cleaned_text'])

# Display the shape of the resulting matrix
print(f"TF-IDF matrix shape: {X.shape}")


TF-IDF matrix shape: (72095, 24016)


In [14]:
print(X)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 14011401 stored elements and shape (72095, 24016)>
  Coords	Values
  (0, 12161)	0.019370036721062953
  (0, 7083)	0.02949693199829052
  (0, 9936)	0.02406240763969145
  (0, 590)	0.040712493800785995
  (0, 8337)	0.02450899386999259
  (0, 21589)	0.053509902853638734
  (0, 4669)	0.047300273891901674
  (0, 23506)	0.05312875833462724
  (0, 2965)	0.03751438571910205
  (0, 2276)	0.10475617682262549
  (0, 21420)	0.04725778319544691
  (0, 22994)	0.03835636737844158
  (0, 4105)	0.02144314913991036
  (0, 7572)	0.02535852767633391
  (0, 1812)	0.024378653458474744
  (0, 14712)	0.019096323007148988
  (0, 13365)	0.04215941540411928
  (0, 13967)	0.08386012954008269
  (0, 3033)	0.056420258884790485
  (0, 12762)	0.11368256675782971
  (0, 9593)	0.04299750377866516
  (0, 23500)	0.1634625276400689
  (0, 15699)	0.11814999116244773
  (0, 4662)	0.15308411829605043
  (0, 7038)	0.03839304903211266
  :	:
  (72094, 6675)	0.04096088366767763
  (72094, 210

In [18]:
df.head()

Unnamed: 0,title,text,label,combined_text,cleaned_text,text_length,log_text_length
0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,law enforcement high alert following threats c...,3311,8.105308
1,No Title,Did they post their votes for Hillary already?,1,No Title Did they post their votes for Hillary...,title post vote hillary already,31,3.465736
2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,unbelievable obamas attorney general says char...,277,5.627621
3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0,"Bobby Jindal, raised Hindu, uses story of Chri...",bobby jindal raised hindu us story christian c...,5611,8.632662
4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1,SATAN 2: Russia unvelis an image of its terrif...,satan russia unvelis image terrifying new supe...,1388,7.236339


In [17]:
# Convert the TF-IDF matrix to a dense format
X_dense = X.toarray()

# Create the directory if it does not exist
os.makedirs('../data/vectorised', exist_ok=True)

# Save the TF-IDF matrix in HDF5 format
with h5py.File('../data/vectorised/tfidf_vectorised_data.h5', 'w') as f:
    f.create_dataset('X_tfidf', data=X_dense)

# Save the corresponding DataFrame with other features and labels
df.to_hdf('../data/vectorised/vectorised_data_df.h5', key='df', mode='w')

print("Vectorised data has been saved to '../data/vectorised/'.")

Vectorised data has been saved to '../data/vectorised/'.
