In [2]:
import pandas as pd
import nltk

In [3]:
# Load the title/tag dataset produced by an earlier step into a DataFrame.
dataset_path = '../../output_data/1_3_tc_dataset.json'
df = pd.read_json(dataset_path)

In [4]:
# Summarise the freshly loaded frame: row count, column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25763 entries, 0 to 25762
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   title   25763 non-null  object
 1   tag     25763 non-null  object
dtypes: object(2)
memory usage: 402.7+ KB


In [5]:
# Preview the first four rows to see what the `title` and `tag` columns contain.
df.head(4)

Unnamed: 0,title,tag
0,My personal ranking of Community's use of alte...,COMMUNITY
1,It took 30 years for climate tech investments ...,ENVIRONMENT
2,Rain when sitting in a car,RELAX
3,Is Cassis worth staying in?,TRAVEL


In [6]:
# Normalise the titles in two stages:
#   s1_title_lower - the title converted to lower case
#   s2_clean_title - s1 with every character other than a-z, 0-9 and whitespace removed

import re

# Matches any single character that is not a lower-case ASCII letter, a digit or whitespace.
disallowed_chars = re.compile(r'[^a-z0-9\s]')


def lowercase_title(text):
    """Return `text` converted to lower case."""
    return text.lower()


def strip_disallowed(text):
    """Return `text` with every character matched by `disallowed_chars` deleted."""
    # Matches are deleted, not replaced by a space, so pieces joined only by
    # punctuation are merged into one word (e.g. "community's" -> "communitys").
    return disallowed_chars.sub('', text)


df['s1_title_lower'] = df['title'].apply(lowercase_title)
df['s2_clean_title'] = df['s1_title_lower'].apply(strip_disallowed)

df.head(5)

Unnamed: 0,title,tag,s1_title_lower,s2_clean_title
0,My personal ranking of Community's use of alte...,COMMUNITY,my personal ranking of community's use of alte...,my personal ranking of communitys use of alter...
1,It took 30 years for climate tech investments ...,ENVIRONMENT,it took 30 years for climate tech investments ...,it took 30 years for climate tech investments ...
2,Rain when sitting in a car,RELAX,rain when sitting in a car,rain when sitting in a car
3,Is Cassis worth staying in?,TRAVEL,is cassis worth staying in?,is cassis worth staying in
4,New Research Suggests Some Black Holes May Act...,ASTRONOMY,new research suggests some black holes may act...,new research suggests some black holes may act...


In [7]:
# Tokenization: split each cleaned title into a list of word tokens.

from nltk import word_tokenize

# word_tokenize relies on the Punkt tokenizer data. Older NLTK releases read the
# 'punkt' package, while NLTK 3.9+ reads 'punkt_tab' and raises LookupError if
# it is missing, so both are fetched to keep the cell runnable on either version.
nltk.download('punkt')
nltk.download('punkt_tab')

# Passing the function directly is equivalent to wrapping it in a lambda.
df['s3_tokenized'] = df['s2_clean_title'].apply(word_tokenize)
df.head(5)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\reosk\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


Unnamed: 0,title,tag,s1_title_lower,s2_clean_title,s3_tokenized
0,My personal ranking of Community's use of alte...,COMMUNITY,my personal ranking of community's use of alte...,my personal ranking of communitys use of alter...,"[my, personal, ranking, of, communitys, use, o..."
1,It took 30 years for climate tech investments ...,ENVIRONMENT,it took 30 years for climate tech investments ...,it took 30 years for climate tech investments ...,"[it, took, 30, years, for, climate, tech, inve..."
2,Rain when sitting in a car,RELAX,rain when sitting in a car,rain when sitting in a car,"[rain, when, sitting, in, a, car]"
3,Is Cassis worth staying in?,TRAVEL,is cassis worth staying in?,is cassis worth staying in,"[is, cassis, worth, staying, in]"
4,New Research Suggests Some Black Holes May Act...,ASTRONOMY,new research suggests some black holes may act...,new research suggests some black holes may act...,"[new, research, suggests, some, black, holes, ..."


In [8]:
# Remove stopwords: drop tokens that appear in NLTK's English stopword list.
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')

# Kept as a list under its original name so any other cell using it still works.
stop_words = stopwords.words('english')
# Set copy used for the filtering below: membership tests on a set are O(1),
# whereas `in` on the list scans it for every token of every title.
stop_word_set = frozenset(stop_words)


def drop_stopwords(word_list):
    """Return the tokens of `word_list` that are not stopwords, in their original order."""
    return [word for word in word_list if word not in stop_word_set]


df['s4_no_stopwords'] = df['s3_tokenized'].apply(drop_stopwords)
df.head(5)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\reosk\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


Unnamed: 0,title,tag,s1_title_lower,s2_clean_title,s3_tokenized,s4_no_stopwords
0,My personal ranking of Community's use of alte...,COMMUNITY,my personal ranking of community's use of alte...,my personal ranking of communitys use of alter...,"[my, personal, ranking, of, communitys, use, o...","[personal, ranking, communitys, use, alternati..."
1,It took 30 years for climate tech investments ...,ENVIRONMENT,it took 30 years for climate tech investments ...,it took 30 years for climate tech investments ...,"[it, took, 30, years, for, climate, tech, inve...","[took, 30, years, climate, tech, investments, ..."
2,Rain when sitting in a car,RELAX,rain when sitting in a car,rain when sitting in a car,"[rain, when, sitting, in, a, car]","[rain, sitting, car]"
3,Is Cassis worth staying in?,TRAVEL,is cassis worth staying in?,is cassis worth staying in,"[is, cassis, worth, staying, in]","[cassis, worth, staying]"
4,New Research Suggests Some Black Holes May Act...,ASTRONOMY,new research suggests some black holes may act...,new research suggests some black holes may act...,"[new, research, suggests, some, black, holes, ...","[new, research, suggests, black, holes, may, a..."


In [9]:
# Lemmatize: map every remaining token to its WordNet lemma.
import nltk

from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()


def lemmatize_tokens(word_list):
    """Return a new list holding the lemma of each token in `word_list`."""
    # lemmatize() is called without a part-of-speech argument, so it uses its
    # default (noun): plurals are reduced ("years" -> "year") while verb forms
    # such as "suggests" come back unchanged.
    return list(map(lemmatizer.lemmatize, word_list))


df['s5_lemmatized'] = df['s4_no_stopwords'].apply(lemmatize_tokens)
df.head(5)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\reosk\AppData\Roaming\nltk_data...


Unnamed: 0,title,tag,s1_title_lower,s2_clean_title,s3_tokenized,s4_no_stopwords,s5_lemmatized
0,My personal ranking of Community's use of alte...,COMMUNITY,my personal ranking of community's use of alte...,my personal ranking of communitys use of alter...,"[my, personal, ranking, of, communitys, use, o...","[personal, ranking, communitys, use, alternati...","[personal, ranking, community, use, alternativ..."
1,It took 30 years for climate tech investments ...,ENVIRONMENT,it took 30 years for climate tech investments ...,it took 30 years for climate tech investments ...,"[it, took, 30, years, for, climate, tech, inve...","[took, 30, years, climate, tech, investments, ...","[took, 30, year, climate, tech, investment, pa..."
2,Rain when sitting in a car,RELAX,rain when sitting in a car,rain when sitting in a car,"[rain, when, sitting, in, a, car]","[rain, sitting, car]","[rain, sitting, car]"
3,Is Cassis worth staying in?,TRAVEL,is cassis worth staying in?,is cassis worth staying in,"[is, cassis, worth, staying, in]","[cassis, worth, staying]","[cassis, worth, staying]"
4,New Research Suggests Some Black Holes May Act...,ASTRONOMY,new research suggests some black holes may act...,new research suggests some black holes may act...,"[new, research, suggests, some, black, holes, ...","[new, research, suggests, black, holes, may, a...","[new, research, suggests, black, hole, may, ac..."


In [10]:
# Write the whole frame (original columns plus every preprocessing stage) to disk
# as an indented JSON array with one object per row.
output_path = '../../output_data/2_tc_nltk_preprocessed.json'
df.to_json(output_path, orient='records', indent=2)

In [11]:
# Final check of the frame after preprocessing: lists the original columns and the added s1-s5 stage columns.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25763 entries, 0 to 25762
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   title            25763 non-null  object
 1   tag              25763 non-null  object
 2   s1_title_lower   25763 non-null  object
 3   s2_clean_title   25763 non-null  object
 4   s3_tokenized     25763 non-null  object
 5   s4_no_stopwords  25763 non-null  object
 6   s5_lemmatized    25763 non-null  object
dtypes: object(7)
memory usage: 1.4+ MB
