In [20]:
# Import libraries
import json
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [None]:
with open('../data/w2/news_dataset.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

In [22]:
news_df = pd.DataFrame(data)

In [23]:
test_df = news_df[['id', 'content']].head(2)

In [24]:
# Bag of Words (Unigram)
# Using CountVectorizer
count_vectorizer_unigram = CountVectorizer(ngram_range=(1, 1))
bow_unigram = count_vectorizer_unigram.fit_transform(test_df['content'])

unigram_feature_names = count_vectorizer_unigram.get_feature_names_out()
unigram_bow_df = pd.DataFrame(bow_unigram.toarray(), columns=unigram_feature_names)

print("Bag of Words (Unigram) - testing df")
unigram_bow_df

Bag of Words (Unigram) - testing df


Unnamed: 0,024,0903,12h30,198,2014,2025,2372,263,29,2sao,...,địa,định,đọc,đốc,đối,đồng,độc,động,đủ,đức
0,1,1,1,1,1,0,1,1,1,2,...,1,0,1,2,7,3,1,2,0,1
1,0,0,0,0,0,2,0,0,0,0,...,0,1,0,0,1,0,0,2,1,0


In [25]:
# Bag of Words (Bigram)
# Ussing CountVectorizer, ngram_range=(2, 2)
count_vectorizer_bigram = CountVectorizer(ngram_range=(2, 2))
bow_bigram = count_vectorizer_bigram.fit_transform(test_df['content'])

bigram_feature_names = count_vectorizer_bigram.get_feature_names_out()
bigram_bow_df = pd.DataFrame(bow_bigram.toarray(), columns=bigram_feature_names)

print("Bag of Words (Bigram) - testing df")
bigram_bow_df

Bag of Words (Bigram) - testing df


Unnamed: 0,024 666,0903 263,12h30 ngày,198 email,2014 sđt,2025 viện,2025 điều,2372 gp,263 198,29 2014,...,đồng bộ,đồng chí,đồng hạ,độc giả,động muốn,động thế,động viên,động vì,đủ các,đức và
0,1,1,1,1,1,0,0,1,1,1,...,1,1,1,1,1,0,1,0,0,1
1,0,0,0,0,0,1,1,0,0,0,...,0,0,0,0,0,1,0,1,1,0


In [26]:
# TF-IDF (Unigram)
# Using TfidfVectorizer
tfidf_vectorizer_unigram = TfidfVectorizer(ngram_range=(1, 1))
tfidf_unigram = tfidf_vectorizer_unigram.fit_transform(test_df['content'])

unigram_tfidf_feature_names = tfidf_vectorizer_unigram.get_feature_names_out()
unigram_tfidf_df = pd.DataFrame(tfidf_unigram.toarray(), columns=unigram_tfidf_feature_names)

print("TF-IDF (Unigram) - testing df")
unigram_tfidf_df

TF-IDF (Unigram) - testing df


Unnamed: 0,024,0903,12h30,198,2014,2025,2372,263,29,2sao,...,địa,định,đọc,đốc,đối,đồng,độc,động,đủ,đức
0,0.026225,0.026225,0.026225,0.026225,0.026225,0.0,0.026225,0.026225,0.026225,0.05245,...,0.026225,0.0,0.026225,0.05245,0.130614,0.078675,0.026225,0.037318,0.0,0.026225
1,0.0,0.0,0.0,0.0,0.0,0.076517,0.0,0.0,0.0,0.0,...,0.0,0.038259,0.0,0.0,0.027221,0.0,0.0,0.054443,0.038259,0.0


In [27]:
# TF-IDF (Bigram)
# Using TfidfVectorizer, ngram_range=(2, 2)
tfidf_vectorizer_bigram = TfidfVectorizer(ngram_range=(2, 2))
tfidf_bigram = tfidf_vectorizer_bigram.fit_transform(test_df['content'])

bigram_tfidf_feature_names = tfidf_vectorizer_bigram.get_feature_names_out()
bigram_tfidf_df = pd.DataFrame(tfidf_bigram.toarray(), columns=bigram_tfidf_feature_names)

print("TF-IDF (Bigram) - testing df")
bigram_tfidf_df

TF-IDF (Bigram) - testing df


Unnamed: 0,024 666,0903 263,12h30 ngày,198 email,2014 sđt,2025 viện,2025 điều,2372 gp,263 198,29 2014,...,đồng bộ,đồng chí,đồng hạ,độc giả,động muốn,động thế,động viên,động vì,đủ các,đức và
0,0.033547,0.033547,0.033547,0.033547,0.033547,0.0,0.0,0.033547,0.033547,0.033547,...,0.033547,0.033547,0.033547,0.033547,0.033547,0.0,0.033547,0.0,0.0,0.033547
1,0.0,0.0,0.0,0.0,0.0,0.048706,0.048706,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.048706,0.0,0.048706,0.048706,0.0


In [28]:
# Apply for all data set
# Bag of Words (Unigram)
bow_unigram_full = count_vectorizer_unigram.transform(test_df['content'])
bow_unigram_full_df = pd.DataFrame(bow_unigram_full.toarray(), columns=unigram_feature_names)

# Bag of Words (Bigram)
bow_bigram_full = count_vectorizer_bigram.transform(test_df['content'])
bow_bigram_full_df = pd.DataFrame(bow_bigram_full.toarray(), columns=bigram_feature_names)

# TF-IDF (Unigram)
tfidf_unigram_full = tfidf_vectorizer_unigram.transform(test_df['content'])
tfidf_unigram_full_df = pd.DataFrame(tfidf_unigram_full.toarray(), columns=unigram_tfidf_feature_names)

# TF-IDF (Bigram)
tfidf_bigram_full = tfidf_vectorizer_bigram.transform(test_df['content'])
tfidf_bigram_full_df = pd.DataFrame(tfidf_bigram_full.toarray(), columns=bigram_tfidf_feature_names)

In [29]:
bow_unigram_full_df.head(3)

Unnamed: 0,024,0903,12h30,198,2014,2025,2372,263,29,2sao,...,địa,định,đọc,đốc,đối,đồng,độc,động,đủ,đức
0,1,1,1,1,1,0,1,1,1,2,...,1,0,1,2,7,3,1,2,0,1
1,0,0,0,0,0,2,0,0,0,0,...,0,1,0,0,1,0,0,2,1,0


In [30]:
bow_bigram_full_df.head(3)

Unnamed: 0,024 666,0903 263,12h30 ngày,198 email,2014 sđt,2025 viện,2025 điều,2372 gp,263 198,29 2014,...,đồng bộ,đồng chí,đồng hạ,độc giả,động muốn,động thế,động viên,động vì,đủ các,đức và
0,1,1,1,1,1,0,0,1,1,1,...,1,1,1,1,1,0,1,0,0,1
1,0,0,0,0,0,1,1,0,0,0,...,0,0,0,0,0,1,0,1,1,0


In [31]:
tfidf_unigram_full_df.head(3)

Unnamed: 0,024,0903,12h30,198,2014,2025,2372,263,29,2sao,...,địa,định,đọc,đốc,đối,đồng,độc,động,đủ,đức
0,0.026225,0.026225,0.026225,0.026225,0.026225,0.0,0.026225,0.026225,0.026225,0.05245,...,0.026225,0.0,0.026225,0.05245,0.130614,0.078675,0.026225,0.037318,0.0,0.026225
1,0.0,0.0,0.0,0.0,0.0,0.076517,0.0,0.0,0.0,0.0,...,0.0,0.038259,0.0,0.0,0.027221,0.0,0.0,0.054443,0.038259,0.0


In [32]:
tfidf_bigram_full_df.head(3)

Unnamed: 0,024 666,0903 263,12h30 ngày,198 email,2014 sđt,2025 viện,2025 điều,2372 gp,263 198,29 2014,...,đồng bộ,đồng chí,đồng hạ,độc giả,động muốn,động thế,động viên,động vì,đủ các,đức và
0,0.033547,0.033547,0.033547,0.033547,0.033547,0.0,0.0,0.033547,0.033547,0.033547,...,0.033547,0.033547,0.033547,0.033547,0.033547,0.0,0.033547,0.0,0.0,0.033547
1,0.0,0.0,0.0,0.0,0.0,0.048706,0.048706,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.048706,0.0,0.048706,0.048706,0.0


In [33]:
# # save to csv
# bow_unigram_full_df.to_csv('bow_unigram_full.csv', index=False)
# bow_bigram_full_df.to_csv('bow_bigram_full.csv', index=False)
# tfidf_unigram_full_df.to_csv('tfidf_unigram_full.csv', index=False)
# tfidf_bigram_full_df.to_csv('tfidf_bigram_full.csv', index=False)