In [1]:
import zipfile
from google.colab import drive

drive.mount('/content/drive/')

# zip_ref = zipfile.ZipFile("/content/drive/My Drive/dataset-updated.zip", 'r')
# zip_ref.extractall("/tmp")
# zip_ref.close()

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [2]:
## for data
import pandas as pd  #(0.25.1)
import numpy  #(1.16.4)
## for plotting
import matplotlib.pyplot as plt  #(3.1.2)
import seaborn as sns  #(0.9.0)
## for preprocessing
import re
import nltk  #(3.4.5)
import contractions  #(0.0.18)
## for textrank
import gensim  #(3.8.1)
## for evaluation
import rouge  #(1.0.0)
import difflib
## for seq2seq
from tensorflow.keras import callbacks, models, layers, preprocessing as kprocessing #(2.6.0)
## for bart
import transformers  #(3.0.1)
import os
import glob
from tqdm import tqdm

In [None]:
dataset_path = "/content/drive/MyDrive/dataset-updated"

In [None]:
categories = os.listdir(dataset_path)
#categories

In [None]:

print("Reading the data")
wiki_data = { "Title": [] , "Article" : [] , "Category" : []  , "Summary" : []}
for category in tqdm(categories):
    article_path = dataset_path+"/"+category+"/Article/"
    summary_path = dataset_path+"/"+category+"/Summary/"

    for file in glob.glob(article_path+"/*.txt"):
        file_name = os.path.basename(file)
        title = file_name.replace(".txt","")

        with open(file,'r') as f:        
            wiki_data["Article"].append(f.read())
            wiki_data["Category"].append(category)

        with open(summary_path+file_name,'r') as f:        
            wiki_data["Summary"].append(f.read())
            wiki_data["Title"].append(title)


Reading the data


100%|██████████| 9/9 [25:12<00:00, 168.04s/it]


In [None]:
print("Creating the dataframe")
wikiDF = pd.DataFrame( data = wiki_data )
wikiDF.reset_index(drop=True, inplace=True)

Creating the dataframe


In [None]:
print("Saving the data in CSV")
wikiDF.to_csv("Wikipedia-Level-4-Articles.csv")

Saving the data in CSV


In [3]:
data = pd.read_csv('/content/drive/MyDrive/Wikipedia-Level-4-Articles.csv')

In [12]:
data.head(5)

Unnamed: 0.1,Unnamed: 0,Title,Article,Category,Summary,CleanedText
0,0,Ecumenism,"Ecumenism (), also spelled oecumenism, is the...",Philosophy_and_religion,"Ecumenism (), also spelled oecumenism, is the...",ecumen also spell oecumen concept principl chr...
1,1,Eastern Christianity,Eastern Christianity comprises Christian tradi...,Philosophy_and_religion,Eastern Christianity comprises Christian tradi...,eastern christian compris christian tradit chu...
2,2,Eastern Orthodox Church,"The Eastern Orthodox Church, also called the O...",Philosophy_and_religion,"The Eastern Orthodox Church, also called the O...",eastern orthodox church also call orthodox chu...
3,3,Eastern Catholic Churches,The Eastern Catholic Churches or Oriental Cath...,Philosophy_and_religion,The Eastern Catholic Churches or Oriental Cath...,eastern cathol church orient cathol church als...
4,4,Druze,"The Druze (; Arabic: دَرْزِيٌّ, darzī or Arabi...",Philosophy_and_religion,"The Druze (; Arabic: دَرْزِيٌّ, darzī or Arabi...",druze arab درزي darzī arab درزي durzī pl دروز ...


In [5]:
import nltk
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [6]:
## create stopwords
lst_stopwords = nltk.corpus.stopwords.words("english")
## add words that are too frequent
lst_stopwords = lst_stopwords + ["cnn","say","said","new"]

## cleaning function
def utils_preprocess_text(txt):
    punkt=True
    lower=True
    slang=True
    lst_stopword=True
    stemm=True
    lemm=True
    ### separate sentences with '. '
    txt = re.sub(r'\.(?=[^ \W\d])', '. ', str(txt))
    ### remove punctuations and characters
    txt = re.sub(r'[^\w\s]', '', txt) if punkt is True else txt
    ### strip
    txt = " ".join([word.strip() for word in txt.split()])
    ### lowercase
    txt = txt.lower() if lower is True else txt
    ### slang
    txt = contractions.fix(txt) if slang is True else txt   
    ### tokenize (convert from string to list)
    lst_txt = txt.split()
    ### stemming (remove -ing, -ly, ...)
    if stemm is True:
        ps = nltk.stem.porter.PorterStemmer()
        lst_txt = [ps.stem(word) for word in lst_txt]
    ### lemmatization (convert the word into root word)
    if lemm is True:
        lem = nltk.stem.wordnet.WordNetLemmatizer()
        lst_txt = [lem.lemmatize(word) for word in lst_txt]
    ### remove Stopwords
    if lst_stopwords is not None:
        lst_txt = [word for word in lst_txt if word not in 
                   lst_stopwords]
    ### back to string
    txt = " ".join(lst_txt)
    return txt

In [7]:
text = data.iloc[0]["Article"]

In [8]:

print("Cleaning the data")
data["CleanedText"] = tqdm(data["Article"].apply(utils_preprocess_text))


Cleaning the data


100%|██████████| 7869/7869 [00:00<00:00, 708839.36it/s]


In [9]:
print("Saving the cleaned data in CSV")
data.to_csv("Wikipedia-Level-4-Articles-Cleaned.csv")

In [10]:
cleaned_data = pd.read_csv('/content/Wikipedia-Level-4-Articles-Cleaned.csv')

In [13]:
cleaned_data.drop(['Unnamed: 0.1', 'Unnamed: 0'], axis=1)

Unnamed: 0,Title,Article,Category,Summary,CleanedText
0,Ecumenism,"Ecumenism (), also spelled oecumenism, is the...",Philosophy_and_religion,"Ecumenism (), also spelled oecumenism, is the...",ecumen also spell oecumen concept principl chr...
1,Eastern Christianity,Eastern Christianity comprises Christian tradi...,Philosophy_and_religion,Eastern Christianity comprises Christian tradi...,eastern christian compris christian tradit chu...
2,Eastern Orthodox Church,"The Eastern Orthodox Church, also called the O...",Philosophy_and_religion,"The Eastern Orthodox Church, also called the O...",eastern orthodox church also call orthodox chu...
3,Eastern Catholic Churches,The Eastern Catholic Churches or Oriental Cath...,Philosophy_and_religion,The Eastern Catholic Churches or Oriental Cath...,eastern cathol church orient cathol church als...
4,Druze,"The Druze (; Arabic: دَرْزِيٌّ, darzī or Arabi...",Philosophy_and_religion,"The Druze (; Arabic: دَرْزِيٌّ, darzī or Arabi...",druze arab درزي darzī arab درزي durzī pl دروز ...
...,...,...,...,...,...
7864,RomanΓÇôPersian Wars,"The Roman–Persian Wars, also known as the Roma...",History,"The Roman–Persian Wars, also known as the Roma...",romanpersian war also known romaniranian war s...
7865,Sabaeans,"The Sabaeans or Sabeans (Sabaean: 𐩪𐩨𐩱, S¹Bʾ; A...",History,"The Sabaeans or Sabeans (Sabaean: 𐩪𐩨𐩱, S¹Bʾ; A...",sabaean sabean sabaean 𐩪𐩨𐩱 s¹bʾ arab ٱلسبئيون ...
7866,Russian Civil War,The Russian Civil War (Russian: Гражданская во...,History,The Russian Civil War (Russian: Гражданская во...,russian civil war russian гражданская война в ...
7867,Russian Revolution,The Russian Revolution was a period of politic...,History,The Russian Revolution was a period of politic...,russian revolut wa period polit social revolut...


In [14]:
print("Saving the cleaned data in CSV")
cleaned_data.to_csv("Cleaned_Data.csv")

Saving the cleaned data in CSV
