<a href="https://colab.research.google.com/github/rosangelahs/national_news1/blob/main/Data_cleaning_and_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Environment setup for the cleaning notebook.
# Mounts Google Drive at /content/drive (Colab-only API); a later cell reads
# the scraped dataset from a path under drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')
import pandas as pd
import numpy as np
import re
import unicodedata
import string
import spacy
import nltk.corpus
# NLTK resources are fetched at runtime. 'stopwords' backs the
# stopwords.words('spanish') call in the stop-word cell.
nltk.download('stopwords')
from nltk.corpus import stopwords
# 'punkt' is presumably required by nltk.word_tokenize in the tokenization
# cell -- TODO confirm against the installed NLTK version.
nltk.download('punkt')
# NOTE(review): 'wordnet' and 'omw-1.4' are downloaded but no code visible in
# this notebook references them (lemmatization is done with spaCy).
nltk.download('wordnet')
nltk.download('omw-1.4')


In [None]:
# Load the scraped articles from the ';'-separated CSV on the mounted drive.
# `error_bad_lines=False` was deprecated in pandas 1.3 and removed in 2.0;
# `on_bad_lines='warn'` is its replacement and keeps the same behaviour:
# malformed rows are skipped and a warning is emitted for each of them.
data = pd.read_csv('drive/MyDrive/nn_data/scraped_data.csv', sep=';', engine='python', on_bad_lines='warn')
nn_df = pd.DataFrame(data)
# Give the four columns fixed names used by every later cell.
nn_df.columns = ['index', 'title', 'new', 'summary']

In [None]:
# Drop rows with missing values.
# DataFrame.dropna returns a new frame (inplace=False is the default), so the
# result must be assigned back; calling it without assignment leaves nn_df
# untouched.
rows_before = len(nn_df)
nn_df = nn_df.dropna(axis=0, how='any')
print(f"Dropped {rows_before - len(nn_df)} rows containing missing values")

In [4]:
# Remove duplicated data
# duplicated().sum() only counts fully identical rows; drop_duplicates() is
# what actually removes them (the first occurrence of each row is kept).
duplicate_rows = int(nn_df.duplicated().sum())
nn_df = nn_df.drop_duplicates()
print(f"Removed {duplicate_rows} duplicated rows")

#Remove none values
# Cells holding the literal string 'None' are turned into NaN so that dropna()
# discards every row containing one.
nn_df = nn_df.replace(to_replace='None', value=np.nan).dropna()

**Data Preprocessing**

In [7]:
#Remove accented characters
def remove_accented_chars(text):
    """Return ``text`` reduced to plain ASCII.

    The string is NFKD-normalized, which splits accented letters into a base
    letter plus combining marks; encoding to ASCII with ``'ignore'`` then
    drops the marks and any other character that has no ASCII form.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8', 'ignore')


# Strip accents from each text column, overwriting the column in place.
for text_column in ('new', 'summary', 'title'):
    nn_df[text_column] = nn_df[text_column].apply(remove_accented_chars)

In [8]:
# Remove punctuation
def remove_punctuation(text):
  """Delete every character that is neither a word character nor whitespace.

  Word characters (``\\w``) include letters, digits and the underscore, so
  underscores are kept.
  """
  not_word_or_space = re.compile(r'[^\w\s]')
  return not_word_or_space.sub('', text)


# Strip punctuation from each text column, overwriting the column in place.
for text_column in ('new', 'summary', 'title'):
    nn_df[text_column] = nn_df[text_column].apply(remove_punctuation)

In [9]:
#Remove unicode characters
def remove_unicode_characters(text):
  """Strip @mentions, URLs, a leading ``rt`` and non-alphanumeric characters.

  The pattern is an alternation, tried left to right at each position:
    1. ``@`` followed by ASCII letters/digits (a mention);
    2. any single character that is not an ASCII letter, digit, space or tab;
    3. ``scheme://...`` up to the next whitespace;
    4. the letters ``rt`` at the very start of the string;
    5. ``http`` plus one following character.

  The first alternative previously read ``@\\[A-Za-z0-9]+``; the escaped
  bracket made it match the literal text "@[A-Za-z0-9]" instead of a mention,
  so only the "@" of a mention was removed (by alternative 2).

  NOTE(review): newlines are not in the keep-set of alternative 2, so words on
  either side of a line break end up concatenated -- confirm this is intended.
  """
  pattern = r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?"
  return re.sub(pattern, "", text)


# Run the mention/URL/non-alphanumeric cleanup over each text column in place.
for text_column in ('new', 'summary', 'title'):
    nn_df[text_column] = nn_df[text_column].apply(remove_unicode_characters)

In [10]:
#Remove numbers
def remove_numbers(text):
    """Remove digits and every other character outside the keep-set.

    Kept characters are ASCII letters, whitespace and the punctuation
    ``. , ! ? / : ; " '``. Everything else, digits included, is deleted.

    The upper-case range is ``A-Z``; it previously read ``A-z``, which also
    spans ``[ \\ ] ^ _`` and the backtick and therefore let them through.
    """
    # define the pattern to keep (negated character class)
    pattern = r'[^a-zA-Z.,!?/:;\"\'\s]'
    return re.sub(pattern, '', text)


# Remove digits (and other disallowed characters) from each text column in place.
for text_column in ('new', 'summary', 'title'):
    nn_df[text_column] = nn_df[text_column].apply(remove_numbers)

In [11]:
# Normalize text(lowercasing)
# Every text column is replaced by its lower-cased version.
for text_column in ('new', 'summary', 'title'):
    nn_df[text_column] = nn_df[text_column].str.lower()

In [12]:
# Remove extra white spaces
def remove_whitespace(text):
    """Collapse each run of whitespace to one space and trim both ends."""
    words = text.split()
    single_spaced = " ".join(words)
    return single_spaced


# Collapse redundant whitespace in each text column, overwriting it in place.
for text_column in ('new', 'summary', 'title'):
    nn_df[text_column] = nn_df[text_column].apply(remove_whitespace)

In [13]:
#Remove stopwords
stop = stopwords.words('spanish')
# The text columns had their accents stripped in an earlier cell, so accented
# stop words (e.g. "también", "más") would never match the raw NLTK list.
# The lookup therefore holds both the original and the accent-stripped form of
# every stop word. A set is used so each membership test is O(1) instead of a
# scan over the whole list.
stop_lookup = set(stop) | {remove_accented_chars(word) for word in stop}


def remove_stopwords(text):
    """Return ``text`` with every whitespace-separated stop word removed."""
    return ' '.join(word for word in text.split() if word not in stop_lookup)


for text_column in ('title', 'new', 'summary'):
    nn_df[text_column] = nn_df[text_column].apply(remove_stopwords)

In [None]:
# Lemmatization
!python -m spacy download es_core_news_sm
nlp = spacy.load('es_core_news_sm')
nn_df["title"] = nn_df['title'].apply(lambda x: " ".join([y.lemma_ for y in nlp(x)]))
nn_df["new"] = nn_df['new'].apply(lambda x: " ".join([y.lemma_ for y in nlp(x)]))
nn_df["summary"] = nn_df['summary'].apply(lambda x: " ".join([y.lemma_ for y in nlp(x)]))


In [17]:
#Tokenization
# Each cleaned article and summary is split into a list of word tokens.
# The tokenizer only needs the one column, so Series.apply is used instead of
# a row-wise DataFrame.apply(axis=1), which builds a full row Series for every
# record and returns an empty DataFrame (not a Series) on an empty frame.
nn_df['tokenized_news'] = nn_df['new'].apply(nltk.word_tokenize)
nn_df['tokenized_summaries'] = nn_df['summary'].apply(nltk.word_tokenize)

In [20]:
from google.colab import files

# Write the processed frame to a CSV in the working directory, then hand the
# file to the Colab download helper.
lemmatized_csv = 'cleaned_data_with_lemmatization.csv'
nn_df.to_csv(lemmatized_csv)
files.download(lemmatized_csv)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [156]:
from google.colab import files

# Second export of the same frame, this time under the name 'filename.csv',
# followed by a download of that file.
export_csv = 'filename.csv'
nn_df.to_csv(export_csv)
files.download(export_csv)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>