<a href="https://colab.research.google.com/github/rosangelahs/national_news1/blob/main/Data_cleaning_and_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [12]:
# Standard library
import re
import string
import unicodedata

# Third-party
import nltk.corpus
import numpy as np
import pandas as pd
import spacy
from google.colab import drive
from nltk.corpus import stopwords

# Mount Google Drive inside the Colab runtime.
drive.mount('/content/gdrive')

# NLTK resources: 'stopwords' backs the stop-word list and 'punkt' backs
# nltk.word_tokenize; 'wordnet' and 'omw-1.4' are fetched as well although
# the visible cells do not reference them.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')


Mounted at /content/gdrive


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

[Errno 107] Transport endpoint is not connected: 'gdrive/My-Drive/'
/content/drive/MyDrive


In [None]:
# Load the scraped articles (semicolon-separated CSV) from Google Drive.
#
# * The path is absolute and rooted at /content/gdrive, the mount point used
#   by the setup cell, so the read no longer depends on the kernel's current
#   working directory.
#   NOTE(review): assumes the Drive is mounted at /content/gdrive -- confirm.
# * `on_bad_lines='warn'` replaces `error_bad_lines=False`, which was
#   deprecated in pandas 1.3 and removed in pandas 2.0. Malformed rows are
#   still skipped and a warning is emitted for each of them.
data = pd.read_csv(
    '/content/gdrive/MyDrive/nn_data/scraped_data.csv',
    sep=';',
    engine='python',
    on_bad_lines='warn',
)

# Work on a separate DataFrame object so that renaming the columns below does
# not touch the header names held by `data`.
nn_df = pd.DataFrame(data)
nn_df.columns = ['index', 'title', 'new', 'summary']

In [None]:
# Drop every row that contains at least one missing value.
# `dropna` returns a new frame (inplace=False), so the result has to be
# assigned back; a bare call leaves `nn_df` unchanged.
# `thresh` is intentionally not passed: pandas 2.x raises TypeError when both
# `how` and `thresh` are supplied.
nn_df = nn_df.dropna(axis=0, how='any')

# Number of columns that still contain missing values (expected: 0).
nn_df.isna().any().sum()

In [None]:
# Remove duplicated data: drop rows that are exact copies of an earlier row
# (all columns compared, the same notion `DataFrame.duplicated()` uses).
nn_df = nn_df.drop_duplicates()

# Remove 'None' values: treat the literal string 'None' as missing and drop
# every row that contains it (or any other missing value).
nn_df = nn_df.replace(to_replace='None', value=np.nan).dropna()

# Number of exact duplicate rows left (expected: 0).
nn_df.duplicated().sum()

**Data Preprocessing**

In [None]:
# Remove accented characters
def remove_accented_chars(text):
    """Return `text` reduced to ASCII, with diacritics stripped.

    NFKD normalisation splits an accented letter into its base letter plus
    combining marks; encoding to ASCII with errors ignored then discards the
    marks together with any other non-ASCII character.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8', 'ignore')


# Strip accents from each of the three text columns.
for col in ('new', 'summary', 'title'):
    nn_df[col] = nn_df[col].apply(remove_accented_chars)

In [None]:
# Remove punctuation
def remove_punctuation(text):
    """Return `text` without any character that is neither a word character
    (letters, digits, underscore) nor whitespace."""
    not_word_or_space = r'[^\w\s]'
    return re.sub(not_word_or_space, '', text)


# Strip punctuation from each of the three text columns.
for col in ('new', 'summary', 'title'):
    nn_df[col] = nn_df[col].apply(remove_punctuation)

In [None]:
# Remove mentions, URLs and every non-alphanumeric, non-whitespace character
def remove_unicode_characters(text):
    """Delete @mentions, URLs, a leading "rt" and non-alphanumeric symbols.

    The alternatives are tried left to right at each position:

    * ``@[A-Za-z0-9]+``  -- an @mention. The bracket used to be escaped
      (``@\\[``), which made this alternative match only a literal "@[".
    * ``[^0-9A-Za-z\\s]`` -- any character that is not an ASCII letter, a digit
      or whitespace. All whitespace is kept (not just space and tab) so that
      words on adjacent lines are not glued together when a newline is
      removed.
    * ``\\w+://\\S+``      -- a URL with an explicit scheme.
    * ``^rt``            -- a leading "rt" marker.
    * ``http.+?``        -- "http" plus the one character that follows it.

    Returns the cleaned string; matched spans are removed, not replaced by a
    space.
    """
    pattern = r"(@[A-Za-z0-9]+)|([^0-9A-Za-z\s])|(\w+:\/\/\S+)|^rt|http.+?"
    return re.sub(pattern, "", text)


# Run the symbol/URL clean-up over each of the three text columns.
for col in ('new', 'summary', 'title'):
    nn_df[col] = nn_df[col].apply(remove_unicode_characters)

In [None]:
# Remove numbers
def remove_numbers(text):
    """Delete digits and any other character outside the keep-set.

    Kept characters: ASCII letters, whitespace and the punctuation marks
    ``. , ! ? / : ; " '``. Everything else, digits included, is removed.

    The letter range is ``A-Z``; the previous ``A-z`` also spanned the ASCII
    characters between "Z" and "a" (``[ \\ ] ^ _`` and the backtick), so those
    symbols were kept by accident.
    """
    # Negated class: anything NOT listed here is removed.
    pattern = r'[^a-zA-Z.,!?/:;\"\'\s]'
    return re.sub(pattern, '', text)


# Strip digits from each of the three text columns.
for col in ('new', 'summary', 'title'):
    nn_df[col] = nn_df[col].apply(remove_numbers)

KeyboardInterrupt: ignored

In [None]:
# Normalize text (lowercasing) in each of the three text columns.
for col in ('new', 'summary', 'title'):
    nn_df[col] = nn_df[col].str.lower()

In [None]:
# Remove extra white spaces
def remove_whitespace(text):
    """Collapse every run of whitespace in `text` to a single space and trim
    leading/trailing whitespace."""
    words = text.split()
    return " ".join(words)


# Collapse redundant whitespace in each of the three text columns.
for col in ('new', 'summary', 'title'):
    nn_df[col] = nn_df[col].apply(remove_whitespace)

In [None]:
# Remove stopwords
stop = stopwords.words('spanish')

# The text columns had their accents stripped by remove_accented_chars, while
# NLTK's Spanish list contains accented entries (e.g. 'más', 'también').
# Add the accent-free form of every stopword so those entries still match.
# A set is used so each membership test is a hash lookup instead of a scan of
# the whole list.
_stop_lookup = set(stop) | {remove_accented_chars(word) for word in stop}


def _drop_stopwords(text):
    """Return `text` with every whitespace-separated stopword removed; the
    remaining words are re-joined with single spaces."""
    return ' '.join(word for word in text.split() if word not in _stop_lookup)


for col in ('title', 'new', 'summary'):
    nn_df[col] = nn_df[col].apply(_drop_stopwords)


In [None]:
# Show the frame's (row count, column count) after the cleaning cells above.
nn_df.shape

(89751, 4)

In [None]:
# Lemmatization
!python -m spacy download es_core_news_sm
nlp = spacy.load('es_core_news_sm')
nn_df["title"] = nn_df['title'].apply(lambda x: " ".join([y.lemma_ for y in nlp(x)]))
nn_df["new"] = nn_df['new'].apply(lambda x: " ".join([y.lemma_ for y in nlp(x)]))
nn_df["summary"] = nn_df['summary'].apply(lambda x: " ".join([y.lemma_ for y in nlp(x)]))


In [None]:
# Tokenization
def _tokenize(text):
    """Split `text` into a list of word tokens with NLTK.

    `language='spanish'` selects the Spanish Punkt sentence model; without it
    word_tokenize falls back to the English model for its sentence-splitting
    step.
    """
    return nltk.word_tokenize(text, language='spanish')


# Series.apply hands each string straight to the tokenizer. The row-wise
# DataFrame.apply(axis=1) used before produced the same lists but built a
# Series object for every row first.
nn_df['tokenized_news'] = nn_df['new'].apply(_tokenize)
nn_df['tokenized_summaries'] = nn_df['summary'].apply(_tokenize)

In [None]:
from google.colab import files

# Write the processed frame to a CSV in the working directory (row index
# omitted) and trigger a browser download of that file.
# NOTE(review): the file name says "without_lemmatize", yet the lemmatization
# cell precedes this one in notebook order -- confirm which state is exported.
output_csv = 'cleaned_without_lemmatize.csv'
nn_df.to_csv(output_csv, index=False)
files.download(output_csv)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Display the frame; for a frame this long the rendering is truncated to its
# first and last rows.
nn_df

Unnamed: 0,index,title,new,summary
0,1205,11 julio analisis necesario marxlenin perez va...,marxlenin perez valdes graduada facultad filos...,marxlenin perez valdes graduada facultad filos...
1,1206,conciencia clase perspectiva fidelista video,posible hacer revolucion socialista pais cuba ...,posible hacer revolucion socialista pais cuba ...
2,1207,filo resaca video,tiempos corren ser comunista practicamente sin...,tiempos corren ser comunista practicamente sin...
3,1208,recibio marrero cruz vicepresidenta ejecutiva ...,miembro buro politico primer ministro cuba man...,miembro buro politico primer ministro manuel m...
4,1209,diazcanel encabeza jornada final visita integr...,pais detenido pais quieren detener detenidos v...,primer secretario comite central partido comun...
...,...,...,...,...
89916,90836,quedan dos dias aparece cuarto pasajero postem...,sancti spiritus alcanzo importante triunfo lid...,sancti spiritus alcanzo importante triunfo lid...
89917,90837,intenso programa trabajo comisiones asamblea n...,lunes comisiones permanentes trabajo asamblea ...,lunes comisiones permanentes trabajo asamblea ...
89918,90838,legisladores peloteros prensa unidos celebran ...,senador norteamericano patrick leahy celebro a...,senador norteamericano patrick leahy celebro a...
89919,90839,anuncian premios 26 julio periodismo 2017,premiados concurso 26 julio periodismo edicion...,premiados concurso 26 julio periodismo edicion...
