# News API Data Cleaning in Python

In [1]:
import ast
import json
import re
import warnings

import matplotlib.pyplot as plt
import nltk
import pandas as pd # import necessary packages

%matplotlib inline
warnings.filterwarnings("ignore")

In [2]:
# Load the raw News API CSV; index_col=[0] uses the first CSV column as the row index.
news_df = pd.read_csv("../../data/raw-data/Newsapi_raw.csv", index_col=[0])

In [5]:
# Work on an independent copy so that later edits to `news_df` cannot leak into `df`.
# pd.DataFrame(news_df) does not copy by default (its `copy` argument behaves like
# copy=False for DataFrame input), so it would not protect the original data.
df = news_df.copy()

df.isna().any() # per-column flag: True when the column holds at least one NA value

source         False
author          True
title          False
description     True
url            False
urlToImage     False
publishedAt    False
content        False
dtype: bool

In [22]:
df.shape # (number of rows, number of columns) of the working data frame

(456, 8)

In [23]:
df.head(10) # preview the first 10 rows to eyeball the columns and their values

Unnamed: 0,source,author,title,description,url,urlToImage,publishedAt,content
0,Reuters,,Global recession can be avoided with right fis...,Global recession can be avoided if governments...,https://www.reuters.com/markets/asia/global-re...,https://www.reuters.com/resizer/8W8fki9z-vL-Y0...,2022-10-03T19:22:00Z,"RIYADH, Oct 3 (Reuters) - Global recession can..."
1,Reuters,,Oil extends losses on recession fears - Reuters,"Oil prices dipped in early trade on Friday, ex...",https://www.reuters.com/business/energy/oil-ex...,https://www.reuters.com/resizer/NTtAASgfx7F4Ph...,2022-09-16T01:21:00Z,"MELBOURNE, Sept 16 (Reuters) - Oil prices dipp..."
2,Reuters,Reuters Editorial,"BoE raises rates, despite likely recession - R...",The Bank of England raised its key interest ra...,https://www.reuters.com/video/watch/idOV725522...,https://static.reuters.com/resources/r/?d=2022...,2022-09-22T15:16:36Z,Posted \r\nThe Bank of England raised its key ...
3,Reuters,,IMF sees further slowdown in global economy in...,Downside risks continue to dominate the global...,https://www.reuters.com/markets/rates-bonds/im...,https://www.reuters.com/resizer/mgUzrgQRgDb4lp...,2022-09-15T14:52:00Z,"WASHINGTON, Sept 15 (Reuters) - Downside risks..."
4,Reuters,,Sterling dented by modest BoE rate hike - Reuters,Sterling surrendered gains against the dollar ...,https://www.reuters.com/markets/europe/sterlin...,https://www.reuters.com/resizer/HrhaG8Jv61uAHo...,2022-09-22T11:34:00Z,"LONDON, Sept 22 (Reuters) - Sterling surrender..."
5,Reuters,,German economy to slide into recession - EconM...,The German government expects Europe's largest...,https://www.reuters.com/markets/europe/german-...,https://www.reuters.com/resizer/oU9m0CHehHHcYd...,2022-10-12T11:35:00Z,"BERLIN, Oct 12 (Reuters) - The German governme..."
6,Reuters,,Column: Copper price slump brings Chinese buye...,While the rest of the world worries about rece...,https://www.reuters.com/markets/commodities/co...,https://www.reuters.com/pf/resources/images/re...,2022-09-27T14:11:00Z,"LONDON, Sept 27 (Reuters) - While the rest of ..."
7,Reuters,,"Futures fall on FedEx warning, recession fears...","U.S. stock index futures slipped on Friday, ex...",https://www.reuters.com/markets/europe/futures...,https://www.reuters.com/resizer/a2VymbMJAsHbCg...,2022-09-16T09:11:00Z,Sept 16 (Reuters) - U.S. stock index futures s...
8,Reuters,,SNB's Jordan can't rule out recession - Blick ...,The Swiss National Bank does not think the Swi...,https://www.reuters.com/markets/europe/snbs-jo...,https://www.reuters.com/resizer/9lqTA_Td3SH_H-...,2022-09-23T05:29:00Z,"ZURICH, Sept 23 (Reuters) - The Swiss National..."
9,Reuters,,Abu Dhabi leads Gulf declines on recession fea...,Abu Dhabi's stock market fell in early trade o...,https://www.reuters.com/markets/asia/abu-dhabi...,https://www.reuters.com/resizer/MyQoXqb9dWpL6i...,2022-09-26T08:48:00Z,Sept 26 (Reuters) - Abu Dhabi's stock market f...


In [24]:
import re


def _clean_source(raw):
    """Reduce a raw `source` cell to the bare source name.

    NOTE(review): presumably the raw value is the string form of a dict such as
    {'id': ..., 'name': ...} -- confirm against the raw CSV.

    Parameters
    ----------
    raw : str
        Raw value of the `source` column.

    Returns
    -------
    str
        The value with the comma-containing segment, the "'id'::" remnant, braces
        and single quotes removed, and surrounding whitespace stripped.
    """
    text = re.sub("[^:]*,[^:]*", "", raw)   # drop the colon-free segment that contains the comma
    text = re.sub("'id'::", "", text)       # drop what is left of the id key
    text = re.sub("\\{|\\}", "", text)      # drop the braces
    text = re.sub("'", "", text)            # drop the quoting
    # The substitutions above leave a leading space (" Reuters"); strip it so that
    # grouping or comparing by source name behaves as expected.
    return text.strip()


news_df['source'] = news_df['source'].map(_clean_source)

In [9]:
# Number of rows per distinct value of the cleaned `source` column.
news_df['source'].value_counts()

 Reuters             318
 Business Insider    136
 BBC News              2
Name: source, dtype: int64

The NA check above shows that the `author` and `description` columns contain missing values, so we isolate the affected rows to decide whether to fill them in with relevant information or simply drop them.

In [10]:
# Cell-level mask of missing values, same shape as `df`.
is_NaN = df.isna()
# One boolean per row: True when any column in that row is missing.
row_has_NaN = is_NaN.any(axis="columns")
# Only the rows that contain at least one missing value.
rows_with_NaN = df.loc[row_has_NaN]

### Cleaning Content Column

In [11]:
def _collapse_dot_runs(text):
    """Replace every run of two or more dots in `text` with one space and trim both ends."""
    return re.sub(r"\.{2,}", " ", text).strip()


# One cleaned string per row of news_df['content'], in row order.
# The previous chained replace handled "..." before "....", so the "...." step could
# never match and a run of four dots left a stray "." behind; a single pattern over
# dot runs treats every run length the same way.
# NOTE(review): the later cells visible in this notebook read news_df['content']
# directly, so `cleaned` appears to be unused -- confirm whether it should feed the pipeline.
cleaned = [_collapse_dot_runs(article) for article in news_df['content']]

In [12]:
import string

string.punctuation

def remove_punct(text):
    """Strip ASCII punctuation and digits from `text`.

    Every character listed in `string.punctuation` is deleted (not replaced by a
    space), then every run of the digits 0-9 is deleted as well.

    Parameters
    ----------
    text : str
        Text to clean.

    Returns
    -------
    str
        `text` without punctuation characters and digits; all other characters,
        including whitespace, are kept as they are.
    """
    drop_punctuation = str.maketrans('', '', string.punctuation)
    without_punct = text.translate(drop_punctuation)
    return re.sub('[0-9]+', '', without_punct)


# Punctuation- and digit-free version of every article's content.
news_df['clean_content'] = news_df['content'].apply(remove_punct)

In [13]:
def tokenization(text):
    """Split `text` into its individual word tokens.

    A token is a maximal run of regex word characters. Text that starts or ends
    with a non-word character produces no empty-string tokens, and an empty or
    all-separator text yields an empty list.

    Parameters
    ----------
    text : str
        Text to split.

    Returns
    -------
    list of str
        The tokens in order of appearance.
    """
    # Raw-string pattern: a plain '\W' literal is an invalid escape sequence.
    # Matching the word runs directly (instead of splitting on the separators)
    # avoids the '' entries re.split emits at the text boundaries.
    return re.findall(r'\w+', text)

def _lower_then_tokenize(text):
    """Lower-case `text` and split it into tokens with `tokenization`."""
    return tokenization(text.lower())

# Lower-cased token list for every cleaned article.
news_df['content_tokenized'] = news_df['clean_content'].apply(_lower_then_tokenize)

In [14]:
stopwords = nltk.corpus.stopwords.words('english')
# Same words as a frozenset: each membership test is a hash lookup instead of a
# scan through the whole list for every token.
_stopword_set = frozenset(stopwords)

def remove_stopwords(text):
    """Return the tokens of `text` that are not in the English stopword list.

    Parameters
    ----------
    text : iterable of str
        Tokens to filter (the notebook passes lower-cased tokens).

    Returns
    -------
    list of str
        The remaining tokens in their original order.
    """
    return [word for word in text if word not in _stopword_set]

news_df['content_nonstop'] = news_df['content_tokenized'].apply(remove_stopwords)

In [17]:
# Single lemmatizer instance shared by every call below.
wordnet = nltk.WordNetLemmatizer()

def lemmatizer(text):
    """Lemmatize each token in `text` with the shared WordNet lemmatizer.

    Parameters
    ----------
    text : iterable of str
        Tokens to lemmatize.

    Returns
    -------
    list of str
        One lemmatized token per input token, in the same order.
    """
    return list(map(wordnet.lemmatize, text))

# Lemmatized token list for every article.
news_df['content_lemmatized'] = news_df['content_nonstop'].apply(lemmatizer)

In [18]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

def just_compound(x):
    """Return the 'compound' entry of a polarity-score mapping."""
    return x['compound']

# Re-join each token list into one space-separated string, then remove any
# bracket, comma or single-quote characters from the result.
lemma = [
    " ".join(tokens).replace("[","").replace("]","").replace(","," ").replace("'", "")
    for tokens in news_df['content_lemmatized']
]
news_df['Content_Lemmatized_Sentiment_Analysis'] = lemma

# Score every joined string, then keep only the compound value per row.
analyzer = SentimentIntensityAnalyzer()
polarity = news_df['Content_Lemmatized_Sentiment_Analysis'].apply(analyzer.polarity_scores)
news_df['sentiment_rating'] = polarity.apply(just_compound)


In [19]:
# Keep only the text and its score. Take an explicit copy: a column is added to this
# frame afterwards, and assigning into a bare column selection of `news_df` raises
# SettingWithCopyWarning (currently hidden by warnings.filterwarnings("ignore")).
news_df_nb = news_df[['Content_Lemmatized_Sentiment_Analysis', 'sentiment_rating']].copy()

In [20]:
def _label_for(score):
    """Map a sentiment score to "Negative" (score below 0) or "Positive" (otherwise, including 0)."""
    if score < 0:
        return "Negative"
    return "Positive"

# Binary sentiment label derived from the compound score.
news_df_nb['label'] = news_df_nb['sentiment_rating'].apply(_label_for)

In [21]:
# Preview the first 10 rows: joined text, sentiment score and label.
news_df_nb.head(10)

Unnamed: 0,Content_Lemmatized_Sentiment_Analysis,sentiment_rating,label
0,riyadh oct reuters global recession avoided go...,-0.8225,Negative
1,melbourne sept reuters oil price dipped early ...,-0.6249,Negative
2,posted bank england raised key interest rate t...,0.4588,Positive
3,washington sept reuters downside risk continue...,-0.7506,Negative
4,london sept reuters sterling surrendered gain ...,0.7506,Positive
5,berlin oct reuters german government expects e...,-0.7003,Negative
6,london sept reuters rest world worry recession...,-0.6908,Negative
7,sept reuters u stock index future slipped frid...,-0.6369,Negative
8,zurich sept reuters swiss national bank think ...,-0.5702,Negative
9,sept reuters abu dhabis stock market fell earl...,-0.1531,Negative


In [25]:
# Write the labelled frame to the clean-data folder. No `index` argument is passed,
# so pandas also writes the row index as the first CSV column.
news_df_nb.to_csv("../../data/clean-data/news_clean.csv")