In [86]:
#Pre - processing goals:

#1.Remove all company names shown in the article
#2.Regular Expression/Normalization — lowercase the words, remove punctuation and remove numbers
#3.Stemming and lemmatization
#4.Remove stop words
#5.Tokenization
 


In [87]:
import pandas as pd
import numpy as np


In [88]:
#preprocessing libraries for nlp
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
from nltk.stem.porter import PorterStemmer

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [89]:
# We will use this for identifying company names and their spans

import spacy
nlp = spacy.load('en_core_web_sm')

In [90]:
# Load the news-article dataset into a DataFrame.
# NOTE(review): the absolute path looks like a Colab Google Drive mount -- adjust it
# when running in another environment.
df = pd.read_csv('/content/drive/MyDrive/sharmadata.csv')
# Show the available column names.
df.columns


Index(['News_content', 'Link', 'Title'], dtype='object')

In [91]:
#Check whether the articles DataFrame contains any null values
df.isnull().sum()

News_content    0
Link            0
Title           0
dtype: int64

In [92]:
# Module-level lemmatizer and stemmer instances, created once and reused by
# All_other_pre_processing for every article.
wordnet = WordNetLemmatizer()
ps      = PorterStemmer()

In [93]:
#Removing Company names using NER model in spacy

def Remove_company_names(article):
  """Return `article` with every organisation name detected by spaCy blanked out.

  The article is run through the module-level `nlp` pipeline; for each entity
  labelled 'ORG', every occurrence of that entity's text in the article is
  replaced by a single space.

  Args:
    article: Raw article text (str).

  Returns:
    A new string with the detected organisation names replaced by ' '.
  """
  doc = nlp(article)
  aft_comp_remov_art = article  # working copy; the input string is left untouched
  for ent in doc.ents:
    if ent.label_ == 'ORG':
      # Use a literal replacement rather than re.sub(ent.text, ...): entity text is
      # plain text, not a regex, and names containing metacharacters such as
      # '(', '+', '.' or '*' would otherwise raise re.error or match the wrong span.
      aft_comp_remov_art = aft_comp_remov_art.replace(ent.text, ' ')

  return aft_comp_remov_art


In [94]:
def All_other_pre_processing(article):
  """Normalise an article: strip non-letters, lowercase, drop stop words, lemmatize, stem.

  Steps, in order:
    1. Replace every character that is not an ASCII letter with a space
       (this removes punctuation and digits).
    2. Lowercase the text and split it on whitespace.
    3. Drop English stop words and lemmatize the remaining words with the
       module-level `wordnet` lemmatizer.
    4. Stem each lemma with the module-level `ps` Porter stemmer.
    5. Re-join the words with single spaces.

  Args:
    article: Article text (str).

  Returns:
    The pre-processed article as a single space-separated string.
  """
  # Build the stop-word lookup once per call. The previous version called
  # stopwords.words('english') for every token and scanned the resulting list
  # linearly, which is needlessly slow on long articles.
  stop_words = set(stopwords.words('english'))

  review = re.sub('[^a-zA-Z]', ' ', article)  # keep letters only
  review = review.lower().split()
  review = [wordnet.lemmatize(word) for word in review if word not in stop_words]
  review = [ps.stem(word) for word in review]

  return ' '.join(review)

In [95]:
def tokenizing(article):
  """Split an article string into a list of whitespace-delimited tokens.

  A dedicated tokenizer library could be used here instead; a plain
  whitespace split is what this function performs.

  Args:
    article: Article text (str).

  Returns:
    A list of the whitespace-separated words in `article` (empty list for
    an empty or all-whitespace string).
  """
  return article.split()

In [97]:
# Run the whole pipeline on a single article (the row labelled 0) as a worked example.
un_processed_article = df['News_content'][0]

# Step 1: blank out organisation names found by spaCy NER.
comp_names_remov_article = Remove_company_names(un_processed_article)

# Steps 2-4: strip non-letters, lowercase, remove stop words, lemmatize and stem.
final_article = All_other_pre_processing(comp_names_remov_article)

# Step 5: split the cleaned string into tokens.
final_article_tokens = tokenizing(final_article)

In [98]:
# Compare character counts of the raw text, the text after company-name removal,
# and the fully pre-processed text.
print(len(un_processed_article))
print(len(comp_names_remov_article))
print(len(final_article))

2217
2102
1322


In [99]:
un_processed_article

" China's visa and international travel restrictions have helped to keep Covid-19 in check yet become ... [+] a source of frustration for American companies in the country. (Photo by NOEL CELIS/AFP via Getty Images) A survey by the American Chamber of Commerce in China released Friday found that nearly two-thirds of respondents want the U.S. and the Chinese governments to restore visa services and travel channels for business executives and their dependents as their top priority this year.  “As the pandemic continues well into its second year, the priorities for our member companies remain consistent, with the resumption of business travel at the top of the list,” said AmCham China Chairman Greg Gilligan in a statement.  Other top priorities for member companies this year include regularized government-to-government\xa0communication, and the removal of bilateral tariffs, the flash survey found.  “The state of the overall U.S.-China relationship is as important as ever to the business c

In [100]:
final_article

'china visa intern travel restrict help keep covid check yet becom sourc frustrat american compani countri photo noel celi afp via survey china releas friday found nearli two third respond want u chines govern restor visa servic travel channel busi execut depend top prioriti year pandem continu well second year prioriti member compani remain consist resumpt busi travel top list said amcham china chairman greg gilligan statement top prioriti member compani year includ regular govern govern commun remov bilater tariff flash survey found state overal u china relationship import ever busi commun member priorit concret result action way restor trust two countri gilligan said survey beij base conduct august better understand member compani perceiv prioriti u china relat phase one agreement implement current situat respect bilater tariff compani total nearli member submit respons organ say compani particip survey board member includ includ execut top issu includ remov bilater tariff restart p

In [None]:
final_article_tokens

In [None]:
# Confirm that no digits survived pre-processing: every run of digits found in
# final_article is printed, so no output means none remain.
pattern = re.compile(r'\d+')
matches = pattern.finditer(final_article)
for match in matches:
  print(match)

In [102]:
# Check whether the raw article contained organisation names: print the text of
# every entity that spaCy labels 'ORG'.
doc  = nlp(un_processed_article )  
for ent in doc.ents:
  if ent.label_ == 'ORG' :
    print(ent.text)

Getty Images
the American Chamber of Commerce
Amcham China
Amcham China board
Intel
Boeing
Microsoft
Goldman Sachs
the White House
Amcham
IMAX


In [104]:
# Check that the company names were removed: re-run NER on the stripped article and
# print any entity still labelled 'ORG' (no output means none were detected).
doc  = nlp(comp_names_remov_article )  
for ent in doc.ents:
  if ent.label_ == 'ORG' :
    print(ent.text)

In [None]:
#Method 2 to remove company names without using re
#Here We are using Named entity recognition function of spacy

def Method2_comp_name_remov(article):
  """Remove organisation names from `article` by cutting out spaCy entity spans.

  Unlike the regex/replace-based approach, this uses the character offsets of
  each entity labelled 'ORG' and stitches together the text between them, so
  only the exact spans spaCy tagged are removed.

  Args:
    article: Raw article text (str).

  Returns:
    A new string made of the parts of `article` that lie outside 'ORG' spans.
    One whitespace character directly following a removed span is dropped as
    well, so no double space is left behind.
  """
  doc = nlp(article)
  start = 0        # index in `article` where the next kept slice begins
  kept_parts = []  # slices of the article that lie outside ORG entities
  for ent in doc.ents:
    if ent.label_ == 'ORG':
      kept_parts.append(article[start:ent.start_char])
      start = ent.end_char
      # Skip the separator after the entity only when it really is whitespace;
      # an unconditional +1 would also swallow punctuation such as ',' or '.'.
      if start < len(article) and article[start].isspace():
        start += 1

  # Append the tail after the last entity (the original referenced an undefined
  # name `text` here, which raised NameError).
  kept_parts.append(article[start:])

  return ''.join(kept_parts)
