In [1]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer


In [2]:
# Download required NLTK data
# -------------------------------
nltk.download('stopwords')
nltk.download('wordnet')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\abdul\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\abdul\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
# Load raw dataset
# -------------------------------
df = pd.read_csv("Data/sample_training_data.csv")
df.head()


Unnamed: 0,text,label
0,Wall St. Bears Claw Back Into the Black (Reute...,2
1,Carlyle Looks Toward Commercial Aerospace (Reu...,2
2,Oil and Economy Cloud Stocks' Outlook (Reuters...,2
3,Iraq Halts Oil Exports from Main Southern Pipe...,2
4,"Oil prices soar to all-time record, posing new...",2


In [4]:
# Convert text to lowercase
# -------------------------------
df['clean_text'] = df['text'].str.lower()


In [5]:
# Remove punctuation & numbers
# -------------------------------
df['clean_text'] = df['clean_text'].apply(lambda x: re.sub('[^a-z ]',' ', x))


In [6]:
# Remove extra spaces
# -------------------------------
df['clean_text'] = df['clean_text'].str.strip()


In [None]:
# Remove stopwords (is, the, and, etc)
# -------------------------------
stop = set(stopwords.words('english'))

df['clean_text'] = df['clean_text'].apply(
    lambda x: " ".join([word for word in x.split() if word not in stop])
)


In [None]:
# Lemmatization 
"""(it reduce the words e.g; Running, runs, ran consider 
the run all of this words)"""

lemm = WordNetLemmatizer()

df['clean_text'] = df['clean_text'].apply(
    lambda x: " ".join([lemm.lemmatize(word) for word in x.split()])
)


In [9]:
df.head(10)

Unnamed: 0,text,label,clean_text
0,Wall St. Bears Claw Back Into the Black (Reute...,2,wall st bear claw back black reuters reuters s...
1,Carlyle Looks Toward Commercial Aerospace (Reu...,2,carlyle look toward commercial aerospace reute...
2,Oil and Economy Cloud Stocks' Outlook (Reuters...,2,oil economy cloud stock outlook reuters reuter...
3,Iraq Halts Oil Exports from Main Southern Pipe...,2,iraq halt oil export main southern pipeline re...
4,"Oil prices soar to all-time record, posing new...",2,oil price soar time record posing new menace u...
5,"Stocks End Up, But Near Year Lows (Reuters) Re...",2,stock end near year low reuters reuters stock ...
6,Money Funds Fell in Latest Week (AP) AP - Asse...,2,money fund fell latest week ap ap asset nation...
7,Fed minutes show dissent over inflation (USATO...,2,fed minute show dissent inflation usatoday com...
8,Safety Net (Forbes.com) Forbes.com - After ear...,2,safety net forbes com forbes com earning ph so...
9,Wall St. Bears Claw Back Into the Black NEW Y...,2,wall st bear claw back black new york reuters ...


In [10]:
# Save cleaned file
# -------------------------------
df[['clean_text','label']].to_csv("Data/clean_data.csv", index=False)
