In [51]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer


## Text Preprocessing 

* Lower casing
* Removal of Punctuations
* Removal of Stopwords
* Removal of Frequent words
* Removal of Rare words
* Stemming
* Lemmatization
* Removal of emojis
* Removal of emoticons
* Conversion of emoticons to words
* Conversion of emojis to words
* Removal of URLs
* Removal of HTML tags
* Chat words conversion
* Spelling correction


These are common preprocessing steps typically applied to text datasets. We will apply the ones that are applicable to our data.

In [55]:
# Read the positive samples from the combined CSV and preview the first rows.

df = pd.read_csv("./input/positive-combined.csv")
df.head()

Unnamed: 0,article,y
0,"Peripheral neuropathy - Wikipedia, the free en...",1
1,"Colorado tick fever - Wikipedia, the free ency...",1
2,"Rosselli–Gulienetti syndrome - Wikipedia, the ...",1
3,"Johanson–Blizzard syndrome - Wikipedia, the fr...",1
4,"DiGeorge syndrome - Wikipedia, the free encycl...",1


In [54]:
# Characters stripped from the text by remove_punc (ASCII punctuation plus assorted
# typographic symbols, box-drawing glyphs and accented letters seen as noise).
puncts = [',', '.', '"', ':', ')', '(', '-', '!', '?', '|', ';', "'", '$', '&', '/', '[', ']', '>', '%', '=', '#', '*', '+', '\\', '•',  '~', '@', '£', 
 '·', '_', '{', '}', '©', '^', '®', '`',  '<', '→', '°', '€', '™', '›',  '♥', '←', '×', '§', '″', '′', 'Â', '█', '½', 'à', '…', 
 '“', '★', '”', '–', '●', 'â', '►', '−', '¢', '²', '¬', '░', '¶', '↑', '±', '¿', '▾', '═', '¦', '║', '―', '¥', '▓', '—', '‹', '─', 
 '▒', '：', '¼', '⊕', '▼', '▪', '†', '■', '’', '▀', '¨', '▄', '♫', '☆', 'é', '¯', '♦', '¤', '▲', 'è', '¸', '¾', 'Ã', '⋅', '‘', '∞', 
 '∙', '）', '↓', '、', '│', '（', '»', '，', '♪', '╩', '╚', '³', '・', '╦', '╣', '╔', '╗', '▬', '❤', 'ï', 'Ø', '¹', '≤', '‡', '√', ]

def remove_punc(x):
    """Return ``x`` with punctuation removed.

    Runs of dash-like characters and underscores are first collapsed into a
    single space so the words they join stay separated; every character listed
    in ``puncts`` is then deleted outright.
    """
    # Dashes/underscores act as word separators, so they become whitespace
    cleaned = re.sub("[-–_−―—]+", " ", x)
    for symbol in puncts:
        # Skip the replace call entirely when the symbol is absent
        if symbol in cleaned:
            cleaned = cleaned.replace(symbol, '')
    return cleaned

# NLTK's English stopword list, held as a set for fast membership checks
STOPWORDS = set(stopwords.words('english'))
def remove_stopwords(text):
    """Drop every whitespace-delimited token of ``text`` that is in STOPWORDS.

    ``text`` is passed through ``str()`` first; the surviving tokens are
    re-joined with single spaces. Matching is exact, so case and any attached
    punctuation matter.
    """
    kept = (token for token in str(text).split() if token not in STOPWORDS)
    return " ".join(kept)


# Single shared Porter stemmer instance reused for every call
stemmer = PorterStemmer()
def stem_words(text):
    """Stem each whitespace-delimited token of ``text`` and re-join with single spaces."""
    return " ".join(map(stemmer.stem, text.split()))

In [56]:
# Coerce every article to str, then lower-case it
df["article"] = df["article"].astype(str).str.lower()

In [57]:
# Strip stopwords from each article
df['article'] = df['article'].apply(remove_stopwords)

In [58]:
# Strip runs of digits. regex=True is passed explicitly: since pandas 2.0 the default
# is a literal match, which would look for the text "\d+" and leave every digit in place.
df["article"] = df["article"].str.replace(r'\d+', '', regex=True)
# Remove punctuation (remove_punc turns dashes/underscores into spaces first)
df["article"] = df["article"].apply(remove_punc)
df.head()

Unnamed: 0,article,y
0,peripheral neuropathy wikipedia free encyclo...,1
1,colorado tick fever wikipedia free encyclope...,1
2,rosselli gulienetti syndrome wikipedia free ...,1
3,johanson blizzard syndrome wikipedia free en...,1
4,digeorge syndrome wikipedia free encyclopedi...,1


In [59]:
# Stem every token of each article
df['article'] = df['article'].apply(stem_words)

In [85]:
# Preview the first rows after stemming.
df.head()

Unnamed: 0,article,y
0,peripher neuropathi wikipedia free encyclopedi...,1
1,colorado tick fever wikipedia free encyclopedi...,1
2,rosselli gulienetti syndrom wikipedia free enc...,1
3,johanson blizzard syndrom wikipedia free encyc...,1
4,digeorg syndrom wikipedia free encyclopedia di...,1


In [86]:
# Persist the cleaned positive samples as an intermediate result
df.to_csv(
    "./input/positive-cleaned.csv",
    header=True,
    index=False,
)

In [66]:
# Let's bundle these steps into a function for future use cases

def clean_text_data(df, column):
    """Run the notebook's text-cleaning pipeline on one column of a DataFrame.

    Steps, in order: cast to str, lower-case, remove stopwords, strip digit
    runs, remove punctuation (dashes/underscores become spaces), and stem
    every token.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text column. It is modified in place.
    column : str
        Name of the column to clean.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``column`` overwritten by the cleaned text.

    Raises
    ------
    KeyError
        If ``column`` is not a column of ``df``.
    """
    text = df[column].astype(str).str.lower()
    # NOTE: stopwords are removed before punctuation, mirroring the step order of the
    # cell-by-cell run above; tokens that still carry punctuation (e.g. "the,") are
    # therefore not matched by remove_stopwords.
    text = text.apply(remove_stopwords)
    # Remove digits. regex=True is explicit because pandas >= 2.0 defaults to a
    # literal match, which would leave all digits untouched.
    text = text.str.replace(r'\d+', '', regex=True)
    # Remove punctuation; remove_punc replaces _ and - with white space first
    text = text.apply(remove_punc)
    # Stem words and write the result back to the requested column
    df[column] = text.apply(stem_words)
    return df

In [78]:
# Read the negative samples from the combined CSV and preview the first rows.
negative_df = pd.read_csv('./input/negative-combined.csv')
negative_df.head()

Unnamed: 0,article,y
0,"Vihti - Wikipedia, the free encyclopedia Vihti...",0
1,"Javad Yasari - Wikipedia, the free encyclopedi...",0
2,"Hey, Look Me Over (M*A*S*H episode) - Wikipedi...",0
3,"Ayreon - Wikipedia, the free encyclopedia Ayre...",0
4,"Hifikepunye Pohamba - Wikipedia, the free ency...",0


In [79]:
# Run the cleaning function on the negative samples; the result replaces negative_df.
negative_df = clean_text_data(negative_df, 'article')

In [80]:
# Preview the cleaned negative samples.
negative_df.head()

Unnamed: 0,article,y
0,vihti wikipedia free encyclopedia vihti finnis...,0
1,javad yasari wikipedia free encyclopedia javad...,0
2,hey look mash episod wikipedia free encycloped...,0
3,ayreon wikipedia free encyclopedia ayreon iˈɛr...,0
4,hifikepuny pohamba wikipedia free encyclopedia...,0


In [81]:
# Summarize column dtypes and non-null counts of the cleaned negative samples.
negative_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 2 columns):
article    10000 non-null object
y          10000 non-null int64
dtypes: int64(1), object(1)
memory usage: 156.4+ KB


In [82]:
# Persist the cleaned negative samples as an intermediate result
negative_df.to_csv(
    "./input/negative-cleaned.csv",
    header=True,
    index=False,
)

In [83]:
# Reload the cleaned negative samples from disk and preview the first rows.
negative_df_cleaned = pd.read_csv('./input/negative-cleaned.csv')
negative_df_cleaned.head()

Unnamed: 0,article,y
0,vihti wikipedia free encyclopedia vihti finnis...,0
1,javad yasari wikipedia free encyclopedia javad...,0
2,hey look mash episod wikipedia free encycloped...,0
3,ayreon wikipedia free encyclopedia ayreon iˈɛr...,0
4,hifikepuny pohamba wikipedia free encyclopedia...,0


In [87]:
# Reload the cleaned positive samples from disk and preview the first rows.
positive_df_cleaned = pd.read_csv('./input/positive-cleaned.csv')
positive_df_cleaned.head()

Unnamed: 0,article,y
0,peripher neuropathi wikipedia free encyclopedi...,1
1,colorado tick fever wikipedia free encyclopedi...,1
2,rosselli gulienetti syndrom wikipedia free enc...,1
3,johanson blizzard syndrom wikipedia free encyc...,1
4,digeorg syndrom wikipedia free encyclopedia di...,1


In [88]:
# Stack positive and negative samples into one frame. ignore_index=True gives the
# result a fresh 0..n-1 index; without it each input keeps its own row labels, so
# labels repeat and label-based lookups on combined_df return multiple rows.
combined_df = pd.concat([positive_df_cleaned, negative_df_cleaned], ignore_index=True)
combined_df.head()

Unnamed: 0,article,y
0,peripher neuropathi wikipedia free encyclopedi...,1
1,colorado tick fever wikipedia free encyclopedi...,1
2,rosselli gulienetti syndrom wikipedia free enc...,1
3,johanson blizzard syndrom wikipedia free encyc...,1
4,digeorg syndrom wikipedia free encyclopedia di...,1


In [90]:
# Write the combined positive + negative dataset to disk
combined_df.to_csv(
    "./input/dataset.csv",
    header=True,
    index=False,
)