In [34]:
import numpy as np
import pandas as pa
import matplotlib.pyplot as plt
import seaborn as sn
import re
import nltk
from nltk.tokenize import word_tokenize,TweetTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
import os
from nltk.corpus import stopwords
import string
import warnings
warnings.filterwarnings('ignore')

#### Load raw data 

In [35]:
#Load the data
# Directory holding the three scraped blog CSVs. The original machine-specific
# location is kept as the default so existing runs behave the same; set the
# BLOG_DATA_DIR environment variable to run the notebook on another machine
# without editing this cell.
filedir = os.path.abspath(
    os.environ.get("BLOG_DATA_DIR", r"C:\Users\BABI\Dynamic Blog Recommendation\Sample Data")
)
medium_filename = "medium_final.csv"
toward_filename = "towards_data_science.csv"
analytics_filename = "analytics_vidya.csv"

# Full path of each source file inside the data directory.
filepath_medium = os.path.join(filedir, medium_filename)
filepath_toward = os.path.join(filedir, toward_filename)
filepath_analytics = os.path.join(filedir, analytics_filename)

# One DataFrame per blog source.
raw_data_medium = pa.read_csv(filepath_medium)
raw_data_toward = pa.read_csv(filepath_toward)
raw_data_analytics = pa.read_csv(filepath_analytics)

#### For removing emojis from text

In [37]:
# demoji supplies the emoji removal (demoji.replace) used by data_cleaning().
import demoji 
# Fetches the emoji code table. The recorded output of this cell shows it being
# downloaded and written to a codes.json file under the user's .demoji folder,
# so this call needs network access.
demoji.download_codes() # only required the first time

[33mDownloading emoji data ...[0m
[92m... OK[0m (Got response in 1.20 seconds)
[33mWriting emoji data to C:\Users\BABI\.demoji/codes.json ...[0m
[92m... OK[0m


#### Convert numbers to words

In [38]:
# Lookup tables for spelling out numbers. Every non-empty entry ends with exactly
# one space so that pieces can be concatenated without extra separators.
ones = ["", "one ", "two ", "three ", "four ", "five ", "six ", "seven ", "eight ", "nine ", "ten ", "eleven ",
        "twelve ", "thirteen ", "fourteen ", "fifteen ", "sixteen ", "seventeen ", "eighteen ", "nineteen "]
twenties = ["", "", "twenty ", "thirty ", "forty ", "fifty ", "sixty ", "seventy ", "eighty ", "ninety "]
# Scale word for each three-digit group, indexed by group position (0 = units).
thousands = ["", "thousand ", "million ", "billion ", "trillion ", "quadrillion ", "quintillion ", "sextillion ",
             "septillion ", "octillion ", "nonillion ", "decillion ", "undecillion ", "duodecillion ", "tredecillion ",
             "quattuordecillion ", "quindecillion ", "sexdecillion ", "septendecillion ", "octodecillion ",
             "novemdecillion ", "vigintillion "]


def num999(n):
    """Spell out the last three digits of the integer ``n``.

    Returns the words followed by a trailing space (e.g. ``115`` ->
    ``'one hundred and fifteen '``), or an empty string when those three
    digits are all zero.
    """
    hundreds, remainder = divmod(n % 1000, 100)
    tens, units = divmod(remainder, 10)

    if hundreds == 0:
        prefix = ""
    elif remainder == 0:
        # Exact hundreds take no "and".
        prefix = ones[hundreds] + "hundred "
    else:
        prefix = ones[hundreds] + "hundred and "

    if tens <= 1:
        # 0-19 have dedicated words in ``ones``.
        tail = ones[remainder]
    else:
        tail = twenties[tens] + ones[units]
    return prefix + tail


def num2word(num):
    """Return the English words for a non-negative whole number.

    ``num`` may be an ``int`` or a string of digits (leading zeros allowed),
    which is what a ``re.sub`` callback receives. Any representation of zero
    (``0``, ``"0"``, ``"000"``) yields ``'zero'``.

    Raises:
        ValueError: if ``str(num)`` is not made of integer digits.
        IndexError: if a non-zero three-digit group lies beyond the last entry
            of ``thousands`` (numbers of 10**66 and above).
    """
    if num == 0:
        return 'zero'
    digits = str(num)
    # The numeric comparison above cannot catch zero given as text ("0", "000").
    if int(digits) == 0:
        return 'zero'

    words = ""
    group_index = 0
    # Consume the number three digits at a time, least significant group first.
    while digits:
        group = int(digits[-3:])
        digits = digits[:-3]
        if group:
            # All-zero groups contribute nothing, not even their scale word.
            words = num999(group) + thousands[group_index] + words
        group_index += 1
    # Drop the trailing separator left by the last piece.
    return words.rstrip()

In [39]:
def subs(word):
    """``re.sub`` callback: return the spelled-out form of a matched digit run.

    ``word`` is the match object handed over by ``re.sub``; its whole match is
    expected to consist of digits only.
    """
    # Hand num2word a real integer rather than the matched text, so that its
    # numeric zero check also applies to matches such as "0" or "000".
    return num2word(int(word.group(0)))

#### Replace digit runs in text with their word form

In [40]:
import re
def find_numbers_percent(word):
    """Return ``word`` with every run of digits replaced via the ``subs`` callback."""
    # Raw string: "\d" inside a plain literal is an invalid escape sequence that
    # newer Python versions warn about. `\d+` matches exactly what `[\d]+` did.
    return re.sub(r'\d+', subs, word)

In [41]:
# Contraction -> expansion lookup, keyed by the lower-cased contraction.
# Each key appears exactly once: a repeated key in a dict literal silently
# overrides the earlier value (this previously turned "we'll" into " will").
# "i'd" is expanded to "I would", in line with the other "'d" entries.
APPO = {
    "aren't": "are not",
    "can't": "cannot",
    "couldn't": "could not",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'll": "he will",
    "he's": "he is",
    "i'd": "I would",
    "i'll": "I will",
    "i'm": "I am",
    "isn't": "is not",
    "it's": "it is",
    "it'll": "it will",
    "i've": "I have",
    "let's": "let us",
    "mightn't": "might not",
    "mustn't": "must not",
    "shan't": "shall not",
    "she'd": "she would",
    "she'll": "she will",
    "she's": "she is",
    "shouldn't": "should not",
    "that's": "that is",
    "there's": "there is",
    "they'd": "they would",
    "they'll": "they will",
    "they're": "they are",
    "they've": "they have",
    "wasn't": "was not",
    "we'd": "we would",
    "we're": "we are",
    "we'll": "we will",
    "weren't": "were not",
    "we've": "we have",
    "what'll": "what will",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "where's": "where is",
    "who'd": "who would",
    "who'll": "who will",
    "who're": "who are",
    "who's": "who is",
    "who've": "who have",
    "won't": "will not",
    "wouldn't": "would not",
    "you'd": "you would",
    "you'll": "you will",
    "you're": "you are",
    "you've": "you have",
    "'re": " are",
    "tryin'": "trying",
}

In [42]:
# Shared NLP helpers used by the cleaning step.
tokenizer = TweetTokenizer()
lem = WordNetLemmatizer()
punc = string.punctuation

# Extra words to drop on top of NLTK's English stop-word list.
extra_stopwords = ['important' ,'ai' ,'ml' ,'blackbelt' ,'program' ,'enrollments','open','seventh','april']

# Final stop-word collection is a set: NLTK's list plus the extras above.
engstopwords = set(stopwords.words('english'))
engstopwords.update(extra_stopwords)

In [43]:
# Colon-based text emoticons (and a bare ':') that are dropped from the tokens.
emo = {
    ':',
    '):',
    '(:',
    ':-)',
    ':))',
}

In [44]:
# Patterns and tables are built once here instead of on every call.
_EMAIL_RE = re.compile(r'[\w.-]+@[\w.-]+')
_WEBSITE_RE = re.compile(r'https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([\w+-a-zA-Z0-9()@:%_\+.~#?&//=]*)')
# Curly apostrophe -> straight apostrophe; periods, hyphens, en dashes, colons
# and curly quotes are deleted.
_CHAR_TABLE = str.maketrans("’", "'", ".-–”:‘“")


def data_cleaning(data):
    """Normalise one free-text description into a space-separated token string.

    Steps, in order: strip e-mail addresses, emojis and http(s) URLs;
    lower-case; spell out digit runs and ``%``; normalise/delete selected
    punctuation characters; tokenize; expand contractions through ``APPO``;
    drop stop words; lemmatize as verbs; drop punctuation-only tokens and the
    emoticons listed in ``emo``; finally remove any remaining apostrophes.

    Args:
        data: the raw text. ``None`` and float NaN (pandas' marker for a
            missing cell) are treated as empty text; any other non-string
            value is converted with ``str()``.

    Returns:
        The cleaned text as a single string (possibly empty).
    """
    # Missing cells would otherwise crash the first regex call.
    if data is None or (isinstance(data, float) and data != data):
        return ''
    data = str(data)

    # Remove e-mail addresses, emojis and web addresses.
    data = _EMAIL_RE.sub('', data)
    data = demoji.replace(data, repl='')
    data = _WEBSITE_RE.sub('', data)

    data = data.lower()
    # Spell out numbers, then the percent sign. The padding keeps "10%" from
    # collapsing into the single pseudo-word "tenpercent".
    data = find_numbers_percent(data)
    data = data.replace('%', ' percent ')
    data = data.translate(_CHAR_TABLE)

    # Tokenize and expand contractions. Expansions are lower-cased and split
    # into separate words so the stop-word filter below sees "do" and "not"
    # rather than one unmatched "do not" token.
    tokens = []
    for token in tokenizer.tokenize(data):
        expansion = APPO.get(token)
        if expansion is None:
            tokens.append(token)
        else:
            tokens.extend(expansion.lower().split())

    tokens = [token for token in tokens if token not in engstopwords]
    tokens = [lem.lemmatize(token, 'v') for token in tokens]
    # Drop tokens made up solely of punctuation characters. A plain
    # `token in punc` would be a substring test against the punctuation string
    # and miss tokens such as ";)".
    tokens = [token for token in tokens if not all(ch in punc for ch in token)]
    tokens = [token for token in tokens if token not in emo]

    # Remove apostrophes left inside tokens.
    return " ".join(tokens).replace("'", "")

In [45]:
# Clean the Description column of every source frame, overwriting it in place.
for frame in (raw_data_toward, raw_data_medium, raw_data_analytics):
    frame['Description'] = frame['Description'].apply(data_cleaning)

#### Store the cleaned data

In [46]:
# Write each cleaned frame to the working directory under its existing
# (extension-less) file name.
for frame, target in (
    (raw_data_toward, 'cleaned_towards_data_science'),
    (raw_data_medium, 'cleaned_medium'),
    (raw_data_analytics, 'cleaned_analytics'),
):
    frame.to_csv(target)