# Text cleaning for the Data Science Challenge 

In [98]:
# Show which Python interpreter version executed this notebook
from platform import python_version

interpreter_version = python_version()
print(interpreter_version)

3.6.3


In [1]:
###########################################
########### Import Libraries ##############
###########################################
import pandas as pd
import os
import re         
import codecs
import string
import subprocess 
import nltk
from nltk.stem.snowball import EnglishStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import TreebankWordTokenizer
from string import digits
from nltk.corpus import stopwords
import html

In [2]:
###########################################
############## Import Data ################
###########################################
# The data extracted from the PowerPoint/PDF files.
# NOTE: this is an absolute, machine-specific path; point it at your own copy
# of results.csv before running the notebook.
data_path = "C:/Users/houeslat/Documents/DS interne/Challenges/Challenge_v0/results.csv"

# Read directly from the full path. Changing the working directory and
# changing it back is unnecessary, and it left the notebook in the wrong
# directory whenever read_csv raised.
data = pd.read_csv(data_path, sep=';')

In [91]:
# Preview the first rows of the extracted data
data.head()

Unnamed: 0,file_name,Client_overview,Issue,Solution,Benefit,Other details
0,2083_E_OMV_BI_Landscape_130917_DT.pdf,,\nissue\n\xef\x82\xa7\xef\x82\xa7 historically...,\nsolution\n\xef\x82\xa7\xef\x82\xa7 focus\nfo...,\nbenefits\n\nref\n(da\nkm\n(da\neng\nnam\ncli...,
1,2052_E_Oil&Gas_BI_Cost_Savings_Elaboration1310...,,\nissue\n\xef\x82\xa7\xef\x82\xa7\n\n\xef\x82\...,\n,\nsolution\nbenefits\nsolution\n\xef\x82\xa7\x...,
2,2046_E_2_OMV_BI_ControllingLandscape_130903.pdf,,\nissue\n\xef\x82\xa7\xef\x82\xa7 deal\ndeal w...,\nsolution\n\xef\x82\xa7\xef\x82\xa7 developme...,\nbenefits\n\xef\x82\xa7\xef\x82\xa7 consisten...,
3,Boeing.pdf,,s\nthe boeing enterprise issued an rfp for a s...,s\n:while boeing is not allowing capgemini to ...,/results\ncapgemini\xe2\x80\x99s contract has ...,\n\nclient name: boeing\ntop line initiative (...


In [45]:
###########################################
##### Define text cleaning function #######
###########################################
def _decode_escapes(text):
    r"""Turn literal backslash escapes (``\n``, ``\xef`` ...) into real characters.

    The text is encoded as Latin-1 with ``backslashreplace`` before being decoded
    with ``unicode_escape`` so that genuine non-ASCII characters survive the
    round trip instead of being turned into mojibake. If the decoded text is
    itself a valid UTF-8 byte sequence (e.g. ``\xe2\x80\x99`` for a curly
    apostrophe), it is re-decoded as UTF-8; otherwise it is returned as is.
    Text containing malformed escapes is returned unchanged rather than raising.
    """
    try:
        decoded = text.encode('latin-1', 'backslashreplace').decode('unicode_escape')
    except UnicodeDecodeError:
        # Malformed escape (e.g. a trailing backslash): keep the text untouched.
        return text
    try:
        # Escaped UTF-8 bytes come out as one Latin-1 character per byte;
        # reassemble them when the whole text allows it.
        return decoded.encode('latin-1').decode('utf-8')
    except (UnicodeEncodeError, UnicodeDecodeError):
        return decoded


def text_cleaning(text, escape_list=None):
    """Clean, tokenise and stem a raw text string.

    Input:
        - text: a string, the text to be cleaned. None or NaN (missing value)
          is accepted and yields an empty string.
        - escape_list: words that must not be transformed by the cleaning
          process (they are only lowercased). Matching is case-insensitive.
          These words are still subject to the stop-word, digit-only and
          one-character filters.
    Output:
        - the cleaned and stemmed tokens joined by single spaces.
    """
    # Missing values (None, or NaN as produced by pandas) have nothing to clean.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    if escape_list is None:
        escape_list = ()
    # Tokens are lowercased in step 6.1, so compare against a lowercased
    # escape set throughout to keep every step consistent.
    escaped = {word.lower() for word in escape_list}

    def is_escaped(token):
        return token.lower() in escaped

    # Stop words from nltk; a set gives constant-time membership tests.
    stop_words = set(stopwords.words('english'))

    # Step 1: parse HTML entities
    text = html.unescape(text)

    # Step 2: decode escaped special characters
    text = _decode_escapes(text)

    # Step 3: tokenise each whitespace-separated word with the Treebank
    # tokenizer; the pieces are re-joined with ',' and split again in step 4.
    tokenizer = TreebankWordTokenizer()
    tokenz = [mot if is_escaped(mot) else ','.join(tokenizer.tokenize(mot))
              for mot in text.split()]

    # Step 4: replace punctuation / special characters with spaces, then
    # re-split to get rid of the extra whitespace this creates.
    tokenz = [mot if is_escaped(mot) else re.sub(r'[^\w\s]', ' ', mot)
              for mot in tokenz]
    tokenz = ' '.join(tokenz).split()

    # Step 5.1: remove stop words (case-sensitive here; re-checked in 6.3)
    tokenz = [token for token in tokenz if token not in stop_words]

    # Step 5.2: drop tokens made only of digits
    tokenz = [token for token in tokenz if not token.isdigit()]

    # Step 5.3: strip digits from inside the remaining tokens
    remove_digits = str.maketrans('', '', digits)
    tokenz = [token if is_escaped(token) else token.translate(remove_digits)
              for token in tokenz]

    # Step 6.1: lowercase the text
    tokenz = [token.lower() for token in tokenz]

    # Step 6.2: stem the text (one stemmer instance for all tokens)
    stemmer = EnglishStemmer()
    tokenz = [token if token in escaped else stemmer.stem(token)
              for token in tokenz]

    # Step 6.3: drop one-character tokens and re-check for stop words after
    # lowercasing and stemming
    tokenz = [token for token in tokenz
              if token not in stop_words and len(token) > 1]

    return ' '.join(tokenz)

In [80]:
# Take the 'Issue' text of the first row as the worked example
text = data.loc[0, 'Issue']
text

'\\nissue\\n\\xef\\x82\\xa7\\xef\\x82\\xa7 historically\\nhistorically grown\\ngrown\\ncontrolling\\nbi\\ncontrolling bi\\nlandscape\\nlandscape lacking\\nlacking aa\\nclear\\nstrategy\\nclear strategy and\\nand\\npositioning\\nof\\ntools\\npositioning of tools\\n\\xef\\x82\\xa7\\xef\\x82\\xa7 data\\ndata modeling\\nmodeling //\\nmaster\\nmaster data\\ndata\\nmaintenance\\nmaintenance rights\\nrights\\ndistributed\\nintensively\\ndistributed intensively\\nacross\\nacross the\\nthe user\\nuser\\ncommunity\\ncommunity\\n\\xef\\x82\\xa7\\xef\\x82\\xa7 no\\nno standards\\nstandards and\\nand\\nclear\\ncontrol\\nfor\\nclear control for data\\ndata\\ndefinitions/\\ndefinitions/ structure/\\nstructure/\\nsources\\nsources (e.g.\\n(e.g. different\\ndifferent\\nnaming\\nconventions\\nnaming conventions\\nfor\\nfor same\\nsame kpis)\\nkpis)\\n\\xef\\x82\\xa7\\xef\\x82\\xa7 consequently,\\nconsequently, high\\nhigh\\nredundancies\\nredundancies in\\nin data\\ndata\\ncubes\\nand\\ncubes and\\ninco

In [46]:
# Run the full cleaning pipeline on the sample text with an empty escape list
text_cleaning(text,[])

'issu histor histor grown grown control bi control bi landscap landscap lack lack aa clear strategi clear strategi posit tool posit tool data data model model master master data data mainten mainten right right distribut intens distribut intens across across user user communiti communiti standard standard clear control clear control data data definit definit structur structur sourc sourc differ differ name convent name convent kpis kpis consequ consequ high high redund redund data data cube cube inconsist inconsist name data lead name data lead ineffici ineffici report report solut'

In [19]:
# Same text, this time passing words in escape_list so that they are not stemmed
text_cleaning(text,["distributed","strategy","inefficiencies"])

'issu histor histor grown grown control bi control bi landscap landscap lack lack aa clear strategy clear strategy posit tool posit tool data data model model master master data data mainten mainten right right distributed intens distributed intens across across user user communiti communiti standard standard clear control clear control data data definit definit structur structur sourc sourc differ differ name convent name convent kpis kpis consequ consequ high high redund redund data data cube cube inconsist inconsist name data lead name data lead inefficiencies inefficiencies report report solut'

# Step 1: Unescape HTML characters

In [81]:
# Build a shorter demo string: keep the first 551 characters of the sample and
# append a few HTML entities, a mixed letter/digit token and a number, so that
# every step below has something to act on.
# NOTE(review): this rebinds `text`; re-running this cell without first
# re-running the cell that loads `text` slices the already-modified string.
escape_list = []
text = text[:551] + ' &lt; &gt; &amp; R2D2 21'
text

'\\nissue\\n\\xef\\x82\\xa7\\xef\\x82\\xa7 historically\\nhistorically grown\\ngrown\\ncontrolling\\nbi\\ncontrolling bi\\nlandscape\\nlandscape lacking\\nlacking aa\\nclear\\nstrategy\\nclear strategy and\\nand\\npositioning\\nof\\ntools\\npositioning of tools\\n\\xef\\x82\\xa7\\xef\\x82\\xa7 data\\ndata modeling\\nmodeling //\\nmaster\\nmaster data\\ndata\\nmaintenance\\nmaintenance rights\\nrights\\ndistributed\\nintensively\\ndistributed intensively\\nacross\\nacross the\\nthe user\\nuser\\ncommunity\\ncommunity\\n\\xef\\x82\\xa7\\xef\\x82\\xa7 no\\nno standards\\nstandards and\\nand\\nclear\\ncontrol &lt; &gt; &amp; R2D2 21'

In [82]:
# Convert HTML entities (&lt; &gt; &amp; ...) back to the characters they stand for
text = html.unescape(text)
text

'\\nissue\\n\\xef\\x82\\xa7\\xef\\x82\\xa7 historically\\nhistorically grown\\ngrown\\ncontrolling\\nbi\\ncontrolling bi\\nlandscape\\nlandscape lacking\\nlacking aa\\nclear\\nstrategy\\nclear strategy and\\nand\\npositioning\\nof\\ntools\\npositioning of tools\\n\\xef\\x82\\xa7\\xef\\x82\\xa7 data\\ndata modeling\\nmodeling //\\nmaster\\nmaster data\\ndata\\nmaintenance\\nmaintenance rights\\nrights\\ndistributed\\nintensively\\ndistributed intensively\\nacross\\nacross the\\nthe user\\nuser\\ncommunity\\ncommunity\\n\\xef\\x82\\xa7\\xef\\x82\\xa7 no\\nno standards\\nstandards and\\nand\\nclear\\ncontrol < > & R2D2 21'

# Step 2: Decode escaped characters

In [83]:
# Interpret literal backslash escapes (a backslash followed by 'n', 'xef', ...)
# as the characters they denote.
# NOTE(review): as the output below shows, an escaped UTF-8 sequence such as
# \xef\x82\xa7 comes out as three separate characters rather than one symbol;
# they are removed by the later punctuation / short-token steps.
text = text.encode('utf8').decode('unicode_escape')
text

'\nissue\nï\x82§ï\x82§ historically\nhistorically grown\ngrown\ncontrolling\nbi\ncontrolling bi\nlandscape\nlandscape lacking\nlacking aa\nclear\nstrategy\nclear strategy and\nand\npositioning\nof\ntools\npositioning of tools\nï\x82§ï\x82§ data\ndata modeling\nmodeling //\nmaster\nmaster data\ndata\nmaintenance\nmaintenance rights\nrights\ndistributed\nintensively\ndistributed intensively\nacross\nacross the\nthe user\nuser\ncommunity\ncommunity\nï\x82§ï\x82§ no\nno standards\nstandards and\nand\nclear\ncontrol < > & R2D2 21'

# Step 3: Text Tokenization

In [84]:
# Split the text on whitespace, then run each word through the Treebank
# tokenizer. The pieces of one word are re-joined with ',' so the list keeps
# one entry per word; words found in escape_list are kept untouched.
tokenizer = TreebankWordTokenizer()
tokenz=[','.join(tokenizer.tokenize(mot)) if mot  not in escape_list else mot  for mot in text.split()  ]
print(tokenz)

['issue', 'ï\x82§ï\x82§', 'historically', 'historically', 'grown', 'grown', 'controlling', 'bi', 'controlling', 'bi', 'landscape', 'landscape', 'lacking', 'lacking', 'aa', 'clear', 'strategy', 'clear', 'strategy', 'and', 'and', 'positioning', 'of', 'tools', 'positioning', 'of', 'tools', 'ï\x82§ï\x82§', 'data', 'data', 'modeling', 'modeling', '//', 'master', 'master', 'data', 'data', 'maintenance', 'maintenance', 'rights', 'rights', 'distributed', 'intensively', 'distributed', 'intensively', 'across', 'across', 'the', 'the', 'user', 'user', 'community', 'community', 'ï\x82§ï\x82§', 'no', 'no', 'standards', 'standards', 'and', 'and', 'clear', 'control', '<', '>', '&', 'R2D2', '21']


In [65]:
# NOTE(review): scratch cell that only prints a hard-coded garbled string; it
# defines nothing used by the other cells. It looks like a check of how the
# bullet bytes render after a second decoding; confirm or remove.
print('Ã¯Â\x82Â§Ã¯Â\x82Â§')

Ã¯ÂÂ§Ã¯ÂÂ§


# Step 4: Remove punctuation

In [85]:
# Replace every character that is neither a word character nor whitespace with
# a space. Besides punctuation, the regex also gets rid of other special characters.
tokenz=[re.sub(r'[^\w\s]',' ',mot) if mot  not in escape_list else mot  for mot in tokenz  ]
print(tokenz)    

['issue', 'ï  ï  ', 'historically', 'historically', 'grown', 'grown', 'controlling', 'bi', 'controlling', 'bi', 'landscape', 'landscape', 'lacking', 'lacking', 'aa', 'clear', 'strategy', 'clear', 'strategy', 'and', 'and', 'positioning', 'of', 'tools', 'positioning', 'of', 'tools', 'ï  ï  ', 'data', 'data', 'modeling', 'modeling', '  ', 'master', 'master', 'data', 'data', 'maintenance', 'maintenance', 'rights', 'rights', 'distributed', 'intensively', 'distributed', 'intensively', 'across', 'across', 'the', 'the', 'user', 'user', 'community', 'community', 'ï  ï  ', 'no', 'no', 'standards', 'standards', 'and', 'and', 'clear', 'control', ' ', ' ', ' ', 'R2D2', '21']


In [86]:
# After dropping punctuation and special characters, remove the extra white
# space by joining everything and splitting again.
tokenz = ' '.join(tokenz).split()
print(tokenz)

['issue', 'ï', 'ï', 'historically', 'historically', 'grown', 'grown', 'controlling', 'bi', 'controlling', 'bi', 'landscape', 'landscape', 'lacking', 'lacking', 'aa', 'clear', 'strategy', 'clear', 'strategy', 'and', 'and', 'positioning', 'of', 'tools', 'positioning', 'of', 'tools', 'ï', 'ï', 'data', 'data', 'modeling', 'modeling', 'master', 'master', 'data', 'data', 'maintenance', 'maintenance', 'rights', 'rights', 'distributed', 'intensively', 'distributed', 'intensively', 'across', 'across', 'the', 'the', 'user', 'user', 'community', 'community', 'ï', 'ï', 'no', 'no', 'standards', 'standards', 'and', 'and', 'clear', 'control', 'R2D2', '21']


# Step 5: Remove stop words and digits

In [87]:
# Get the English stop word list from nltk and show the first ten entries
StopWords = list(set(stopwords.words('english')))
print(StopWords[:10])

['yourselves', 'she', 'its', 'from', 'while', 'those', 'and', 'been', 'doing', 'd']


In [88]:
# Drop the tokens that are stop words. The comparison is case-sensitive here;
# stop words are checked again after lowercasing and stemming.
tokenz=([token for token in tokenz if token not in StopWords])
print(tokenz)

['issue', 'ï', 'ï', 'historically', 'historically', 'grown', 'grown', 'controlling', 'bi', 'controlling', 'bi', 'landscape', 'landscape', 'lacking', 'lacking', 'aa', 'clear', 'strategy', 'clear', 'strategy', 'positioning', 'tools', 'positioning', 'tools', 'ï', 'ï', 'data', 'data', 'modeling', 'modeling', 'master', 'master', 'data', 'data', 'maintenance', 'maintenance', 'rights', 'rights', 'distributed', 'intensively', 'distributed', 'intensively', 'across', 'across', 'user', 'user', 'community', 'community', 'ï', 'ï', 'standards', 'standards', 'clear', 'control', 'R2D2', '21']


In [89]:
# Discard the tokens that consist only of digits
tokenz = [token for token in tokenz if not token.isdigit()]
print(tokenz)

['issue', 'ï', 'ï', 'historically', 'historically', 'grown', 'grown', 'controlling', 'bi', 'controlling', 'bi', 'landscape', 'landscape', 'lacking', 'lacking', 'aa', 'clear', 'strategy', 'clear', 'strategy', 'positioning', 'tools', 'positioning', 'tools', 'ï', 'ï', 'data', 'data', 'modeling', 'modeling', 'master', 'master', 'data', 'data', 'maintenance', 'maintenance', 'rights', 'rights', 'distributed', 'intensively', 'distributed', 'intensively', 'across', 'across', 'user', 'user', 'community', 'community', 'ï', 'ï', 'standards', 'standards', 'clear', 'control', 'R2D2']


In [90]:
# Strip the digit characters out of the remaining tokens (e.g. 'R2D2' -> 'RD');
# tokens found in escape_list are left untouched.
remove_digits = str.maketrans('', '', digits)
tokenz = [token.translate(remove_digits)  if token not in  escape_list else token for token in tokenz   ]
print(tokenz)

['issue', 'ï', 'ï', 'historically', 'historically', 'grown', 'grown', 'controlling', 'bi', 'controlling', 'bi', 'landscape', 'landscape', 'lacking', 'lacking', 'aa', 'clear', 'strategy', 'clear', 'strategy', 'positioning', 'tools', 'positioning', 'tools', 'ï', 'ï', 'data', 'data', 'modeling', 'modeling', 'master', 'master', 'data', 'data', 'maintenance', 'maintenance', 'rights', 'rights', 'distributed', 'intensively', 'distributed', 'intensively', 'across', 'across', 'user', 'user', 'community', 'community', 'ï', 'ï', 'standards', 'standards', 'clear', 'control', 'RD']


# Step 6: Lowercase and stem

In [92]:
# Lowercase every token
tokenz=([token.lower() for token in tokenz])
print(tokenz)

['issue', 'ï', 'ï', 'historically', 'historically', 'grown', 'grown', 'controlling', 'bi', 'controlling', 'bi', 'landscape', 'landscape', 'lacking', 'lacking', 'aa', 'clear', 'strategy', 'clear', 'strategy', 'positioning', 'tools', 'positioning', 'tools', 'ï', 'ï', 'data', 'data', 'modeling', 'modeling', 'master', 'master', 'data', 'data', 'maintenance', 'maintenance', 'rights', 'rights', 'distributed', 'intensively', 'distributed', 'intensively', 'across', 'across', 'user', 'user', 'community', 'community', 'ï', 'ï', 'standards', 'standards', 'clear', 'control', 'rd']


In [93]:
# Stem each token with the Snowball English stemmer; tokens found in
# escape_list are kept as they are. The stemmer is created once and reused
# instead of being rebuilt for every token.
stemmer = EnglishStemmer()
tokenz = [token if token in escape_list else stemmer.stem(token) for token in tokenz]
print(tokenz)
# A lemmatizer can be used instead of a stemmer; see
# https://nlp.stanford.edu/IR-book/html/htmledition/stemming-and-lemmatization-1.html

['issu', 'ï', 'ï', 'histor', 'histor', 'grown', 'grown', 'control', 'bi', 'control', 'bi', 'landscap', 'landscap', 'lack', 'lack', 'aa', 'clear', 'strategi', 'clear', 'strategi', 'posit', 'tool', 'posit', 'tool', 'ï', 'ï', 'data', 'data', 'model', 'model', 'master', 'master', 'data', 'data', 'mainten', 'mainten', 'right', 'right', 'distribut', 'intens', 'distribut', 'intens', 'across', 'across', 'user', 'user', 'communiti', 'communiti', 'ï', 'ï', 'standard', 'standard', 'clear', 'control', 'rd']


In [94]:
# Drop one-character tokens and do a last stop word check, since lowercasing
# and stemming may have produced new stop words.
tokenz=[token for token in tokenz if (token not in  StopWords and len(token)>1) ]
print(tokenz)

['issu', 'histor', 'histor', 'grown', 'grown', 'control', 'bi', 'control', 'bi', 'landscap', 'landscap', 'lack', 'lack', 'aa', 'clear', 'strategi', 'clear', 'strategi', 'posit', 'tool', 'posit', 'tool', 'data', 'data', 'model', 'model', 'master', 'master', 'data', 'data', 'mainten', 'mainten', 'right', 'right', 'distribut', 'intens', 'distribut', 'intens', 'across', 'across', 'user', 'user', 'communiti', 'communiti', 'standard', 'standard', 'clear', 'control', 'rd']


In [95]:
# Final result: join all the stems into one space-separated string
' '.join(tokenz)

'issu histor histor grown grown control bi control bi landscap landscap lack lack aa clear strategi clear strategi posit tool posit tool data data model model master master data data mainten mainten right right distribut intens distribut intens across across user user communiti communiti standard standard clear control rd'

# End!