# NLP Data Cleaner


This module cleans the text of each element in a DataFrame column or a list.

Input :
- Column of the DataFrame (or a list of strings)
- digit (True/False) -> remove all the digits,
- punc (True/False) -> remove all the punctuation,
- sw (True/False) -> remove all the English stopwords.

Output :
DataFrame with columns:
- "Cleaned Text" : the cleaned text
- "Dates" : all the dates found in the text.


Usage : 

       nlpDC = NLPDataCleaner()
       res_df = nlpDC.cleanDfColumn(myDf["col_name"], digit = True, punc = True, sw = True)

In [48]:
# Imports used

import pandas as pd
from nltk.corpus import stopwords
import datefinder
import string


In [74]:
class NLPDataCleaner:
    """Clean text values and collect the dates mentioned in them."""

    def cleanDfColumn(self, dfColumn, digit=True, punc=True, sw=True):
        """Return a DataFrame with the cleaned text and the dates found in it.

        Args:
            dfColumn: DataFrame column (or list) of strings to clean.
            digit: When truthy, remove every run of digits.
            punc: When truthy, remove every character listed in
                ``string.punctuation``.
            sw: When truthy, remove NLTK's English stopwords. Matching is
                case-sensitive, and the surviving words are re-joined with
                single spaces, so whitespace is only normalised when this
                step runs.

        Returns:
            A DataFrame with two columns: ``'Cleaned Text'`` holding the
            cleaned strings and ``'Dates'`` holding, per row, the list of
            matches yielded by ``datefinder.find_dates`` on the original
            text.
        """
        res_df = pd.DataFrame({'Cleaned Text': dfColumn})

        # Dates are extracted first: the digit and punctuation passes below
        # would otherwise destroy them.
        res_df['Dates'] = [
            list(datefinder.find_dates(text)) for text in res_df['Cleaned Text']
        ]

        if digit:
            # Raw string: '\d' in a plain literal is an invalid escape sequence.
            res_df['Cleaned Text'] = res_df['Cleaned Text'].replace(
                r'\d+', '', regex=True)

        if punc:
            # One translation table deletes all punctuation in a single pass.
            strip_punc = str.maketrans('', '', string.punctuation)
            res_df['Cleaned Text'] = res_df['Cleaned Text'].apply(
                lambda text: text.translate(strip_punc))

        # Honour the `sw` flag; previously stopwords were removed regardless.
        if sw:
            # A set gives constant-time membership tests per word.
            stop = set(stopwords.words('english'))
            res_df['Cleaned Text'] = res_df['Cleaned Text'].apply(
                lambda text: ' '.join(
                    word for word in text.split() if word not in stop))

        return res_df

# Usage :

In [75]:
# Sample rows mixing plain words, punctuation and dates in several formats.
myDf = pd.DataFrame(
    [
        'what is my name 11 Nov, 2011 and 11 jan',
        'jkdfskkkd11/2/2011',
        'jgyutyuedf_kjfjk?jjhgd...12/2/2011',
        'jfjfde__13/2/2011',
    ],
    columns=['col_name'],
)

# Run the cleaner on the sample column with every cleaning step enabled.
nlpDC = NLPDataCleaner()
res_df = nlpDC.cleanDfColumn(
    myDf["col_name"],
    digit=True,
    punc=True,
    sw=True,
)

In [76]:
# Display the cleaned text next to the dates extracted from each row.
print(res_df)

           Cleaned Text                                       Dates
0          name Nov jan  [2011-11-11 00:00:00, 2021-01-11 00:00:00]
1             jkdfskkkd                       [2011-11-02 00:00:00]
2  jgyutyuedfkjfjkjjhgd                       [2011-12-02 00:00:00]
3                jfjfde                       [2011-02-13 00:00:00]
