<div style="text-align: center; display:block">
    <div style="display: inline-block">
        <h1  style="text-align: center">Text Cleaner Module</h1>
        <div style="width:80%; text-align: center"><i>Author:</i> <strong>Soham Mullick</strong> </div>
    </div>
</div>

* This module is used for the following purposes:
    * Clean the text and create processed_prob_desc column
    * Create Bigrams and create bigrammed_processed_prob_desc column
    
##### Module <b>Input</b> : Raw data with problem description
##### Module <b>Output</b> : Cleaned data with bigrammed and processed columns


## Important Concepts

#### Problem Description: 
The raw text the user enters to describe the problem with the product or service of interest

#### Processed Problem Description:
The cleaned and processed version of the raw text

#### Bigrammed Processed Problem Description:
The cleaned and processed problem description after bigrams have been generated on it

### Importing important modules

In [1]:
#Core modules
import pandas as pd 
import configparser
import logging
import time

#Text processing modules
import regex as re # to do pattern matching with string data
from nltk.corpus import stopwords # to get the nltk list of stopwords
from nltk.stem.wordnet import WordNetLemmatizer # to be used for Lemmatization 
from nltk import word_tokenize # to be used for tokenizing of text

#Gensim modules
from gensim.models import Phrases # To create bigrams
from gensim.models.phrases import Phraser 



### Read Config and Create Logger

In [None]:
# Load the config file.
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed, so check that list here to fail with a clear
# message instead of an opaque KeyError on the first section lookup below.
config = configparser.ConfigParser()
if not config.read('./config.ini'):
    raise FileNotFoundError("Could not read './config.ini'")

# [Text_cleaner]: columns to keep (comma-separated string) and input/output paths
colName = config['Text_cleaner']['Main_cols']
raw_file = config['Text_cleaner']['Raw_file']
output_file = config['Text_cleaner']['Output_file']

# [Text_processing]: comma-separated word lists, parsed into lists in the Stop Words cell
no_stop_words = config['Text_processing']['no_stop_words']
added_stop_list = config['Text_processing']['added_stop_list']

# Create logger file; the file name carries a timestamp with minute resolution
logging.basicConfig(
    filename="Text_cleaner_{}.log".format(time.strftime('%b-%d-%Y_%H%M', time.localtime())),
    level=logging.DEBUG,
)

### Getting prepared to clean

Defining Functions to load a file

In [None]:
def getFile(fileName):
    '''
    to load the raw dataset from a CSV file

    fileName: path of the CSV file (change Raw_file in the config to use a different dataset)
    returns: pandas DataFrame with the file contents, read with latin-1 encoding
    raises: FileNotFoundError when fileName does not exist
    '''
    try:
        return pd.read_csv(fileName, encoding='latin-1')
    except FileNotFoundError:
        # Keep the user-facing hint, then re-raise: falling through to a
        # `return raw_data` here would fail with UnboundLocalError because
        # the variable was never assigned.
        print('\n File name not Correct. Please try again')
        raise

def getColumnNames(colName):
    '''
    to get the required list of column names

    colName: comma-separated column names as one string, e.g. 'rma_flag, prob_desc'
    returns: list of column names with all spaces removed; empty entries
             (from a trailing or doubled comma) are dropped so they cannot
             end up as '' in a DataFrame column selection
    '''
    return [name for name in colName.replace(' ', '').split(',') if name]
    

### Cleaning steps

The following snippets clean up the text

In [None]:
def process(document):
    '''
    to clean one problem description and return it as a lower-cased string

    Uses the module-level names `lemmatizer` and `stop_words`, so both must be
    defined before this function is called.

    document: the raw value to clean; it is converted with str() first
    returns: the cleaned text, stripped of leading/trailing whitespace
             (runs of spaces inside the text are not collapsed)
    '''
    document=str(document) 
    # Tokenize, drop tokens whose lower-cased form is in stop_words, lemmatize the rest
    tokens = [lemmatizer.lemmatize((str(t))) for t in word_tokenize(document) if (str(t)).lower() not in stop_words]
    output= " ".join(tokens) # Re-join the lemmatized tokens with single spaces
    # Insert a space at camelCase boundaries (an uppercase letter right after a lowercase one,
    # or a non-initial uppercase letter followed by a lowercase one), then lower-case everything
    output = re.sub(r'((?<=[a-z])[A-Z]|(?<!\A)[A-Z](?=[a-z]))', r' \1',output).lower()
    output =  re.sub(r'\b\d+(?:\.\d+)?\s+', '', output) # Drop standalone integers/decimals together with the whitespace after them
    output =  re.sub(r"(?<=[a-z])\r?\n"," ", output) # Replace a line break that directly follows a lowercase letter with a space
    output = re.sub(r'\W+',r' ',output) # Replace every run of non-word characters with one space
    output =  re.sub(r"'", "", output) # No-op at this point: quotes are non-word characters, already replaced above
    output =  re.sub(r",", " ", output) # No-op at this point: commas are non-word characters, already replaced above
    output =  re.sub(r"\b[a-zA-Z]\b", "", output) # Remove ASCII letters that stand alone as a one-letter word
    output = re.sub(r"(^| ).( |$)",' ',output) # Replace a single character delimited by spaces or the string ends with a space
    output =  re.sub(r'\w*\d\w*', '', output).strip() # Remove every word containing a digit, then strip leading/trailing whitespace
    return output

### Create Bigram

The following functions and classes are used to detect bigrams in the text and create the bigrammed column

Each bigram is turned into a single token by joining its two unigrams with '_'

In [None]:
# Using iterable class to be used by word2vec
class Sentence(object):
    '''
    Re-iterable view over a collection of documents.

    Each iteration pass yields one list of whitespace-separated tokens per
    document; documents are converted with str() before splitting.
    '''
    def __init__(self, doc_list):
        # Materialize the input so the object can be iterated more than once
        self.doc_list = list(doc_list)

    def __iter__(self):
        return (str(entry).split() for entry in self.doc_list)

# Used to identify bigram terms
def bigram_trainer(sentence,common_list,min_count=None,threshold=None):
    '''
    to train a gensim phrase model on the sentences and return its Phraser

    sentence: iterable of token lists (it is passed straight to Phrases)
    common_list: terms passed to Phrases as common_terms
    min_count: Phrases min_count; defaults to the module-level phrase_min_count
    threshold: Phrases threshold; defaults to the module-level bigram_threshold
    returns: Phraser built from the trained Phrases model
    '''
    # Fall back to the notebook's hyper-parameter cell so that editing it
    # actually changes which bigrams are created (min_count used to be
    # hard-coded to 20 here, independent of phrase_min_count).
    if min_count is None:
        min_count = phrase_min_count
    if threshold is None:
        threshold = bigram_threshold
    # NOTE(review): gensim >= 4.0 renamed `common_terms` to `connector_words` - confirm the installed version
    phrase_model = Phrases(sentence,min_count=min_count,threshold=threshold,common_terms=common_list)
    return Phraser(phrase_model)

# Used to create bigram tokens from unigram words
def bigram_maker(bigrammer,data):
    '''
    to apply the bigrammer to every token list and re-join the result

    bigrammer: object indexed with a token list (bigrammer[tokens]) that gives back the transformed tokens
    data: iterable of token lists
    returns: list with one space-joined string per entry in data
    '''
    return [" ".join(bigrammer[token_list]) for token_list in data]

# Create vocabulary using bigram terms
def bigram_counter(sentence,common_list):
    '''
    to build the phrase vocabulary for the given sentences

    Reads the module-level phrase_min_count and bigram_threshold.

    sentence: sentences handed to Phrases.add_vocab
    common_list: terms passed to Phrases as common_terms
    returns: the `vocab` attribute of the Phrases model after add_vocab
    '''
    phrase_model = Phrases(
        min_count=phrase_min_count,
        threshold=bigram_threshold,
        common_terms=common_list,
    )
    phrase_model.add_vocab(sentence)
    return phrase_model.vocab

# Create bigrammed_prob_desc column
def bigram(df,Output_file,common_list=None,saveFile=False):
    '''
    to add the 'bigrammed_processed_prob_desc' column to df (in place)

    df: DataFrame that already has a 'processed_prob_desc' column
    Output_file: path the DataFrame is written to (CSV, no index) when saveFile is True
    common_list: terms handed to the bigram trainer as common terms; when omitted,
                 the module-level added_stop_list is used. A comma-separated
                 string is accepted and split into a list.
    saveFile: write df to Output_file after adding the column
    returns: None
    '''
    # Resolve the default at call time. A default of `common_list=added_stop_list`
    # is evaluated when this def runs, which is before the Stop Words cell turns
    # added_stop_list from the raw config string into a list - the trainer would
    # then receive a plain string instead of a list of terms.
    if common_list is None:
        common_list = added_stop_list
    if isinstance(common_list, str):
        common_list = [term for term in common_list.replace(' ', '').split(',') if term]

    sentence = Sentence(df['processed_prob_desc'])
    bigrammer = bigram_trainer(sentence, common_list)
    df['bigrammed_processed_prob_desc'] = bigram_maker(bigrammer, sentence)
    if saveFile:
        df.to_csv(Output_file, index=False)
    return None

The following attributes decide on how bigrams are going to be created

In [None]:
# Bigram hyper-parameters (read as module-level globals by bigram_counter)
phrase_min_count=20     # Passed to Phrases as min_count; lower value gives more bigrams
bigram_threshold=10     # Passed to Phrases as threshold; lower value gives more bigrams

### Stop Words

In [None]:
def _parse_word_list(raw):
    '''
    to turn a comma-separated config string into a list of words

    Spaces are removed and empty entries are dropped. A value that is already
    a list is accepted too, so this cell can be re-run without failing on the
    names it has rebound.
    '''
    if isinstance(raw, str):
        raw = raw.replace(' ', '').split(',')
    return [word for word in raw if word]

# Words to be considered (kept even though they are in the stop-word list)
no_stop_words = _parse_word_list(no_stop_words)

# Words to be filtered (in addition to the nltk stop words)
added_stop_list = _parse_word_list(added_stop_list)

# Load stop-words: nltk English list plus the added words, minus the words to keep
stop_words = set(stopwords.words('english') + added_stop_list) - set(no_stop_words)

### Load Data

In [None]:
# getColumnNames expects the raw comma-separated string from the config.
# Parse it only once so this cell can be re-run after colName became a list.
if isinstance(colName, str):
    colName = getColumnNames(colName)

# Keep only the configured columns. .copy() makes raw_data an independent
# frame, so adding columns to it later does not trigger pandas'
# SettingWithCopyWarning.
raw_data = getFile(raw_file)[colName].copy()
raw_data.head()

### Basic Info about the Dataset

In [None]:
# Log the overall row count, then the row counts for rma_flag == 0 and rma_flag == 1
logging.debug('Total No. of cases in raw data '+str(raw_data.shape[0]))
logging.debug('Total No. of class1 in raw data '+str(raw_data.loc[raw_data['rma_flag']==0].shape[0]))
logging.debug('Total No. of class2 in raw data '+str(raw_data.loc[raw_data['rma_flag']==1].shape[0]))

### Text Cleaning Steps

In [None]:
# Lemmatizer instance that process() looks up as a module-level name
lemmatizer = WordNetLemmatizer()

# Run process() on every problem description and store the result in a new column
raw_data['processed_prob_desc'] = raw_data['prob_desc'].apply(process)

### Clean Output

In [None]:
# Preview the first rows of raw_data
raw_data.head()

### Create Bigram and Save Output File

In [None]:
# Adds the bigrammed_processed_prob_desc column to raw_data in place and writes the frame to output_file as CSV
bigram(raw_data,output_file,saveFile=True)