In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import os
from PIL import Image
from sklearn.feature_extraction.text import TfidfVectorizer
import pytesseract
pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files (x86)\Tesseract-OCR\tesseract.exe"

from sklearn.feature_selection import chi2
import numpy as np
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import nltk
import string

In [None]:
# Folders used by the pipeline. The *Tif* folders are searched for TIFF images;
# the *jpg* folders receive the JPEG conversions that are OCR-ed afterwards.
# Every path is written with a trailing path separator.
tifImageFilePath = 'D:\\Temp\\TrainingData\\'
jpgImageFilePath = 'D:\\Temp\\Final_JPEG\\'
tifTestImageFilePath = 'D:\\Temp\\TestingData\\'
jpgTestImageFilePath = 'D:\\Temp\\Final_Testing_JPEG\\'

In [None]:
# Convert all TIFF files to JPEG
def ConvertTifToJpeg(tifImageFilePath, jpgImageFilePath):
    """Convert every TIFF image found under ``tifImageFilePath`` to a JPEG.

    The source folder is walked recursively. Each file whose extension is
    ``.tif`` or ``.tiff`` (case-insensitive) is converted to RGB and saved as
    ``<name>.jpeg`` directly inside ``jpgImageFilePath``.

    Parameters
    ----------
    tifImageFilePath : str
        Folder that is searched (recursively) for TIFF images.
    jpgImageFilePath : str
        Folder that receives the converted JPEG files.

    Returns
    -------
    None
    """
    tiffExtensions = ('.tif', '.tiff')
    for root, _dirs, files in os.walk(tifImageFilePath):
        for file in files:
            stem, extension = os.path.splitext(file)
            # Compare the real extension: a substring test would also pick up
            # names such as 'notes.tif.txt'.
            if extension.lower() not in tiffExtensions:
                continue
            # os.walk descends into sub-folders, so the file lives in `root`,
            # which is not necessarily the top-level folder.
            with Image.open(os.path.join(root, file)) as tifImage:
                # splitext keeps the whole stem ('a.b.tif' -> 'a.b.jpeg') so
                # multi-dot names are not truncated onto the same output file.
                tifImage.convert('RGB').save(os.path.join(jpgImageFilePath, stem + '.jpeg'), 'JPEG')
    return

In [None]:
# Read all JPEG files to create the data frame
def ConvertJpegToDF(jpgImageFilePath):
    """OCR every ``.jpeg`` under ``jpgImageFilePath`` into a labelled data frame.

    For each JPEG the e-mail body is extracted with ``FilterEmailContent`` and
    the label is derived from the file name with ``FileCategory``. Once all
    images are read, ``RemoveAllJpegs`` is called on the folder.

    Parameters
    ----------
    jpgImageFilePath : str
        Folder that is searched (recursively) for ``.jpeg`` files.

    Returns
    -------
    pandas.DataFrame
        One row per image with the columns ``Complaint_Type`` and
        ``Consumer_Complaint_Description``.
    """
    complaintType = []
    complaintDescription = []
    for root, _dirs, files in os.walk(jpgImageFilePath):
        for file in files:
            # Only files that really end in '.jpeg' (not e.g. 'x.jpeg.txt').
            if not file.lower().endswith('.jpeg'):
                continue
            # Join with the folder being walked so nested files resolve, and
            # close the handle before the clean-up below deletes the file.
            with Image.open(os.path.join(root, file)) as jpgImage:
                complaintDescription.append(FilterEmailContent(jpgImage))
            complaintType.append(FileCategory(file))

    tiffDF = pd.DataFrame({
        'Complaint_Type': complaintType,
        'Consumer_Complaint_Description': complaintDescription,
    })
    RemoveAllJpegs(jpgImageFilePath)
    return tiffDF

In [None]:
# Filter the customer complaint email content
def FilterEmailContent(jpgImage):
    """OCR a complaint e-mail image and return the text of its body.

    The body starts one character past the first start keyword that occurs
    (keywords are tried in list order) and ends where the first end keyword
    occurs after that point. If no start keyword is present the body starts
    at the beginning of the text; if no end keyword is present it runs to the
    end. A ``Phone:`` field and a ``Mail: ... .com`` field are then cut out of
    the body.

    Parameters
    ----------
    jpgImage
        Image passed straight to ``pytesseract.image_to_string``.

    Returns
    -------
    str
        The extracted body text.
    """
    # A single OCR pass: the image_to_data result was never used.
    emailContent = pytesseract.image_to_string(jpgImage)
    startBodyKeywords = ['Subject', 'Dear', 'Hello', 'Hi']
    endBodyKeywords = ['Thank you', 'Thanks & Regards', 'Sincerely']

    startIndex = 0
    for keyword in startBodyKeywords:
        keywordIndex = emailContent.find(keyword)
        if keywordIndex >= 0:
            # Skip the keyword itself and the character that follows it.
            startIndex = keywordIndex + len(keyword) + 1
            break

    # Default to the full length; slicing to -1 would drop the last character.
    endIndex = len(emailContent)
    for keyword in endBodyKeywords:
        # Search after the body start so an earlier occurrence (for example
        # in a header) cannot produce an empty body.
        keywordIndex = emailContent.find(keyword, startIndex)
        if keywordIndex >= 0:
            endIndex = keywordIndex
            break

    emailContent = emailContent[startIndex:endIndex]

    # Remove any phone number: 'Phone:' plus the 13 characters that follow it.
    phoneFieldLength = 19
    phoneIndex = emailContent.find('Phone:')
    if phoneIndex >= 0:
        emailContent = emailContent[:phoneIndex] + emailContent[phoneIndex + phoneFieldLength:]

    # Remove any e-mail address: everything from 'Mail:' through the first
    # '.com' that follows it (not a '.com' that appears earlier in the text).
    mailIndex = emailContent.find('Mail:')
    if mailIndex >= 0:
        domainIndex = emailContent.find('.com', mailIndex)
        if domainIndex >= 0:
            emailContent = emailContent[:mailIndex] + emailContent[domainIndex + len('.com'):]

    return emailContent

In [None]:
# Remove all the JPEGs once their content has been read
def RemoveAllJpegs(imagePath):
    """Delete every ``.jpeg`` / ``.jpg`` file found under ``imagePath``.

    The folder is walked recursively and the extension is matched
    case-insensitively.

    Parameters
    ----------
    imagePath : str
        Folder whose JPEG files are deleted.

    Returns
    -------
    None
    """
    jpegExtensions = ('.jpeg', '.jpg')
    for root, _dirs, files in os.walk(imagePath):
        for file in files:
            # Match the end of the name only: a substring test would also
            # delete unrelated files such as 'scan.jpg.txt'.
            if file.lower().endswith(jpegExtensions):
                os.remove(os.path.join(root, file))
    return

In [None]:
# Get the file category
def FileCategory(fileName):
    """Derive the complaint category from an image file name.

    Names containing a space use the first two space-separated parts, with
    the last six characters of the second part dropped (presumably a
    one-character counter plus '.jpeg' - confirm against the naming
    convention). Names without a space use the first two underscore-separated
    parts unchanged.

    Raises IndexError when the name contains neither a space nor an
    underscore.
    """
    parts = fileName.split(' ')
    if len(parts) <= 1:
        # No space in the name: fall back to the underscore convention.
        parts = fileName.split('_')
        return ' '.join((parts[0], parts[1]))
    trimmedSecond = parts[1][:-6]
    return ' '.join((parts[0], trimmedSecond))

In [None]:
def RemoveStopWords(emails):
    """Strip punctuation and English stop words from each e-mail text.

    Parameters
    ----------
    emails : iterable of str
        The texts to clean (a list or a pandas Series of strings).

    Returns
    -------
    list of str
        One cleaned string per input text, with the remaining tokens joined
        by single spaces. Tokens keep their original casing.
    """
    # A set gives constant-time membership tests.
    stopWords = set(nltk.corpus.stopwords.words('english'))
    stripPunctuation = str.maketrans('', '', string.punctuation)

    listOfContent = []
    for emailContent in emails:
        tokens = nltk.tokenize.word_tokenize(emailContent.translate(stripPunctuation))
        # Compare in lower case: NLTK's English stop-word list is lower-case,
        # so 'The' or 'I' would otherwise slip through.
        content = [token for token in tokens if token.lower() not in stopWords]
        listOfContent.append(' '.join(content))

    return listOfContent

In [None]:
# Perform preprocessing on the data
def PreprocessData(tiffDF):
    """Clean the OCR-ed complaints and add a numeric label column.

    Rows with a missing label or description are dropped, ``Complaint_Type``
    is factorized into ``Complaint_Id``, newlines in the description are
    replaced by spaces, and punctuation and stop words are removed with
    ``RemoveStopWords``. A bar chart of the number of descriptions per
    ``Complaint_Id`` is shown so class imbalance can be checked by eye.

    Parameters
    ----------
    tiffDF : pandas.DataFrame
        Frame with the columns ``Complaint_Type`` and
        ``Consumer_Complaint_Description``.

    Returns
    -------
    pandas.DataFrame
        A new frame with ``Complaint_Type``,
        ``Consumer_Complaint_Description`` and ``Complaint_Id``.
    """
    cols = ['Complaint_Type', 'Consumer_Complaint_Description']

    # Drop rows with nulls in either column. The explicit copy makes the
    # column assignments below operate on an independent frame rather than on
    # a slice of the input (avoids SettingWithCopyWarning).
    df = tiffDF[cols].dropna(subset=cols, how='any').copy()

    # Encode the complaint type as an integer id.
    df['Complaint_Id'] = df['Complaint_Type'].factorize()[0]

    # Replace all newline characters with spaces.
    df['Consumer_Complaint_Description'] = df['Consumer_Complaint_Description'].replace('\n', ' ', regex=True)

    # Remove punctuation and stop words.
    df['Consumer_Complaint_Description'] = RemoveStopWords(df['Consumer_Complaint_Description'])

    # Check for imbalanced classes.
    fig, ax = plt.subplots(figsize=(8, 6))
    df.groupby('Complaint_Id').Consumer_Complaint_Description.count().plot.bar(ax=ax, ylim=0)
    ax.set_xlabel('Complaint_Id')
    ax.set_ylabel('Number of complaints')
    ax.set_title('Complaints per class')
    plt.show()

    return df

In [None]:
# Start vectorization
def Vectorization(processedTiffDF):
    """Fit a TF-IDF vectorizer on the complaint descriptions.

    Returns a tuple ``(features, labels, tfidf)``: the dense TF-IDF matrix of
    ``Consumer_Complaint_Description``, the ``Complaint_Id`` column, and the
    fitted vectorizer.
    """
    vectorizerOptions = dict(
        sublinear_tf=True,
        min_df=5,
        norm='l2',
        encoding='latin-1',
        ngram_range=(1, 2),
        stop_words='english',
    )
    tfidf = TfidfVectorizer(**vectorizerOptions)
    descriptions = processedTiffDF.Consumer_Complaint_Description
    sparseFeatures = tfidf.fit_transform(descriptions)
    # Densify the matrix, as the original did.
    features = sparseFeatures.toarray()
    labels = processedTiffDF.Complaint_Id
    return features, labels, tfidf
    

In [None]:
# Start TF-IDF representation
def TFIDFRep(processedDF, features, labels, tfidf):
    """Print the terms most correlated with each complaint type.

    For every complaint type a chi-squared test is run between the feature
    matrix and the one-vs-rest label mask, and the two highest-scoring
    unigrams and bigrams are printed.

    Parameters
    ----------
    processedDF : pandas.DataFrame
        Frame with the columns ``Complaint_Type`` and ``Complaint_Id``.
    features
        Feature matrix passed to ``chi2``.
    labels
        Labels compared against each ``Complaint_Id``.
    tfidf
        Fitted vectorizer that supplies the feature names.

    Returns
    -------
    None
    """
    N = 2
    complaint_id_df = processedDF[['Complaint_Type', 'Complaint_Id']].drop_duplicates().sort_values('Complaint_Id')
    complaint_to_id = dict(complaint_id_df.values)

    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement and fall back only for older releases. The names do not
    # depend on the class, so they are fetched once outside the loop.
    get_names = getattr(tfidf, 'get_feature_names_out', None) or tfidf.get_feature_names
    all_feature_names = np.array(get_names())

    for Complaint_Type, Complaint_Id in sorted(complaint_to_id.items()):
        features_chi2 = chi2(features, labels == Complaint_Id)
        # Ascending sort: the most correlated terms end up last.
        indices = np.argsort(features_chi2[0])
        feature_names = all_feature_names[indices]
        unigrams = [v for v in feature_names if len(v.split(' ')) == 1]
        bigrams = [v for v in feature_names if len(v.split(' ')) == 2]
        print("# '{}':".format(Complaint_Type))
        print("  . Most correlated unigrams:\n. {}".format('\n. '.join(unigrams[-N:])))
        print("  . Most correlated bigrams:\n. {}".format('\n. '.join(bigrams[-N:])))
    return

In [None]:
# Get model params to train model
def GetModelParams(processedDF):
    """Build the TF-IDF training matrix and labels for the classifiers.

    Parameters
    ----------
    processedDF : pandas.DataFrame
        Frame with the columns ``Consumer_Complaint_Description`` and
        ``Complaint_Type``.

    Returns
    -------
    tuple
        ``(X_train_tfidf, y_train, count_vect)``: the TF-IDF feature matrix,
        the ``Complaint_Type`` labels, and the fitted vectorizer.
        ``count_vect.transform(texts)`` yields features in the same TF-IDF
        space as ``X_train_tfidf``.
    """
    X_train = processedDF['Consumer_Complaint_Description']
    y_train = processedDF['Complaint_Type']
    # TfidfVectorizer is CountVectorizer followed by TfidfTransformer in one
    # object. Returning it (instead of the bare CountVectorizer) means callers
    # that run count_vect.transform(...) at prediction time get TF-IDF
    # features, matching what the model was trained on, not raw counts.
    count_vect = TfidfVectorizer()
    X_train_tfidf = count_vect.fit_transform(X_train)
    return X_train_tfidf, y_train, count_vect

In [None]:
# Algo1 - Naive Bayes Classifier
def NaiveBayesClassifier(processedDF):
    """Train a multinomial Naive Bayes model on the processed complaints.

    Returns ``(clf, count_vect)``: the fitted ``MultinomialNB`` and the
    vectorizer returned by ``GetModelParams``.
    """
    trainFeatures, trainLabels, vectorizer = GetModelParams(processedDF)
    classifier = MultinomialNB()
    classifier.fit(trainFeatures, trainLabels)
    return classifier, vectorizer

In [None]:
# Algo2 - Support Vector Classification
def LinearSupportVectorClassification(processedDF):
    """Train a linear support-vector classifier on the processed complaints.

    Returns ``(model, count_vect)``: the fitted ``LinearSVC`` (seeded with
    ``random_state=0``) and the vectorizer returned by ``GetModelParams``.
    """
    trainFeatures, trainLabels, vectorizer = GetModelParams(processedDF)
    svc = LinearSVC(random_state=0)
    svc.fit(trainFeatures, trainLabels)
    return svc, vectorizer

In [None]:
# Algo3 - Logistic Regression
def SKLogisticRegression(processedDF):
    """Train a multinomial logistic-regression model on the processed complaints.

    Returns ``(model, count_vect)``: the fitted ``LogisticRegression``
    (lbfgs solver, ``random_state=101``) and the vectorizer returned by
    ``GetModelParams``.
    """
    trainFeatures, trainLabels, vectorizer = GetModelParams(processedDF)
    logisticOptions = dict(random_state=101, solver='lbfgs', multi_class='multinomial')
    logistic = LogisticRegression(**logisticOptions)
    logistic.fit(trainFeatures, trainLabels)
    return logistic, vectorizer

In [None]:
# Algo4 - Random Forest
def SKRandomForest(processedDF):
    """Train a random-forest classifier on the processed complaints.

    Returns ``(model, count_vect)``: the fitted ``RandomForestClassifier``
    (100 trees, depth limited to 2, ``random_state=101``) and the vectorizer
    returned by ``GetModelParams``.
    """
    trainFeatures, trainLabels, vectorizer = GetModelParams(processedDF)
    forestOptions = dict(n_estimators=100, max_depth=2, random_state=101)
    forest = RandomForestClassifier(**forestOptions)
    forest.fit(trainFeatures, trainLabels)
    return forest, vectorizer

In [None]:
# Prepare the training data: convert the training TIFFs to JPEG, OCR the JPEGs
# into a data frame (ConvertJpegToDF removes the JPEGs when it is done), then
# clean the text and add the numeric Complaint_Id column.
ConvertTifToJpeg(tifImageFilePath, jpgImageFilePath)
df = ConvertJpegToDF(jpgImageFilePath)
processedDF = PreprocessData(df)
# Optional exploration (currently disabled): vectorize the descriptions and
# print the most correlated uni-grams and bi-grams for each category.
#features,labels,tfidf = Vectorization(processedDF)
#Representation of words in vector form for each category for uni-grams & bi-grams
#TFIDFRep(processedDF, features, labels, tfidf)

In [None]:
# Prepare the testing data: convert the test TIFFs to JPEG and OCR them
ConvertTifToJpeg(tifTestImageFilePath, jpgTestImageFilePath)
testDF = ConvertJpegToDF(jpgTestImageFilePath)

# Clean the descriptions: newlines become spaces, then punctuation and stop
# words are removed
testDF['Consumer_Complaint_Description'] = testDF['Consumer_Complaint_Description'].replace('\n', ' ', regex = True)
testDF['Consumer_Complaint_Description'] = RemoveStopWords(testDF['Consumer_Complaint_Description'])

# True labels of the test set, in row order
y_test = testDF['Complaint_Type'].tolist()

In [None]:
# Get the predictions from Naive Bayes
NBClassifier, count_vect = NaiveBayesClassifier(processedDF)
# Vectorize the test descriptions once and reuse them for labels and probabilities
nb_test_features = count_vect.transform(testDF['Consumer_Complaint_Description'])
y_pred = list(NBClassifier.predict(nb_test_features))
print("Accuracy Score: ")
print(accuracy_score(y_test, y_pred) * 100)
confidence_Score = NBClassifier.predict_proba(nb_test_features)

# The highest class probability is reported as the confidence of each prediction
for prediction, cScores in zip(y_pred, confidence_Score):
    print('Prediction: ' + prediction + '\nConfidence Score: ' + str(max(cScores) * 100) + '%\n')

In [None]:
# Get the predictions from Support Vector Machine
SVCClassifier, count_vect = LinearSupportVectorClassification(processedDF)
svc_test_features = count_vect.transform(testDF['Consumer_Complaint_Description'])
y_pred = SVCClassifier.predict(svc_test_features)
print('Predictions: ')
print(y_pred)
print("Accuracy Score: ")
print(accuracy_score(y_test, y_pred) * 100)

# The scores must come from the SVM itself, not from the Naive Bayes model.
# LinearSVC has no predict_proba, so the decision-function margin is reported
# instead of a probability.
confidence_Score = SVCClassifier.decision_function(svc_test_features)
if confidence_Score.ndim == 1:
    # Two classes: a single signed margin per sample.
    top_margins = np.abs(confidence_Score)
else:
    # More than two classes: one margin per class, take the winning one.
    top_margins = confidence_Score.max(axis=1)

for prediction, margin in zip(y_pred, top_margins):
    print('Prediction: ' + prediction + '\nDecision Margin: ' + str(margin) + '\n')

In [None]:
# Get the predictions from LogisticRegression
LogisticRegressionClassifier, count_vect = SKLogisticRegression(processedDF)
lr_test_features = count_vect.transform(testDF['Consumer_Complaint_Description'])
y_pred = LogisticRegressionClassifier.predict(lr_test_features)
print('Predictions: ')
print(y_pred)
print("Accuracy Score: ")
print(accuracy_score(y_test, y_pred) * 100)
# The confidence must come from the model that made the predictions; using the
# Naive Bayes classifier here paired these labels with another model's probabilities.
confidence_Score = LogisticRegressionClassifier.predict_proba(lr_test_features)

for prediction, cScores in zip(y_pred, confidence_Score):
    print('Prediction: ' + prediction + '\nConfidence Score: ' + str(max(cScores) * 100) + '%\n')

In [None]:
# Get the predictions from Random Forest
# The fitted model gets its own name: binding it to `RandomForestClassifier`
# would shadow the imported class and break any re-run of SKRandomForest.
RFClassifier, count_vect = SKRandomForest(processedDF)
rf_test_features = count_vect.transform(testDF['Consumer_Complaint_Description'])
y_pred = RFClassifier.predict(rf_test_features)
print('Predictions: ')
print(y_pred)
print("Accuracy Score: ")
print(accuracy_score(y_test, y_pred) * 100)
# The confidence must come from the model that made the predictions, not from
# the Naive Bayes classifier.
confidence_Score = RFClassifier.predict_proba(rf_test_features)

for prediction, cScores in zip(y_pred, confidence_Score):
    print('Prediction: ' + prediction + '\nConfidence Score: ' + str(max(cScores) * 100) + '%\n')

In [None]:
# Publish the full metrics report for the model whose prediction cell ran last
# (y_pred holds that cell's predictions).
from sklearn import metrics
# y_test and y_pred already contain the complaint-type names, so the report
# labels its rows correctly by itself. Passing target_names taken from the
# training frame (in order of first appearance) could mislabel rows, because
# the report orders labels by sorted value, or fail when the label sets differ.
print(metrics.classification_report(y_test, y_pred))