In [1]:
import numpy as np
import pandas as pd

In [3]:
df = pd.read_csv("complaint_and_topics.csv")

In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,complaint,type
0,0,Debt collection Credit card debt,5
1,1,Good morning my name is XXXX XXXX and I apprec...,1
2,2,I upgraded my XXXX XXXX card in XX/XX/2018 and...,3
3,3,Mortgage Conventional home mortgage,5
4,4,Credit card or prepaid card General-purpose c...,4


In [5]:
import keras
import nltk, re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim.parsing.preprocessing import remove_stopwords, strip_punctuation

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [6]:


def preprocess(texts):
    
    texts = [re.findall(r'\w+', line.lower()) for line in texts]
    # remove stopwords
    texts = [remove_stopwords(' '.join(line)).split() for line in texts]
    # remove punctuation
    texts = [strip_punctuation(' '.join(line)).split() for line in texts]
    # remove words that are only one or two characters
    texts = [[token for token in line if len(token) >2] for line in texts]
    # remove numbers
    texts = [[token for token in line if not token.isnumeric()] for line in texts]
    # lemmatization
    lemmatizer = WordNetLemmatizer()
    texts = [[word for word in lemmatizer.lemmatize(' '.join(line), pos='v').split()] for line in texts]
    texts = [" ".join(line) for line in texts]

    return texts

In [7]:
processed_data = preprocess(df["complaint"])

In [8]:
processed_data

['debt collection credit card debt',
 'good morning xxxx xxxx appreciate help stop chase bank cardmember services wrote chase asking debt verification sent statement acceptable asking bank validate debt instead receiving mail month attempting collect debt right know information consumer chase account xxxx xxxx xxxx xxxx thanks advance help debt collection credit card debt',
 'upgraded xxxx xxxx card told agent upgrade anniversary date change turned agent giving wrong information order upgrade account xxxx changed anniversary date xxxx xxxx consent xxxx recording agent misled credit card prepaid card general purpose credit card charge card',
 'mortgage conventional home mortgage',
 'credit card prepaid card general purpose credit card charge card',
 'checking savings account checking account',
 'checking savings account checking account',
 'mortgage conventional home mortgage',
 'checking savings account checking account',
 'checking savings account checking account',
 'chase card repor

In [9]:
from keras.preprocessing.text import one_hot

In [10]:
voc_size=10000

In [12]:
encoded = [one_hot(words, voc_size) for words in processed_data]

In [13]:
encoded

[[782, 7305, 3348, 1776, 782],
 [9171,
  6297,
  9562,
  9562,
  525,
  3447,
  7797,
  6650,
  2797,
  9782,
  7772,
  4810,
  6650,
  4025,
  782,
  4908,
  2913,
  6855,
  5289,
  4025,
  2797,
  8350,
  782,
  3894,
  1893,
  1386,
  4544,
  7721,
  2111,
  782,
  8439,
  4134,
  1935,
  2987,
  6650,
  3107,
  9562,
  9562,
  9562,
  9562,
  984,
  6510,
  3447,
  782,
  7305,
  3348,
  1776,
  782],
 [7373,
  9562,
  9562,
  1776,
  4725,
  4441,
  1576,
  6432,
  5306,
  6936,
  9549,
  4441,
  7939,
  4533,
  1935,
  2182,
  1576,
  3107,
  9562,
  5992,
  6432,
  5306,
  9562,
  9562,
  6302,
  9562,
  2512,
  4441,
  6226,
  3348,
  1776,
  5440,
  1776,
  3477,
  2395,
  3348,
  1776,
  9407,
  1776],
 [4438, 6545, 4702, 4438],
 [3348, 1776, 5440, 1776, 3477, 2395, 3348, 1776, 9407, 1776],
 [2327, 9571, 3107, 2327, 3107],
 [2327, 9571, 3107, 2327, 3107],
 [4438, 6545, 4702, 4438],
 [2327, 9571, 3107, 2327, 3107],
 [2327, 9571, 3107, 2327, 3107],
 [6650,
  1776,
  4923,
  796

In [14]:
from keras.layers import Embedding, Dense, Flatten
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.preprocessing.text import Tokenizer

In [15]:
padded_encoded = pad_sequences(encoded)

In [17]:
t = Tokenizer()
t.fit_on_texts(processed_data)

In [18]:

vector_size = 200

In [20]:


embeddings_index = dict()   # A dictionary that will hold word and mapped numeric vector
f = open('glove.6B.200d.txt', encoding="utf-8")
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()
print('Loaded %s word vectors.' % len(embeddings_index))

Loaded 400000 word vectors.


In [21]:
matrix_size = len(t.word_index)+1
embedding_matrix = np.zeros((matrix_size, vector_size))
for word,i in t.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [22]:
print(embedding_matrix)

[[ 0.          0.          0.         ...  0.          0.
   0.        ]
 [ 0.39943999  0.70542002 -0.075412   ... -0.0077033   0.044716
   0.78061998]
 [ 0.42771    -0.18483999  0.40634    ...  0.50226003  0.18767001
   0.32892999]
 ...
 [ 0.          0.          0.         ...  0.          0.
   0.        ]
 [ 0.025712    0.20636    -0.61913002 ...  0.52411997  0.82837999
  -0.20194   ]
 [ 0.          0.          0.         ...  0.          0.
   0.        ]]


In [23]:
input_len = len(padded_encoded[1])


In [24]:
input_len

3004

# Model building using Supervised learning

In [25]:
X = padded_encoded
y = pd.get_dummies(df["type"])

In [26]:
X.shape

(78313, 3004)

In [27]:
y.shape

(78313, 5)

In [28]:
model = Sequential()
model.add(Embedding(matrix_size,vector_size,weights=[embedding_matrix],input_length=input_len,trainable=False))
model.add(Flatten())
model.add(Dense(128,activation="relu"))
model.add(Dense(64,activation="relu"))
model.add(Dense(5,activation="softmax"))
model.compile(optimizer="rmsprop",loss="categorical_crossentropy",metrics=["accuracy"])

In [29]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 3004, 200)         5593200   
_________________________________________________________________
flatten_1 (Flatten)          (None, 600800)            0         
_________________________________________________________________
dense_1 (Dense)              (None, 128)               76902528  
_________________________________________________________________
dense_2 (Dense)              (None, 64)                8256      
_________________________________________________________________
dense_3 (Dense)              (None, 5)                 325       
Total params: 82,504,309
Trainable params: 76,911,109
Non-trainable params: 5,593,200
_________________________________________________________________


# Model Training and evaluation

In [30]:
from sklearn.model_selection import train_test_split

In [31]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=0)

In [None]:
model.fit(X_train,y_train,batch_size=50,epochs=10,validation_data=(X_test,y_test))

Train on 62650 samples, validate on 15663 samples
Epoch 1/10
Epoch 2/10

In [None]:
model.save("embedding_model_evaluation")

In [None]:
loss, accuracy = model.evaluate(X,y)

In [None]:
test_doc = ["I checked my bank account and my balance drop suddenly by 50,000 rupess i need your bank services"]

In [None]:
encoded_test_doc = t.texts_to_sequences(test_doc)

In [None]:
encoded_test_doc

In [None]:
padded_encoded_doc = pad_sequences(encoded_test_doc,maxlen=input_len)

In [None]:
model.predict(padded_encoded_doc)