## Let us import the necessary libraries

In [None]:
import re
import json 
import numpy as np
import pandas as pd
import re, nltk, spacy, string
nlp = spacy.load("en_core_web_sm")
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import pickle
from plotly.offline import plot
import plotly.graph_objects as go
import plotly.express as px

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from pprint import pprint

from textblob import TextBlob
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier

import warnings
warnings.filterwarnings('ignore')
pd.options.display.max_columns=30


## Loading the data

Lets convert the json data into a dataframe

In [None]:
# Read the raw complaints export and flatten the nested JSON records into
# one column per leaf key. The encoding is given explicitly so the read
# does not depend on the platform default.
with open('complaints-2021-05-14_08_16.json', 'r', encoding='utf-8') as json_file:
    f = json.load(json_file)
df=pd.json_normalize(f)

In [None]:
df.shape

We have 78313 rows and 22 columns in the dataset. Let's analyse the data to remove duplicates and handle missing values.

In [None]:
df.head(10)

## Data preparation

The columns of the dataset are not making sense for readability, let's remove the leading underscore and 'source.' text from column names

In [None]:
cols = df.columns
print(cols)

In [None]:
# Strip the leading underscore first, then the literal "source." prefix.
# The dot is escaped so that only a real "." is matched (an unescaped "."
# would also strip prefixes such as "sourceX").
new_columns = [re.sub(r"^_", "", col) for col in cols]
new_columns = [re.sub(r"^source\.", "", col) for col in new_columns]
print(new_columns)

In [None]:
df.columns = new_columns
df.head(10)

The text we need to process is in the complaint_what_happened column; however, some records have a blank value in this column. Let us keep only the records where it is non-blank.

In [None]:
df[df['complaint_what_happened'] != ''].shape


Of the 78k records, only 21k have the complaint column populated. The records with a blank complaint column contain nothing to analyse, so let's remove them and use only the complaint_what_happened column for further processing.

In [None]:
df.columns

In [None]:
# Keep only rows with a non-blank complaint. The explicit .copy() gives an
# independent frame, so later column assignments on `df` do not write into
# a slice of the original data.
df = df[df["complaint_what_happened"] != ""].copy()
df.head()

In [None]:
df["complaint_what_happened"] = df["complaint_what_happened"].astype(str)

Let's see if we have any records with duplicate complaint

In [None]:
df = pd.DataFrame(df["complaint_what_happened"])
type(df)

In [None]:
print('Total number of records with duplicate complaint are', df.duplicated().sum())


Let's remove the duplicate records

In [None]:
# Rebind `df` to the de-duplicated frame and show its new dimensions.
df = df.drop_duplicates()
df.shape

## Prepare the text for topic modeling

Once you have removed all the blank complaints, you need to:

* Make the text lowercase
* Remove text in square brackets
* Remove punctuation
* Remove words containing numbers


Once you have done these cleaning operations you need to perform the following:
* Lemmatize the texts
* Extract the POS tags of the lemmatized text and remove all the words which have tags other than NN[tag == "NN"].


Remove the punctuations from the data and make it lowercase

In [None]:
def preprocess_text(complaint):
    """Normalise one complaint string for topic modelling.

    Steps, in order:
      1. lowercase the text;
      2. remove text enclosed in square brackets (brackets included);
      3. remove punctuation (anything that is neither a word character
         nor whitespace);
      4. remove every word that contains a digit.

    Whitespace is left as-is, so removed words may leave double spaces.

    Parameters
    ----------
    complaint : str
        Raw complaint text.

    Returns
    -------
    str
        The cleaned text.
    """
    complaint = complaint.lower()
    # Bracketed spans must go before punctuation stripping, otherwise the
    # brackets disappear and their content can no longer be identified.
    complaint = re.sub(r'\[[^\]]*\]', '', complaint)
    # Raw strings avoid invalid-escape-sequence warnings for \w, \s and \d.
    complaint = re.sub(r'[^\w\s]', '', complaint)
    complaint = re.sub(r'\w*\d\w*', '', complaint)
    return complaint

In [None]:
df_clean = pd.DataFrame(df['complaint_what_happened'].apply(preprocess_text))
df_clean.head()

Lets now lemmatize the data

In [None]:
def lemmatize_sentence(text):
    """Run `text` through the spaCy pipeline and return the lemmas of all
    tokens joined by single spaces."""
    return " ".join(token.lemma_ for token in nlp(text))

In [None]:
df_clean['complaint_lemmatized'] = df_clean['complaint_what_happened'].apply(lemmatize_sentence)
df_clean.head()

In [None]:
df_clean.shape

Let's keep only the words whose POS tag is NN and drop all other words from the data

In [None]:
def pos_tag(complaint):
    """Return the words of `complaint` that TextBlob tags as 'NN', joined
    by single spaces; every other word is dropped."""
    nouns = []
    for word, tag in TextBlob(complaint).tags:
        if tag == 'NN':
            nouns.append(word)
    return " ".join(nouns)

df_clean["complaint_POS_removed"] = df_clean['complaint_lemmatized'].apply(pos_tag)
df_clean.head()

In [None]:
df_clean

In [None]:
df_clean.shape

## Exploratory data analysis to get familiar with the data.

Write the code in this task to perform the following:

*   Visualise the data according to the 'Complaint' character length
*   Using a word cloud find the top 40 words by frequency among all the articles after processing the text
*   Find the top unigrams, bigrams and trigrams by frequency among all the complaints after processing the text.




In [None]:
char_len = [len(each_sent) for each_sent in df_clean['complaint_POS_removed']]

In [None]:
sns.displot(char_len, kind='hist', bins=60)
plt.xlabel("character length of complaints")
plt.ylabel("Total number of Complaints")
plt.show()

#### Find the top 40 words by frequency among all the articles after processing the text.

In [None]:
# Build the word cloud from the whole corpus. The complaints are joined
# explicitly: str() on a Series returns its (truncated) display
# representation, including index numbers and the Name/Length/dtype footer,
# rather than the text of the complaints themselves.
stopwords = set(STOPWORDS)
corpus_text = " ".join(df_clean['complaint_POS_removed'].astype(str))
wordcloud = WordCloud(max_font_size=60, max_words=40,
                      background_color="white", random_state=100,
                      stopwords=stopwords).generate(corpus_text)
plt.figure(figsize=[20,20])
plt.imshow(wordcloud)
plt.axis("off")
plt.show()

We can see that words such as credit, card, account, bank, ticket, agent and cardmember are among the top 40 words that occur most often in the complaints

In [None]:
df_clean['Complaint_clean'] = df_clean['complaint_POS_removed'].str.replace('-PRON-', '')
df_clean.shape

#### Find the top unigrams,bigrams and trigrams by frequency among all the complaints after processing the text.

In [None]:
# Define a function to get word / words based on ngram frequency
def get_top_ngrams(text, n=None, ngram=(1,1)):
    """Return the most frequent n-grams in a collection of documents.

    Parameters
    ----------
    text : iterable of str
        The documents to count n-grams in.
    n : int or None, default None
        Number of entries to return; None returns all of them.
    ngram : tuple of (int, int), default (1, 1)
        Passed to CountVectorizer as `ngram_range`.

    Returns
    -------
    list of (str, count) tuples
        N-grams with their total count over all documents, sorted by
        count in descending order. English stop words are excluded.
    """
    vectorizer = CountVectorizer(stop_words='english', ngram_range=ngram)
    # fit_transform learns the vocabulary and counts in a single pass,
    # instead of tokenizing the corpus once for fit and again for transform.
    bagofwords = vectorizer.fit_transform(text)
    totals = bagofwords.sum(axis=0)
    words_frequency = [(term, totals[0, column])
                       for term, column in vectorizer.vocabulary_.items()]
    words_frequency.sort(key=lambda pair: pair[1], reverse=True)
    return words_frequency[:n]

In [None]:
# Define a function to show the words frequency
def display_ngram_distribution(data, ngram_label=None):
    """Draw a bar chart of n-gram counts.

    Parameters
    ----------
    data : pandas.DataFrame
        Must have a 'words' column (the n-grams) and a 'count' column.
    ngram_label : str or None, default None
        Name used for the x-axis label and the title (e.g. "Bigram").
        When None it is derived from the number of space-separated words
        in the entries of `data['words']`: 1 -> "Unigram", 2 -> "Bigram",
        3 -> "Trigram", anything else or mixed sizes -> "N-gram".

    The title reports the number of rows actually plotted rather than a
    fixed number.
    """
    if ngram_label is None:
        sizes = {len(str(entry).split()) for entry in data['words']}
        size = sizes.pop() if len(sizes) == 1 else None
        ngram_label = {1: "Unigram", 2: "Bigram", 3: "Trigram"}.get(size, "N-gram")
    plt.figure(figsize=[20,6])
    sns.barplot(x=data['words'], y=data['count'])
    plt.xticks(rotation=45)
    plt.xlabel(ngram_label)
    plt.ylabel("Count")
    plt.title(f"Count of top {len(data)} {ngram_label}s")
    plt.show()

Let's fetch the top 30 unigram words frequencies from the complaints dataset

In [None]:
top_30words = get_top_ngrams(df_clean['Complaint_clean'].values.astype('U'), n=30, ngram=(1,1))
df_unigram = pd.DataFrame(top_30words, columns=['words', 'count'])
df_unigram

Top 10 words in the unigram frequency are

In [None]:
top_10 = df_unigram.sort_values(by = "count", ascending=False).iloc[:10]
top_10.head(15)

In [None]:
display_ngram_distribution(top_10)

Let us find top 30 bigram word frequencies in the dataset

In [None]:
top_30words = get_top_ngrams(df_clean['Complaint_clean'].values.astype('U'), n=30, ngram=(2,2))
df_bigram = pd.DataFrame(top_30words, columns=['words', 'count'])
df_bigram

In [None]:
top_10 = df_bigram.sort_values(by = "count", ascending=False).iloc[:10]
top_10.head(15)

In [None]:
display_ngram_distribution(top_10)

In [None]:
top_30words = get_top_ngrams(df_clean['Complaint_clean'].values.astype('U'), n=30, ngram=(3,3))
df_trigram = pd.DataFrame(top_30words, columns=['words', 'count'])
df_trigram

In [None]:
top_10 = df_trigram.sort_values(by = "count", ascending=False).iloc[:10]
top_10.head(15)

In [None]:
display_ngram_distribution(top_10)

## The personal details of customer has been masked in the dataset with xxxx. Let's remove the masked text as this will be of no use for our analysis

In [None]:
df_clean['Complaint_clean'] = df_clean['Complaint_clean'].str.replace('xxxx','')
df_clean.head()

In [None]:
df_clean.shape

## Feature Extraction
Convert the raw texts to a matrix of TF-IDF features

**max_df** is used for removing terms that appear too frequently, also known as "corpus-specific stop words"
max_df = 0.95 means "ignore terms that appear in more than 95% of the complaints"

**min_df** is used for removing terms that appear too infrequently
min_df = 2 means "ignore terms that appear in less than 2 complaints"

In [None]:
tfidf = TfidfVectorizer(min_df=2, max_df=0.95, stop_words='english')
df_transformed = tfidf.fit_transform(df_clean['Complaint_clean'])
print(df_transformed)

## Topic Modelling using NMF

In [None]:
from sklearn.decomposition import NMF

In [None]:
num_topics = 5
# Fix the seed so the factorisation, and therefore the order of the topics,
# is the same on every run. The topic labels assigned further down are tied
# to the component index.
nmf_model = NMF(n_components=num_topics, random_state=42)

In [None]:
nmf_model.fit(df_transformed)
len(tfidf.get_feature_names_out())

In [None]:
H = nmf_model.components_    # Topic-term matrix
print(H.shape)

In [None]:
#Print the Top15 words for each of the topics
words = np.array(tfidf.get_feature_names_out())
top_n = 15
row_labels = [f'Topic {k + 1}' for k in range(num_topics)]
col_labels = [f'Word {k + 1}' for k in range(top_n)]
# For each topic row of H, take the indices of the largest weights
# (descending) and look up the corresponding vocabulary terms.
topic_words = pd.DataFrame(
    [words[np.argsort(H[k])[::-1][:top_n]] for k in range(num_topics)],
    index=row_labels,
    columns=col_labels,
)

topic_words

**From observing above table and the top 15 words being identified against the topic, we can assign below categories to the topics**

- Topic 1 = Bank account services
- Topic 2 = Credit card / Prepaid card
- Topic 3 = Others
- Topic 4 = Theft/Dispute reporting
- Topic 5 = Mortgages/loans

In [None]:
topic_results = nmf_model.transform(df_transformed)
print(topic_results)

In [None]:
df_clean['Topic'] = topic_results.argmax(axis=1)
df_clean.head()

In [None]:
# Let's print the first 5 Complaint for each of the Topics
df_clean_grouped=df_clean.groupby('Topic').head(5)
df_clean_grouped.sort_values('Topic')

#### After evaluating the mapping, if the topics assigned are correct then assign these names to the relevant topic:
* Bank Account services
* Credit card or prepaid card
* Theft/Dispute Reporting
* Mortgage/Loan
* Others

In [None]:
# NMF component index -> topic label. The order follows the interpretation
# of the top-15-words table given earlier in this notebook: Topic 1 = Bank
# account services, Topic 2 = Credit card / Prepaid card, Topic 3 = Others,
# Topic 4 = Theft/Dispute reporting, Topic 5 = Mortgages/loans (component
# index = topic number - 1).
# NOTE(review): re-check against the table whenever the NMF model is refit.
Topic_names = {  0 : "Bank Account services", 1: "Credit card or prepaid card", 2: "Others", 3: "Theft/Dispute Reporting", 4: "Mortgage/Loan" }
df_clean['Topic'] = df_clean['Topic'].map(Topic_names)

In [None]:
df_clean

## Supervised model to predict any new complaints to the relevant Topics.

You have now built the model that assigns a topic to each complaint. In the section below you will use these topics to classify any new complaints.

Since you will be using supervised learning technique we have to convert the topic names to numbers(numpy arrays only understand numbers)

In [None]:
Topic_names = {  "Bank Account services": 0, "Credit card or prepaid card": 1, "Theft/Dispute Reporting": 2, "Mortgage/Loan": 3, "Others": 4 }

df_clean['Topic'] = df_clean['Topic'].map(Topic_names)

In [None]:
df_clean

In [None]:
training_data= df_clean[["complaint_what_happened","Topic"]]

In [None]:
training_data

#### Apply the supervised models on the training data created. In this process, you have to do the following:
* Create the vector counts using Count Vectoriser
* Transform the word vector to tf-idf
* Create the train & test data using the train_test_split on the tf-idf & topics


Below code is to convert the sentences into a tfidf matrix which will be used as an input to the Supervised learning model

In [None]:
# Token counts for every complaint; the learned vocabulary is persisted so
# it can be reused later.
vect = CountVectorizer()
X_train_cnt = vect.fit_transform(training_data['complaint_what_happened'])
# `with` guarantees the file is flushed and closed after the dump.
with open("count_vector.pk1", "wb") as vocab_file:
    pickle.dump(vect.vocabulary_, vocab_file)

# Re-weight the counts to TF-IDF and persist the fitted transformer.
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_cnt)
with open('tfidf.pk1', "wb") as tfidf_file:
    pickle.dump(tfidf_transformer, tfidf_file)

### Logistic Regression

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_train_tfidf, training_data['Topic'], test_size=0.25, random_state=42)

In [None]:
logreg = LogisticRegression().fit(X_train, y_train)
logreg.score(X_test, y_test)


With the logistic regression, we got a model accuracy of 91%. Let's save this model before going ahead with the Decision Tree classifier

In [None]:
# Persist the fitted logistic regression model; `with` closes the file
# handle once the dump has finished.
with open("logreg_model.pk1", "wb") as model_file:
    pickle.dump(logreg, model_file)

### Decision Tree classifier

In [None]:
decisionTree = DecisionTreeClassifier(max_depth=6, random_state=42).fit(X_train, y_train)

In [None]:
decisionTree.score(X_test, y_test)

The score we are getting for the Decision Tree Classifier with max_depth = 6 is 71%. Let's tune some hyperparameters

In [None]:
dt_grid = {"max_depth": [3,5,7,9],
           "min_samples_split": [5,10,15,20,30],
           "min_samples_leaf": [5,10,15,20,30]}

In [None]:
decision_tree_tuned = GridSearchCV(DecisionTreeClassifier(random_state=42),
                                param_grid=dt_grid,
                                cv=5,
                                verbose=True,
                                n_jobs=-1)

decision_tree_tuned.fit(X_train, y_train);

In [None]:
best_params = decision_tree_tuned.best_params_
best_params

In [None]:
# Rebuild a decision tree from the hyperparameters chosen by the grid
# search and fit it on the training split.
decision_tree_modified_params = DecisionTreeClassifier(
    max_depth=best_params["max_depth"],
    min_samples_split=best_params["min_samples_split"],
    min_samples_leaf=best_params["min_samples_leaf"],
    random_state=42,
)
decision_tree_modified_params.fit(X_train, y_train)

In [None]:
decision_tree_modified_params.score(X_test, y_test)

With the hyperparameter tuning, we are getting the score as 76%.

Let's save the decision tree model

In [None]:
# Persist the tuned decision tree; `with` closes the file handle once the
# dump has finished.
# NOTE(review): the file is named "rf_model" although the object saved is a
# DecisionTreeClassifier. The name is kept unchanged in case other code
# loads this path.
with open("rf_model.pk1", "wb") as model_file:
    pickle.dump(decision_tree_modified_params, model_file)

#### Now that we have tried both Logistic Regression and Decision Tree models with 91 & 76 percent accuracy. Let's test these models on a test sample. Here we are creating a sample sentence which will be fed to the model after converting them into tfidf matrix format. Let's see the output we get

In [None]:
sentence = "I am getting error while using credit card at the mall. It got declined twice. I tried changing the pin at ATM but same issue occurred again"
print(sentence)

Now clearly above message is about a customer having issues with using credit card. The correct category for this sentence is Credit card / Prepaid card. The numeric representation for this category should be 1. Let's see the dictionary of numeric vs string categories.

In [None]:
print(Topic_names)

In [None]:
sample_complaint= np.array([sentence])
print(sample_complaint)

In [None]:
X_train_2 = vect.transform(sample_complaint)
X_train_tfidf_2 = tfidf_transformer.transform(X_train_2)


In [None]:
X_train_tfidf_2

In [None]:
y_pred = logreg.predict(X_train_tfidf_2)
y_pred

The logistic regression model has correctly predicted the sentence to be of category 1 which is of category Credit card / Prepaid card

In [None]:
y_pred = decision_tree_modified_params.predict(X_train_tfidf_2)
y_pred

We can see that the decision tree model has also predicted correct category.