In [2]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import classification_report

In [3]:
df = pd.read_csv('smth.csv')
df

Unnamed: 0,complaint_id,complaint_clean,Label
0,3229299,morning help bank chase debt verification stat...,Bank Account services
1,3199379,card agent upgrade anniversary date change age...,Theft/Dispute Reporting
2,3233499,chase card report application submit identity ...,Credit card / Prepaid card
3,3180294,book offer ticket reward card information minu...,Credit card / Prepaid card
4,3224980,son deposit chase account fund bank account pa...,Bank Account services
...,...,...,...
21067,3094545,chase card customer decade solicitation credit...,Credit card / Prepaid card
21068,3091984,credit card provider ask claim purchase protec...,Theft/Dispute Reporting
21069,3133355,risk consumer chase year trust mobile banking...,Theft/Dispute Reporting
21070,3110963,credit yrs chase credit card chase freedom pro...,Mortgage/Loan management


In [11]:
df.dropna(inplace=True)

In [12]:
X = df['complaint_clean']
y = df['Label']

In [17]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer


count_vect=CountVectorizer()
count = count_vect.fit_transform(X)

tfidf_trans=TfidfTransformer()
X_vec = tfidf_trans.fit_transform(count)

In [18]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_vec,y, test_size=0.2)

In [19]:
X_test.shape

(4213, 11387)

In [20]:
X_train.shape

(16848, 11387)

In [21]:
y_train.shape

(16848,)

In [22]:
import pickle
pickle.dump(count_vect, open('count.pkl','wb'))
pickle.dump(tfidf_trans, open('tfidf.pkl','wb'))

In [23]:
# StratifiedKFold ensures that each fold has a representative class distribution, leading to more reliable and consistent evaluation of the model.

def model_build(model, param_grid):
    cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=40)
    grid = GridSearchCV(model,param_grid={}, cv=cv, scoring='f1_weighted', n_jobs=-1)
    grid.fit(X_train, y_train)
    return grid.best_estimator_

## 1. Logistic Regression

In [24]:
clf = LogisticRegression()
para = {
    'penalty': ['l1', 'l2', 'elasticnet', 'none'],
    'solver': ['lbfgs', 'liblinear', 'sag', 'saga'],
    'max_iter': [100, 200, 300, 500],
    'class_weight': [None, 'balanced']
}
lr = model_build(clf, para)

In [25]:
y_pred_train = lr.predict(X_train)
y_pred_test = lr.predict(X_test)

print("Classification Report for training data: \n", classification_report(y_train, y_pred_train))
print("Classification Report for testing data: \n",classification_report(y_test, y_pred_test))

Classification Report for training data: 
                             precision    recall  f1-score   support

     Bank Account services       0.99      0.99      0.99      3772
Credit card / Prepaid card       0.99      0.99      0.99      3260
  Mortgage/Loan management       0.99      0.99      0.99      3981
   Theft/Dispute Reporting       0.98      0.99      0.99      3930
                    others       0.99      0.98      0.99      1905

                  accuracy                           0.99     16848
                 macro avg       0.99      0.99      0.99     16848
              weighted avg       0.99      0.99      0.99     16848

Classification Report for testing data: 
                             precision    recall  f1-score   support

     Bank Account services       0.97      0.97      0.97       954
Credit card / Prepaid card       0.97      0.97      0.97       783
  Mortgage/Loan management       0.98      0.97      0.98      1052
   Theft/Dispute Reporting 

## 2. Multinomial Naive Bayes

In [35]:
clf2 = MultinomialNB()
para = {
    'alpha': [0.01, 0.1, 1.0, 10.0]
}
mnb = model_build(clf2, para)

In [36]:
y_pred2_train = mnb.predict(X_train)
y_pred2_test = mnb.predict(X_test)

print("Classification Report for training data: \n", classification_report(y_train, y_pred2_train))
print("Classification Report for testing data: \n",classification_report(y_test, y_pred2_test))

Classification Report for training data: 
                             precision    recall  f1-score   support

     Bank Account services       0.80      0.88      0.84      3772
Credit card / Prepaid card       0.86      0.76      0.81      3260
  Mortgage/Loan management       0.79      0.91      0.85      3981
   Theft/Dispute Reporting       0.71      0.82      0.76      3930
                    others       0.98      0.33      0.49      1905

                  accuracy                           0.79     16848
                 macro avg       0.83      0.74      0.75     16848
              weighted avg       0.81      0.79      0.78     16848

Classification Report for testing data: 
                             precision    recall  f1-score   support

     Bank Account services       0.77      0.87      0.82       954
Credit card / Prepaid card       0.83      0.71      0.77       783
  Mortgage/Loan management       0.79      0.91      0.84      1052
   Theft/Dispute Reporting 

## 3. Decision Tree classifier

In [37]:
clf3 = DecisionTreeClassifier()
para = {
    'criterion' : ["gini", "entropy", "log_loss"],
    'splitter' : ["best", "random"],
    'max_depth' : [None, 2, 4, 6, 8, 10, 12, 14, 16],
    'min_samples_split': [2, 4, 6, 8, 10,15],
    'min_samples_leaf': [1, 2, 4, 6, 8, 10,12],
    'max_features' : [None, "sqrt", "log2"],
}
dt = model_build(clf3, para)

In [38]:
y_pred3_train = dt.predict(X_train)
y_pred3_test = dt.predict(X_test)

print("Classification Report for training data: \n", classification_report(y_train, y_pred3_train))
print("Classification Report for testing data: \n",classification_report(y_test, y_pred3_test))

Classification Report for training data: 
                             precision    recall  f1-score   support

     Bank Account services       1.00      1.00      1.00      3772
Credit card / Prepaid card       1.00      1.00      1.00      3260
  Mortgage/Loan management       1.00      1.00      1.00      3981
   Theft/Dispute Reporting       1.00      1.00      1.00      3930
                    others       1.00      1.00      1.00      1905

                  accuracy                           1.00     16848
                 macro avg       1.00      1.00      1.00     16848
              weighted avg       1.00      1.00      1.00     16848

Classification Report for testing data: 
                             precision    recall  f1-score   support

     Bank Account services       0.80      0.81      0.80       954
Credit card / Prepaid card       0.83      0.82      0.82       783
  Mortgage/Loan management       0.86      0.86      0.86      1052
   Theft/Dispute Reporting 

## 4. Random Forest Classifier

In [39]:
clf4 = RandomForestClassifier()
para = {
    'n_estimators': [50, 100, 150, 200, 300, 400, 500],
    'criterion' : ["gini", "entropy", "log_loss"],
    'max_depth' : [None, 2, 4, 6, 8, 10, 12, 14, 16],
    'min_samples_split': [2, 4, 6, 8, 10,15],
    'min_samples_leaf': [1, 2, 4, 6, 8, 10,12],
    'max_features' : [None, "sqrt", "log2"],
    'bootstrap' : [True, False],
    'class_weight' : [None, "balanced", "balanced_subsample"]
}
rf = model_build(clf4, para)

In [40]:
y_pred4_train = rf.predict(X_train)
y_pred4_test = rf.predict(X_test)

print("Classification Report for training data: \n", classification_report(y_train, y_pred4_train))
print("Classification Report for testing data: \n",classification_report(y_test, y_pred4_test))

Classification Report for training data: 
                             precision    recall  f1-score   support

     Bank Account services       1.00      1.00      1.00      3772
Credit card / Prepaid card       1.00      1.00      1.00      3260
  Mortgage/Loan management       1.00      1.00      1.00      3981
   Theft/Dispute Reporting       1.00      1.00      1.00      3930
                    others       1.00      1.00      1.00      1905

                  accuracy                           1.00     16848
                 macro avg       1.00      1.00      1.00     16848
              weighted avg       1.00      1.00      1.00     16848

Classification Report for testing data: 
                             precision    recall  f1-score   support

     Bank Account services       0.86      0.92      0.89       954
Credit card / Prepaid card       0.87      0.88      0.87       783
  Mortgage/Loan management       0.88      0.92      0.90      1052
   Theft/Dispute Reporting 

## 5. Gradient Boosting Classifier

In [56]:
clf5 = GradientBoostingClassifier()
para = {
    'n_estimators': [50, 100, 150, 200, 300, 400, 500],
    'loss' : ['log_loss', 'exponential'],
    'learning_rate' : [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
    'criterion' : ['friedman_mse', 'squared_error'],
    'min_samples_split': [2, 4, 6, 8, 10,15],
    'min_samples_leaf': [1, 2, 4, 6, 8, 10,12],
    'max_depth' : [None, 2, 4, 6, 8, 10, 12, 14, 16],
    'max_features' : [None, "sqrt", "log2"],
}
gb = model_build(clf5, para)

In [57]:
y_pred5_train = gb.predict(X_train)
y_pred5_test = gb.predict(X_test)

print("Classification Report for training data: \n", classification_report(y_train, y_pred5_train))
print("Classification Report for testing data: \n",classification_report(y_test, y_pred5_test))

Classification Report for training data: 
                             precision    recall  f1-score   support

     Bank Account services       0.97      0.96      0.96      3772
Credit card / Prepaid card       0.96      0.96      0.96      3260
  Mortgage/Loan management       0.97      0.96      0.97      3981
   Theft/Dispute Reporting       0.92      0.95      0.93      3930
                    others       0.98      0.95      0.96      1905

                  accuracy                           0.95     16848
                 macro avg       0.96      0.95      0.96     16848
              weighted avg       0.96      0.95      0.95     16848

Classification Report for testing data: 
                             precision    recall  f1-score   support

     Bank Account services       0.93      0.92      0.92       954
Credit card / Prepaid card       0.92      0.91      0.91       783
  Mortgage/Loan management       0.94      0.93      0.93      1052
   Theft/Dispute Reporting 

### By observing the reports of all the above models, we conclude that  logistic regression is better as compared to others with a weighted average F1 score of 97% on test data.

In [26]:
pickle.dump(lr, open('model.pkl', 'wb'))

# Prediction

In [28]:
import nltk, spacy, re
import string

In [29]:
def clean_text(text):
    text = text.lower()
    text = nltk.word_tokenize(text)

    text2=[]
    for i in text:
        if i.isalnum():
           text2.append(i)          #removing all special characters

    text=text2.copy()
    text2.clear()
    for i in text:
        if i not in string.punctuation:
            text2.append(i)

    return ' '.join(text2)

In [30]:
import swifter
import en_core_web_sm

# Load the English language model
nlp = spacy.load("en_core_web_sm")

def lemmatize_text(text):
    doc = nlp(text)
    # Lemmatize each token and remove stop words
    lemmatized_text = " ".join([token.lemma_ for token in doc if not token.is_stop])

    return lemmatized_text

In [31]:

def pos_tags(text):
  nn_words = []
  doc = nlp(text)
  for tok in doc:
      if (tok.tag_== 'NN'):
          nn_words.append(tok.lemma_)
  nn_words_str = " ".join(nn_words)
  return nn_words_str

In [55]:
model = pickle.load(open('model.pkl', 'rb'))
count = pickle.load(open('count.pkl', 'rb'))
tfidf = pickle.load(open('tfidf.pkl', 'rb'))

sentence = "I kindly request a full refund of the $50 service fee and an explanation of why this fee was charged. Additionally, I would appreciate confirmation that no further unauthorized charges will be applied to my account"

clean=pos_tags(lemmatize_text(clean_text(sentence)))
k = count.transform([clean])
h = tfidf.transform(k)
model.predict(h)[0]

'others'

In [45]:
pos_tags(lemmatize_text(clean_text(sentence)))

'money bank gift maintainance fee visit bank reason lunch time counter manager'