In [74]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report
from textblob import TextBlob


In [75]:
# Load Bank tablular data
csv_path = 'bank-data/bank-tabular.csv'
df = pd.read_csv(csv_path, sep='\t')

df


Unnamed: 0,customer_id,date,customer_gender,customer_age,customer_location,customer_type,has_cc,has_mortgage,convenience,customer_service,online_banking,interest_rates,fees_charges,community_involvement,products_services,privacy_security,reputation,satisfied
0,216604,2022-08-22,Male,50.0,Munster,Personal,True,False,4.0,5.0,4.0,4.0,4.0,4.0,5.0,2.0,4.0,True
1,259276,2022-11-23,Female,61.0,Leinster,Personal,True,False,5.0,5.0,5.0,3.0,5.0,4.0,4.0,5.0,5.0,True
2,265459,2022-01-21,Female,63.0,Munster,Business,True,False,2.0,2.0,5.0,5.0,2.0,,4.0,4.0,,True
3,58770,2022-03-13,f,,Leinster,Business,True,False,,4.0,5.0,1.0,5.0,5.0,5.0,5.0,5.0,True
4,318031,2022-08-08,Female,41.0,Leinster,Personal,True,True,1.0,1.0,1.0,1.0,2.0,2.0,4.0,5.0,2.0,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2995,322582,2021-09-23,Male,41.0,Munster,Personal,True,True,3.0,3.0,3.0,3.0,5.0,3.0,3.0,1.0,5.0,False
2996,53418,2021-03-07,f,57.0,Munster,Business,False,False,3.0,2.0,5.0,1.0,2.0,2.0,2.0,2.0,3.0,True
2997,79364,2021-08-01,m,,Munster,Personal,True,True,3.0,3.0,3.0,4.0,4.0,3.0,4.0,4.0,4.0,False
2998,371134,2021-06-25,m,42.0,Leinster,Business,False,False,3.0,2.0,1.0,5.0,4.0,4.0,3.0,4.0,1.0,True


In [76]:
# Load Bank comments data
csv_path = 'bank-data/bank-comments.csv'
df = pd.read_csv(csv_path, sep='\t')

df


Unnamed: 0,customer_id,date,comments
0,216604,2022-08-22,"Overal, this bank is satisfactory."
1,259276,2022-11-23,Easy to find zhe bank ' s branches and ATMs. A...
2,265459,2022-01-21,Bank's phone app is really great. In general a...
3,58770,2022-03-13,
4,318031,2022-08-08,
...,...,...,...
2995,322582,2021-09-23,No comment
2996,53418,2021-03-07,Online banking is really good
2997,79364,2021-08-01,customer service quality from this bank is ter...
2998,371134,2021-06-25,Great to see that my bank supports local sport...


## Bank comments clean up

In [77]:
# Clean data dealing with NaN's
# Enum satisfaction = [bad, neutral, good, unknown]

# def get_satisfaction(comments):
#     if pd.isna(comments):
#         return "neutral"
#     else:
#         return "unknown"

# apply function to DataFrame and assign result to new column
# df['satisfaction'] = df['comments'].apply(get_satisfaction)jj

#  drop NaN's
df['comments'] = df['comments'].fillna('neutral')

# print resulting DataFrame
df

Unnamed: 0,customer_id,date,comments
0,216604,2022-08-22,"Overal, this bank is satisfactory."
1,259276,2022-11-23,Easy to find zhe bank ' s branches and ATMs. A...
2,265459,2022-01-21,Bank's phone app is really great. In general a...
3,58770,2022-03-13,neutral
4,318031,2022-08-08,neutral
...,...,...,...
2995,322582,2021-09-23,No comment
2996,53418,2021-03-07,Online banking is really good
2997,79364,2021-08-01,customer service quality from this bank is ter...
2998,371134,2021-06-25,Great to see that my bank supports local sport...


In [78]:

# sentence = "I hate not like pizza"
# blob = TextBlob(sentence)

# if blob.sentiment.polarity > 0:
#     print("Positive sentiment")
# elif blob.sentiment.polarity == 0:
#     print("Neutral sentiment")
# else:
#     print("Negative sentiment")

# Iterate over the rows of the dataframe
for index, row in df.iterrows():
    # Get the comment from the current row
    comment = row["comments"]
    
    # Analyze the sentiment of the comment using TextBlob
    blob = TextBlob(comment)
    sentiment_score = blob.sentiment.polarity
    
    # Add the sentiment score to the "satisfaction" column of the current row
    df.at[index, "satisfaction"] = sentiment_score


In [79]:
df

Unnamed: 0,customer_id,date,comments,satisfaction
0,216604,2022-08-22,"Overal, this bank is satisfactory.",0.000000
1,259276,2022-11-23,Easy to find zhe bank ' s branches and ATMs. A...,0.616667
2,265459,2022-01-21,Bank's phone app is really great. In general a...,0.516667
3,58770,2022-03-13,neutral,0.000000
4,318031,2022-08-08,neutral,0.000000
...,...,...,...,...
2995,322582,2021-09-23,No comment,0.000000
2996,53418,2021-03-07,Online banking is really good,0.700000
2997,79364,2021-08-01,customer service quality from this bank is ter...,-0.666667
2998,371134,2021-06-25,Great to see that my bank supports local sport...,0.360000


In [80]:
# check distribution of classes
print(df['satisfaction'].value_counts())

# split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df['comments'], df['satisfaction'], random_state=0)

# transform text data into numerical features using bag of words approach
vectorizer = CountVectorizer()
X_train_counts = vectorizer.fit_transform(X_train)
X_test_counts = vectorizer.transform(X_test)

# train and evaluate models
models = [
    ('naive bayes', MultinomialNB()),
    ('logistic regression', LogisticRegression(random_state=0)),
    ('linear SVM', LinearSVC(random_state=0))
]

for name, clf in models:
    # evaluate model using cross-validation
    scores = cross_val_score(clf, X_train_counts, y_train, cv=5)
    print(f"{name} cross-validation accuracy: {scores.mean():.2f}")
    
    # tune hyperparameters using grid search
    if name == 'logistic regression':
        param_grid = {'C': [0.1, 1, 10]}
        grid_search = GridSearchCV(clf, param_grid, cv=5)
        grid_search.fit(X_train_counts, y_train)
        print(f"Best hyperparameters for {name}: {grid_search.best_params_}")
        clf = grid_search.best_estimator_
        
    # train final model on full training set
    clf.fit(X_train_counts, y_train)
    
    # evaluate model on test set
    y_pred = clf.predict(X_test_counts)
    score = accuracy_score(y_test, y_pred)
    print(f"{name} test accuracy: {score:.2f}")
    print(classification_report(y_test, y_pred))

 0.000000    1073
 0.500000      92
 0.250000      81
 0.200000      68
-0.400000      57
             ... 
 0.385417       1
 0.422222       1
-0.265000       1
 0.370833       1
 0.360000       1
Name: satisfaction, Length: 507, dtype: int64


ValueError: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/homebrew/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/homebrew/lib/python3.10/site-packages/sklearn/naive_bayes.py", line 753, in fit
    Y = labelbin.fit_transform(y)
  File "/opt/homebrew/lib/python3.10/site-packages/sklearn/utils/_set_output.py", line 140, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
  File "/opt/homebrew/lib/python3.10/site-packages/sklearn/preprocessing/_label.py", line 334, in fit_transform
    return self.fit(y).transform(y)
  File "/opt/homebrew/lib/python3.10/site-packages/sklearn/preprocessing/_label.py", line 311, in fit
    self.classes_ = unique_labels(y)
  File "/opt/homebrew/lib/python3.10/site-packages/sklearn/utils/multiclass.py", line 107, in unique_labels
    raise ValueError("Unknown label type: %s" % repr(ys))
ValueError: Unknown label type: (array([0.  , 0.  , 0.  , ..., 0.  , 0.35, 0.  ]),)

--------------------------------------------------------------------------------
3 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/homebrew/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/homebrew/lib/python3.10/site-packages/sklearn/naive_bayes.py", line 753, in fit
    Y = labelbin.fit_transform(y)
  File "/opt/homebrew/lib/python3.10/site-packages/sklearn/utils/_set_output.py", line 140, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
  File "/opt/homebrew/lib/python3.10/site-packages/sklearn/preprocessing/_label.py", line 334, in fit_transform
    return self.fit(y).transform(y)
  File "/opt/homebrew/lib/python3.10/site-packages/sklearn/preprocessing/_label.py", line 311, in fit
    self.classes_ = unique_labels(y)
  File "/opt/homebrew/lib/python3.10/site-packages/sklearn/utils/multiclass.py", line 107, in unique_labels
    raise ValueError("Unknown label type: %s" % repr(ys))
ValueError: Unknown label type: (array([ 0.16666667,  0.35      , -0.25      , ...,  0.        ,
        0.35      ,  0.        ]),)

--------------------------------------------------------------------------------
1 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/homebrew/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/homebrew/lib/python3.10/site-packages/sklearn/naive_bayes.py", line 753, in fit
    Y = labelbin.fit_transform(y)
  File "/opt/homebrew/lib/python3.10/site-packages/sklearn/utils/_set_output.py", line 140, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
  File "/opt/homebrew/lib/python3.10/site-packages/sklearn/preprocessing/_label.py", line 334, in fit_transform
    return self.fit(y).transform(y)
  File "/opt/homebrew/lib/python3.10/site-packages/sklearn/preprocessing/_label.py", line 311, in fit
    self.classes_ = unique_labels(y)
  File "/opt/homebrew/lib/python3.10/site-packages/sklearn/utils/multiclass.py", line 107, in unique_labels
    raise ValueError("Unknown label type: %s" % repr(ys))
ValueError: Unknown label type: (array([ 0.16666667,  0.35      , -0.25      , ...,  0.5       ,
        0.        ,  0.        ]),)
