In [None]:
import pandas as pd
# The dataset is a '~'-separated text file: load it into a DataFrame and
# preview the first rows.
df = pd.read_csv(
    'new_dbSquiggly.csv',
    sep='~',
)
df.head()

In [None]:
# Keep only the rows whose 'v1_comment' value is present (not NaN/None).
df = df[df['v1_comment'].notna()]

In [None]:
# Print a summary of the frame (columns, non-null counts, dtypes) after the
# rows without a comment have been dropped.
df.info()

In [None]:
# Restrict the frame to the class label column and the comment text column.
col = ['Satd_type', 'v1_comment']
df = df[col]

In [None]:
# Display the column labels of the current frame.
df.columns

In [None]:
# Set the column labels explicitly. These are the same two names, in the same
# order, as the `col` list used to select the columns.
df.columns = ['Satd_type', 'v1_comment']

In [None]:
from io import StringIO

# Encode every distinct Satd_type as an integer id (ids are handed out in
# order of first appearance). `assign` returns a new frame instead of writing
# a column into `df` in place; `df` is the result of row/column slicing, and
# an in-place column write on such a frame can raise pandas'
# SettingWithCopyWarning.
df = df.assign(category_id=df['Satd_type'].factorize()[0])

# One row per (Satd_type, category_id) pair, ordered by id.
category_id_df = df[['Satd_type', 'category_id']].drop_duplicates().sort_values('category_id')

# Lookup tables in both directions: label -> id and id -> label.
category_to_id = dict(category_id_df.values)
id_to_category = dict(category_id_df[['category_id', 'Satd_type']].values)

In [None]:
# Preview the first rows, now including the integer 'category_id' column.
df.head()

In [None]:
import matplotlib.pyplot as plt
# Bar chart of the number of comments in each Satd_type class.
fig = plt.figure(figsize=(8, 6))
comments_per_type = df.groupby('Satd_type')['v1_comment'].count()
comments_per_type.plot.bar(ylim=0)
plt.show()

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over unigrams and bigrams: sublinear (log-scaled) term frequency,
# L2-normalised rows, English stop words removed, and terms that occur in
# fewer than 5 comments dropped.
tfidf = TfidfVectorizer(
    ngram_range=(1, 2),
    stop_words='english',
    min_df=5,
    sublinear_tf=True,
    norm='l2',
    encoding='latin-1',
)

# Dense document-term matrix and the integer class id of every comment.
features = tfidf.fit_transform(df['v1_comment']).toarray()
labels = df['category_id']
features.shape

In [None]:
from sklearn.feature_selection import chi2
import numpy as np

N = 2  # how many top-ranked terms to print per class and per n-gram size

# `get_feature_names()` no longer exists in current scikit-learn releases;
# `get_feature_names_out()` is its replacement. Use the new accessor when it
# is available and fall back to the old one on older installations. The
# vocabulary is the same for every class, so it is fetched once, outside the
# loop.
if hasattr(tfidf, 'get_feature_names_out'):
  vocabulary = np.asarray(tfidf.get_feature_names_out())
else:
  vocabulary = np.asarray(tfidf.get_feature_names())

for Satd_type, category_id in sorted(category_to_id.items()):
  # One-vs-rest chi-squared statistic of every term against this class.
  features_chi2 = chi2(features, labels == category_id)
  # Ascending sort, so the highest-scoring terms end up last.
  indices = np.argsort(features_chi2[0])
  feature_names = vocabulary[indices]
  unigrams = [v for v in feature_names if len(v.split(' ')) == 1]
  bigrams = [v for v in feature_names if len(v.split(' ')) == 2]
  print("# '{}':".format(Satd_type))
  print("  . Most correlated unigrams:\n       . {}".format('\n       . '.join(unigrams[-N:])))
  print("  . Most correlated bigrams:\n       . {}".format('\n       . '.join(bigrams[-N:])))

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

# Hold out a test split of the raw comments (with their string labels), then
# build the bag-of-words -> TF-IDF representation from the training part.
X_train, X_test, y_train, y_test = train_test_split(
    df['v1_comment'],
    df['Satd_type'],
    random_state=0,
)

count_vect = CountVectorizer()
tfidf_transformer = TfidfTransformer()
X_train_counts = count_vect.fit_transform(X_train)
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

# Multinomial naive Bayes baseline fitted on the TF-IDF weighted counts.
clf = MultinomialNB()
clf = clf.fit(X_train_tfidf, y_train)

In [None]:
#print(clf.predict(count_vect.transform(["This company refuses to provide me verification and validation of debt per my right under the FDCPA. I do not believe this debt is mine."])))

In [None]:
#print(clf.predict(count_vect.transform(["I am disputing the inaccurate information the Chex-Systems has on my credit report. I initially submitted a police report on XXXX/XXXX/16 and Chex Systems only deleted the items that I mentioned in the letter and not all the items that were actually listed on the police report. In other words they wanted me to say word for word to them what items were fraudulent. The total disregard of the police report and what accounts that it states that are fraudulent. If they just had paid a little closer attention to the police report I would not been in this position now and they would n't have to research once again. I would like the reported information to be removed : XXXX XXXX XXXX"])))

In [None]:
#df[df['Consumer_complaint_narrative'] == "This company refuses to provide me verification and validation of debt per my right under the FDCPA. I do not believe this debt is mine."]

In [None]:
#df[df['Consumer_complaint_narrative'] == "I am disputing the inaccurate information the Chex-Systems has on my credit report. I initially submitted a police report on XXXX/XXXX/16 and Chex Systems only deleted the items that I mentioned in the letter and not all the items that were actually listed on the police report. In other words they wanted me to say word for word to them what items were fraudulent. The total disregard of the police report and what accounts that it states that are fraudulent. If they just had paid a little closer attention to the police report I would not been in this position now and they would n't have to research once again. I would like the reported information to be removed : XXXX XXXX XXXX"]

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

from sklearn.model_selection import cross_val_score


# Candidate classifiers to compare on the TF-IDF features.
models = [
    RandomForestClassifier(n_estimators=200, max_depth=3, random_state=0),
    LinearSVC(),
    MultinomialNB(),
    LogisticRegression(random_state=0),
]
CV = 5  # number of cross-validation folds

# Collect one (model name, fold index, accuracy) record per fold and build
# the results frame a single time from those records; no placeholder frame is
# needed beforehand.
# NOTE(review): `features` was vectorised on the whole corpus before the
# folds are cut, so the TF-IDF vocabulary/weights have seen the validation
# rows -- confirm this is acceptable for the comparison.
entries = []
for model in models:
  model_name = model.__class__.__name__
  accuracies = cross_val_score(model, features, labels, scoring='accuracy', cv=CV)
  for fold_idx, accuracy in enumerate(accuracies):
    entries.append((model_name, fold_idx, accuracy))
cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])

In [None]:
import seaborn as sns

# Box plot of the per-fold accuracy of each model, with the individual fold
# scores drawn on top as jittered points.
accuracy_by_model = dict(x='model_name', y='accuracy', data=cv_df)
sns.boxplot(**accuracy_by_model)
sns.stripplot(size=8, jitter=True, edgecolor="gray", linewidth=2,
              **accuracy_by_model)
plt.show()

In [None]:
# Mean cross-validated accuracy of each model.
cv_df.groupby('model_name')['accuracy'].mean()

In [None]:
from sklearn.model_selection import train_test_split

# Random forest evaluated on a single hold-out split (33% test). The frame's
# row index is split alongside the data so that test rows can be looked up in
# `df` afterwards.
(X_train, X_test,
 y_train, y_test,
 indices_train, indices_test) = train_test_split(
    features, labels, df.index,
    test_size=0.33,
    random_state=0,
)

model = RandomForestClassifier(n_estimators=200, max_depth=3, random_state=0)
y_pred = model.fit(X_train, y_train).predict(X_test)

In [None]:
from sklearn.metrics import confusion_matrix

# Pin the matrix to every known class id, in id order. Without `labels`,
# confusion_matrix only covers the classes that occur in y_test / y_pred, so
# a class missing from this split would shrink the matrix and shift the tick
# labels (and any conf_mat[actual, predicted] lookup by category id) onto the
# wrong rows and columns.
class_ids = category_id_df.category_id.values
class_names = category_id_df.Satd_type.values

conf_mat = confusion_matrix(y_test, y_pred, labels=class_ids)

# Rows are the true classes, columns the predicted ones.
fig, ax = plt.subplots(figsize=(8,6))
sns.heatmap(conf_mat, annot=True, fmt='d', ax=ax,
            xticklabels=class_names, yticklabels=class_names)
ax.set_ylabel('Actual')
ax.set_xlabel('Predicted')
plt.show()

In [None]:
from IPython.display import display

# For every (actual, predicted) pair of different classes that was confused
# at least 6 times, print the count and show the affected test rows.
for predicted in category_id_df.category_id:
  for actual in category_id_df.category_id:
    if predicted == actual:
      continue
    n_confused = conf_mat[actual, predicted]
    if n_confused < 6:
      continue
    print("'{}' predicted as '{}' : {} examples.".format(id_to_category[actual], id_to_category[predicted], n_confused))
    # Test rows whose true class is `actual` but were predicted as `predicted`.
    confused_rows = indices_test[(y_test == actual) & (y_pred == predicted)]
    display(df.loc[confused_rows][['Satd_type', 'v1_comment']])
    print('')

In [None]:
# Refit the same model on the complete feature matrix and all labels (no
# hold-out); later predictions use this refitted model.
model.fit(features, labels)

In [None]:
# from sklearn.feature_selection import chi2

# N = 2
# for Satd_type, category_id in sorted(category_to_id.items()):
#   indices = np.argsort(model.coef_[category_id])
#   feature_names = np.array(tfidf.get_feature_names())[indices]
#   unigrams = [v for v in reversed(feature_names) if len(v.split(' ')) == 1][:N]
#   bigrams = [v for v in reversed(feature_names) if len(v.split(' ')) == 2][:N]
#   print("# '{}':".format(Satd_type))
#   print("  . Top unigrams:\n       . {}".format('\n       . '.join(unigrams)))
#   print("  . Top bigrams:\n       . {}".format('\n       . '.join(bigrams)))

In [None]:
# Free-text samples to classify with the fitted model.
texts = [
    "FIXME throw assertion exception if constraintValidatorType == null",
    "TODO order by sum tag",
    "TODO remove the part with ToolchainManager lookup once we depend on 3.0.9 (have it as prerequisite). Define as regular component field then.",
    "I have no money to pay my tuition",
    "Coinbase closed my account for no reason and furthermore refused to give me a reason despite dozens of request",
]

# Vectorise with the already-fitted TF-IDF vocabulary, predict class ids and
# translate them back to their Satd_type names.
text_features = tfidf.transform(texts)
predictions = model.predict(text_features)
predicted_names = [id_to_category[class_id] for class_id in predictions]

for text, predicted_name in zip(texts, predicted_names):
  print('"{}"'.format(text))
  print("  - Predicted as: '{}'".format(predicted_name))
  print("")

"FIXME throw assertion exception if constraintValidatorType == null"
  - Predicted as: 'SATD_REMOVED'

"TODO order by sum tag"
  - Predicted as: 'SATD_REMOVED'

"TODO remove the part with ToolchainManager lookup once we depend on 3.0.9 (have it as prerequisite). Define as regular component field then."
  - Predicted as: 'SATD_REMOVED'

"I have no money to pay my tuition"
  - Predicted as: 'SATD_REMOVED'

"Coinbase closed my account for no reason and furthermore refused to give me a reason despite dozens of request"
  - Predicted as: 'SATD_REMOVED'



In [None]:
from sklearn import metrics
# Name the classes explicitly, in category-id order, so that every report row
# carries the right label even when a class is absent from this split
# (relying on the classes found in y_test / y_pred would then no longer match
# the list of names). zero_division=0 reports 0.0 for classes that were never
# predicted instead of emitting an undefined-metric warning for each of them.
print(metrics.classification_report(
    y_test,
    y_pred,
    labels=category_id_df['category_id'].values,
    target_names=category_id_df['Satd_type'].values,
    zero_division=0,
))

                         precision    recall  f1-score   support

           SATD_CHANGED       0.00      0.00      0.00        12
           SATD_REMOVED       0.75      1.00      0.86       180
CLASS_OR_METHOD_CHANGED       0.00      0.00      0.00        26
      FILE_PATH_CHANGED       0.00      0.00      0.00        22
        SATD_MOVED_FILE       0.00      0.00      0.00         1

               accuracy                           0.75       241
              macro avg       0.15      0.20      0.17       241
           weighted avg       0.56      0.75      0.64       241



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
