In [5]:
import numpy as np
import pandas as pd

In [6]:
from io import StringIO
ndf = pd.read_csv('Cleaned-P&P data.csv')
blanks = []  # start with an empty list`
ndf.columns = ['classx','desc','inc_num','short_desc']
for i,c,d,inc,sd in ndf.itertuples():  # iterate over the DataFrame
        if c == 'O' or c=='E':         # test 'review' for whitespace
            blanks.append(i)     # add matching index numbers to the list
            
ndf.drop(blanks, inplace=True)
col = ['classx', 'desc']
ndf = ndf[col]
ndf.columns = ['classx', 'desc']
ndf['category_id'] = ndf['classx'].factorize()[0]
category_id_df = ndf[['classx', 'category_id']].drop_duplicates().sort_values('category_id')
category_to_id = dict(category_id_df.values)
id_to_category = dict(category_id_df[['category_id', 'classx']].values)
ndf.head()

Unnamed: 0,classx,desc,category_id
0,P,"good afternoon,\r\n\r\nthis customer is being ...",0
1,R,please can you remove the delivery pass be rem...,1
3,R,please can the delivery pass be removed from t...,1
4,P,"customer: d4957244\r\nwebsite:jd williams, mai...",0
5,P,short description:tamara called in to report t...,0


In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
#, stop_words='english'
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', encoding='latin-1', ngram_range=(1, 2))
features = tfidf.fit_transform(ndf.desc).toarray()
labels = ndf.category_id
features.shape

(244, 426)

In [8]:
from sklearn.feature_selection import chi2
import numpy as np
N = 2
for classx, category_id in sorted(category_to_id.items()):
  features_chi2 = chi2(features, labels == category_id)
  indices = np.argsort(features_chi2[0])
  feature_names = np.array(tfidf.get_feature_names())[indices]
  unigrams = [v for v in feature_names if len(v.split(' ')) == 1]
  bigrams = [v for v in feature_names if len(v.split(' ')) == 2]
  print("# '{}':".format(classx))
  print("  . Most correlated unigrams:\n. {}".format('\n. '.join(unigrams[-N:])))
  print("  . Most correlated bigrams:\n. {}".format('\n. '.join(bigrams[-N:])))

# 'P':
  . Most correlated unigrams:
. from
. staff
  . Most correlated bigrams:
. be removed
. staff accounts
# 'R':
  . Most correlated unigrams:
. from
. staff
  . Most correlated bigrams:
. be removed
. staff accounts


In [17]:
from sklearn.model_selection import train_test_split

X = ndf['desc']  # this time we want to look at the text
y = ndf['classx']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [10]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape
features.shape

(244, 426)

In [11]:
from sklearn.svm import LinearSVC
clf = LinearSVC()
clf.fit(features,labels)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [12]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

# Linear SVC:
text_clf_lsvc = Pipeline([('tfidf', TfidfVectorizer()),
                     ('clf', LinearSVC()),
])

In [13]:
from sklearn.model_selection import train_test_split

X = ndf['desc']
y = ndf['classx']

X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.25, random_state=42)

In [18]:

X_train = np.array(X_train).tolist()
y_train = np.array(y_train).tolist()
X_train = list(map(''.join, X_train)) 
y_train = list(map(''.join, y_train)) 

In [19]:
text_clf_lsvc.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,...ax_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))])

In [20]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
X_train, X_test, y_train, y_test = train_test_split(ndf['desc'], ndf['classx'], random_state = 42)
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
clf = MultinomialNB().fit(X_train_tfidf, y_train)

In [19]:
print(clf.predict(count_vect.transform(X_test)))

['R' 'R' 'P' 'P' 'R' 'R' 'R' 'R' 'R' 'R' 'R' 'R' 'R' 'P' 'P' 'P' 'P' 'P'
 'P' 'R' 'P' 'R' 'R' 'P' 'R' 'P' 'P' 'P' 'P' 'P' 'P' 'R' 'P' 'P' 'P' 'P'
 'P' 'P' 'P' 'P' 'P' 'P' 'R' 'P' 'P' 'P' 'P' 'P' 'R' 'R' 'R' 'R' 'P' 'R'
 'P' 'P' 'P' 'R' 'P' 'R' 'P']


In [20]:
clf.predict_proba(count_vect.transform(X_test[:5]))

array([[1.62405149e-10, 1.00000000e+00],
       [1.18095968e-07, 9.99999882e-01],
       [1.00000000e+00, 3.25467839e-13],
       [7.43739943e-01, 2.56260057e-01],
       [1.47319575e-09, 9.99999999e-01]])

In [37]:
# Form a prediction set
predictions = clf.predict(count_vect.transform(X))
from sklearn import metrics
print(metrics.classification_report(y,predictions))
# Report the confusion matrix

print(metrics.confusion_matrix(y,predictions))

              precision    recall  f1-score   support

           P       0.99      0.97      0.98       149
           R       0.96      0.99      0.97        95

   micro avg       0.98      0.98      0.98       244
   macro avg       0.98      0.98      0.98       244
weighted avg       0.98      0.98      0.98       244

[[145   4]
 [  1  94]]


In [41]:
Stri = ['this account is still getting free delivery remove the delivery pass on this account']
print(clf.predict(count_vect.transform(Stri)))
clf.predict_proba(count_vect.transform(Stri))

['P']


array([[0.950619, 0.049381]])

ValueError: Iterable over raw text documents expected, string object received.