In [2]:

import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel


#NLTK-------------------------------
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
#from nltk.stemporter import PorterStemmer

# Import libraries for feature 
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2


from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix,classification_report
from sklearn import metrics
from sklearn.model_selection import cross_val_score

import warnings
warnings.filterwarnings("ignore")


from google.colab import drive
drive.mount('/gdrive')
#Change current working directory to gdrive
%cd /gdrive


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
Mounted at /gdrive
/gdrive


In [3]:
#Read files
textfile = r'/gdrive/My Drive/CIS508-FALL2020/PT5A/Comments.csv'
textData = pd.read_csv(textfile) #creates a dataframe

CustInfofile = r'/gdrive/My Drive/CIS508-FALL2020/PT5A/Customers.csv'
CustInfoData = pd.read_csv(CustInfofile)  #creates a dataframe

print(textData.shape)
print(CustInfoData.shape)


(2070, 2)
(2070, 17)


In [4]:
#Extract target column from Customer Info file
y_train = CustInfoData["TARGET"]
X_train = CustInfoData.drop(columns=["TARGET"]) #extracting training data without the target column
                     
print(X_train.shape)
print(textData.shape)
print(textData.head())
print(y_train)

(2070, 16)
(2070, 2)
     ID                                           Comments
0  1309  Does not like the way the phone works. It is t...
1  3556  Wanted to know the nearest store location. Wan...
2  2230  Wants to know how to do text messaging. Referr...
3  2312  Asked how to disable call waiting. referred hi...
4  3327  Needs help learning how to use the phone. I su...
0       Cancelled
1         Current
2         Current
3         Current
4       Cancelled
          ...    
2065    Cancelled
2066    Cancelled
2067    Cancelled
2068    Cancelled
2069    Cancelled
Name: TARGET, Length: 2070, dtype: object


In [5]:
#Tokenize - Split the sentences to lists of words
textData['CommentsTokenized'] = textData['Comments'].apply(word_tokenize)

export_csv = textData.to_csv(r'/gdrive/My Drive/CIS508-FALL2020/PT5A/TextDataTokenized1.csv')




In [6]:
# Use English stemmer.
stemmer = SnowballStemmer("english")

#Now do stemming - create a new dataframe to store stemmed version
newTextData=pd.DataFrame()
newTextData=textData.drop(columns=["CommentsTokenized","Comments"])
newTextData['CommentsTokenizedStemmed'] = textData['CommentsTokenized'].apply(lambda x: [stemmer.stem(y) for y in x]) # Stem every word.

export_csv = newTextData.to_csv(r'/gdrive/My Drive/CIS508-FALL2020/PT5A/newTextDataTS.csv')


In [7]:

#Join stemmed strings
newTextData['CommentsTokenizedStemmed'] = newTextData['CommentsTokenizedStemmed'].apply(lambda x: " ".join(x))

export_csv = newTextData.to_csv(r'/gdrive/My Drive/CIS508-FALL2020/PT5A/newTextData-Joined.csv')

In [8]:
#Do Bag-Of-Words model - Term - Document Matrix
#Learn the vocabulary dictionary and return term-document matrix.
#count_vect = CountVectorizer(stop_words=None)
count_vect = CountVectorizer(stop_words='english',lowercase=False)
TD_counts = count_vect.fit_transform(newTextData.CommentsTokenizedStemmed)
print(TD_counts.shape)
print(TD_counts.dtype)
print(count_vect.get_feature_names())
#print(TD_counts)
DF_TD_Counts=pd.DataFrame(TD_counts.toarray())
print(DF_TD_Counts)
export_csv = DF_TD_Counts.to_csv(r'/gdrive/My Drive/CIS508-FALL2020/PT5A/TD_counts-TokenizedStemmed.csv')


(2070, 354)
int64
['3399', '3g', 'abysm', 'access', 'accessori', 'adapt', 'add', 'addit', 'additon', 'address', 'adit', 'adress', 'advertis', 'afraid', 'alway', 'angel', 'angri', 'ani', 'anoth', 'anyth', 'anytim', 'area', 'asap', 'ask', 'bad', 'basic', 'bateri', 'batteri', 'becaus', 'believ', 'better', 'bigger', 'book', 'bought', 'brain', 'bring', 'built', 'busi', 'button', 'buy', 'cancel', 'cancer', 'car', 'care', 'carrier', 'caus', 'cc', 'cell', 'certain', 'chang', 'charg', 'charger', 'check', 'chip', 'citi', 'claim', 'cleariti', 'cold', 'comapr', 'compani', 'compar', 'competit', 'complain', 'complaint', 'concept', 'connect', 'consisit', 'consist', 'constan', 'contact', 'continu', 'contract', 'correct', 'cost', 'coupl', 'cover', 'coverag', 'creat', 'credit', 'cstmer', 'cstmr', 'current', 'cust', 'custom', 'customr', 'date', 'day', 'dead', 'decent', 'defect', 'deo', 'did', 'die', 'differ', 'difficult', 'digiti', 'direct', 'disabl', 'doe', 'don', 'dont', 'drop', 'dure', 'easier', 'effe

In [9]:
#Compute TF-IDF Matrix
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(TD_counts)
print(X_train_tfidf.shape)
DF_TF_IDF=pd.DataFrame(X_train_tfidf.toarray())
print(DF_TF_IDF)
export_csv= DF_TF_IDF.to_csv(r'/gdrive/My Drive/CIS508-FALL2020/PT5A/TFIDF_counts-TokenizedStemmed.csv')


(2070, 354)
      0    1    2    3        4    5    ...  348  349  350  351  352  353
0     0.0  0.0  0.0  0.0  0.00000  0.0  ...  0.0  0.0  0.0  0.0  0.0  0.0
1     0.0  0.0  0.0  0.0  0.27568  0.0  ...  0.0  0.0  0.0  0.0  0.0  0.0
2     0.0  0.0  0.0  0.0  0.00000  0.0  ...  0.0  0.0  0.0  0.0  0.0  0.0
3     0.0  0.0  0.0  0.0  0.00000  0.0  ...  0.0  0.0  0.0  0.0  0.0  0.0
4     0.0  0.0  0.0  0.0  0.00000  0.0  ...  0.0  0.0  0.0  0.0  0.0  0.0
...   ...  ...  ...  ...      ...  ...  ...  ...  ...  ...  ...  ...  ...
2065  0.0  0.0  0.0  0.0  0.00000  0.0  ...  0.0  0.0  0.0  0.0  0.0  0.0
2066  0.0  0.0  0.0  0.0  0.00000  0.0  ...  0.0  0.0  0.0  0.0  0.0  0.0
2067  0.0  0.0  0.0  0.0  0.00000  0.0  ...  0.0  0.0  0.0  0.0  0.0  0.0
2068  0.0  0.0  0.0  0.0  0.00000  0.0  ...  0.0  0.0  0.0  0.0  0.0  0.0
2069  0.0  0.0  0.0  0.0  0.00000  0.0  ...  0.0  0.0  0.0  0.0  0.0  0.0

[2070 rows x 354 columns]


In [10]:
#Feature selection
new_DF_TF_IDF = SelectKBest(score_func=chi2, k=50).fit_transform(DF_TF_IDF,y_train)
print(new_DF_TF_IDF.shape)

DF_TF_IDF_SelectedFeatures= pd.DataFrame(new_DF_TF_IDF)
print(DF_TF_IDF_SelectedFeatures)

export_csv= DF_TF_IDF_SelectedFeatures.to_csv(r'/gdrive/My Drive/CIS508-FALL2020/PT5A/TFIDF_counts-Selected Features.csv')


(2070, 50)
            0    1    2    3         4         5   ...   44   45   46   47   48   49
0     0.000000  0.0  0.0  0.0  0.000000  0.000000  ...  0.0  0.0  0.0  0.0  0.0  0.0
1     0.000000  0.0  0.0  0.0  0.000000  0.000000  ...  0.0  0.0  0.0  0.0  0.0  0.0
2     0.000000  0.0  0.0  0.0  0.000000  0.000000  ...  0.0  0.0  0.0  0.0  0.0  0.0
3     0.000000  0.0  0.0  0.0  0.000000  0.000000  ...  0.0  0.0  0.0  0.0  0.0  0.0
4     0.000000  0.0  0.0  0.0  0.000000  0.000000  ...  0.0  0.0  0.0  0.0  0.0  0.0
...        ...  ...  ...  ...       ...       ...  ...  ...  ...  ...  ...  ...  ...
2065  0.000000  0.0  0.0  0.0  0.000000  0.446161  ...  0.0  0.0  0.0  0.0  0.0  0.0
2066  0.000000  0.0  0.0  0.0  0.000000  0.000000  ...  0.0  0.0  0.0  0.0  0.0  0.0
2067  0.000000  0.0  0.0  0.0  0.000000  0.000000  ...  0.0  0.0  0.0  0.0  0.0  0.0
2068  0.772949  0.0  0.0  0.0  0.545354  0.000000  ...  0.0  0.0  0.0  0.0  0.0  0.0
2069  0.000000  0.0  0.0  0.0  0.000000  0.000000  ...

In [11]:
#Construct a Random Forest Classifier on text data
clf=RandomForestClassifier()
RF_text = clf.fit(DF_TF_IDF_SelectedFeatures,y_train)
print("Accuracy score (training): {0:.6f}".format(clf.score(DF_TF_IDF_SelectedFeatures, y_train)))
rf_predictions = clf.predict(DF_TF_IDF_SelectedFeatures)
print("Confusion Matrix:")
print(confusion_matrix(y_train, rf_predictions))
print("Classification Report")
print(classification_report(y_train, rf_predictions))


Accuracy score (training): 0.633333
Confusion Matrix:
[[  92  712]
 [  47 1219]]
Classification Report
              precision    recall  f1-score   support

   Cancelled       0.66      0.11      0.20       804
     Current       0.63      0.96      0.76      1266

    accuracy                           0.63      2070
   macro avg       0.65      0.54      0.48      2070
weighted avg       0.64      0.63      0.54      2070



In [12]:
#run cross-validation - Text Data
clf_cv_score = cross_val_score(clf, DF_TF_IDF_SelectedFeatures, y_train, cv=20, scoring="balanced_accuracy")
print("=== All Accuracy Scores ===")
print(clf_cv_score)
print('\n')
print("=== Mean Accuracy Score ===")
print("Mean Accuracy Score - ON Text: ",clf_cv_score.mean())
print('\n')

=== All Accuracy Scores ===
[0.53290747 0.52129307 0.52071235 0.52497096 0.509375   0.559375
 0.54375    0.53125    0.509375   0.546875   0.54206349 0.52619048
 0.48869048 0.525      0.50119048 0.48869048 0.525      0.53869048
 0.48988095 0.50119048]


=== Mean Accuracy Score ===
Mean Accuracy Score - ON Text:  0.521323557878436




In [13]:
#Merge files
print(CustInfoData.shape)
X_train = CustInfoData.drop(columns=["TARGET"]) #extracting training data without the target column
print(X_train.shape)
##combined=X_train.merge(DF_TF_IDF_SelectedFeatures, left_on='ID', right_on='ID')
#combined=pd.merge(X_train, DF_TF_IDF_SelectedFeatures, how='left', on=['ID', 'ID'])
#combined = pd.merge(X_train, DF_TF_IDF_SelectedFeatures, how='left',on = 'ID')
#combined=pd.merge(X_train, DF_TF_IDF_SelectedFeatures, join ='inner', on='ID')

combined=pd.concat([X_train, DF_TF_IDF_SelectedFeatures], axis=1)
print(combined.shape)
print(combined)
export_csv= combined.to_csv(r'/gdrive/My Drive/CIS508-FALL2020/PT5A/Combined-Cust+TFIDF+SelectedFeatures.csv')



(2070, 17)
(2070, 16)
(2070, 66)
        ID Sex Status  Children  Est_Income  ...   45   46   47   48   49
0        1   F      S         1    38000.00  ...  0.0  0.0  0.0  0.0  0.0
1        6   M      M         2    29616.00  ...  0.0  0.0  0.0  0.0  0.0
2        8   M      M         0    19732.80  ...  0.0  0.0  0.0  0.0  0.0
3       11   M      S         2       96.33  ...  0.0  0.0  0.0  0.0  0.0
4       14   F      M         2    52004.80  ...  0.0  0.0  0.0  0.0  0.0
...    ...  ..    ...       ...         ...  ...  ...  ...  ...  ...  ...
2065  3821   F      S         0    78851.30  ...  0.0  0.0  0.0  0.0  0.0
2066  3822   F      S         1    17540.70  ...  0.0  0.0  0.0  0.0  0.0
2067  3823   F      M         0    83891.90  ...  0.0  0.0  0.0  0.0  0.0
2068  3824   F      M         2    28220.80  ...  0.0  0.0  0.0  0.0  0.0
2069  3825   F      S         0    28589.10  ...  0.0  0.0  0.0  0.0  0.0

[2070 rows x 66 columns]


In [14]:
#Do one Hot encoding for categorical features
X_cat = ["Sex","Status","Car_Owner","Paymethod","LocalBilltype","LongDistanceBilltype"]
#X_cat = combined.select_dtypes(exclude=['int','float64'])
print(X_cat)
combined_one_hot = pd.get_dummies(combined,columns=X_cat)
print(combined_one_hot.shape)
export_csv= combined_one_hot.to_csv(r'/gdrive/My Drive/CIS508-FALL2020/PT5A/combined_one_hot.csv')



['Sex', 'Status', 'Car_Owner', 'Paymethod', 'LocalBilltype', 'LongDistanceBilltype']
(2070, 74)


In [15]:
#Construct a Random Forest Classifier on combined data
#clf1=RandomForestClassifier()
RF_Comb = clf.fit(combined_one_hot,y_train)
print("Accuracy score (training): {0:.6f}".format(clf.score(combined_one_hot, y_train)))
rf_predictions = clf.predict(combined_one_hot)
print("Confusion Matrix:")
print(confusion_matrix(y_train, rf_predictions))
print("Classification Report")
print(classification_report(y_train, rf_predictions))



Accuracy score (training): 1.000000
Confusion Matrix:
[[ 804    0]
 [   0 1266]]
Classification Report
              precision    recall  f1-score   support

   Cancelled       1.00      1.00      1.00       804
     Current       1.00      1.00      1.00      1266

    accuracy                           1.00      2070
   macro avg       1.00      1.00      1.00      2070
weighted avg       1.00      1.00      1.00      2070



In [16]:
#run cross-validation - COMBINED Data
rf_Comb_cv_score = cross_val_score(RF_Comb, combined_one_hot, y_train, cv=20, scoring="balanced_accuracy")
print("=== All Accuracy Scores ===")
print(rf_Comb_cv_score)
print('\n')
print("=== Mean Accuracy Score ===")
print("Mean Accuracy Score - ON Text: ",rf_Comb_cv_score.mean())
print('\n')

=== All Accuracy Scores ===
[0.81939605 0.85907859 0.84204413 0.91153697 0.865625   0.921875
 0.83125    0.815625   0.8578125  0.8890625  0.87162698 0.89325397
 0.94206349 0.8047619  0.89325397 0.81031746 0.93075397 0.93869048
 0.87738095 0.91031746]


=== Mean Accuracy Score ===
Mean Accuracy Score - ON Text:  0.8742863192024778




In [17]:
#Construct a Random Forest Classifier WITHOUT text data
print(CustInfoData.shape)
X_train1=combined_one_hot.iloc[:,1:10]
X_train2=combined_one_hot.iloc[:,60:]
print(X_train1.shape)
print(X_train1.head())
print(X_train2.shape)
print(X_train2.head())
combined1=pd.concat([X_train1, X_train2], axis=1)
print(combined1.shape)
print(combined1.head())


(2070, 17)
(2070, 9)
   Children  Est_Income   Usage  ...  International   Local  Dropped
0         1    38000.00  229.64  ...            0.0  206.08        0
1         2    29616.00   75.29  ...            0.0   45.50        0
2         0    19732.80   47.25  ...            0.0   22.44        0
3         2       96.33   59.01  ...            0.0   32.88        1
4         2    52004.80   28.14  ...            0.0   23.11        0

[5 rows x 9 columns]
(2070, 14)
   Sex_F  ...  LongDistanceBilltype_Standard
0      1  ...                              0
1      0  ...                              1
2      0  ...                              1
3      0  ...                              1
4      1  ...                              0

[5 rows x 14 columns]
(2070, 23)
   Children  ...  LongDistanceBilltype_Standard
0         1  ...                              0
1         2  ...                              1
2         0  ...                              1
3         2  ...                    

In [18]:
#Construct a Random Forest Classifier WITHOUT text data

rf_NT=clf.fit(combined1,y_train)
print("Accuracy score (training): {0:.6f}".format(rf_NT.score(combined1, y_train)))
rf_predictions = rf_NT.predict(combined1)
print("Confusion Matrix:")
print(confusion_matrix(y_train, rf_predictions))
print("Classification Report")
print(classification_report(y_train, rf_predictions))


Accuracy score (training): 0.957488
Confusion Matrix:
[[ 765   39]
 [  49 1217]]
Classification Report
              precision    recall  f1-score   support

   Cancelled       0.94      0.95      0.95       804
     Current       0.97      0.96      0.97      1266

    accuracy                           0.96      2070
   macro avg       0.95      0.96      0.96      2070
weighted avg       0.96      0.96      0.96      2070



In [19]:
#run cross-validation - WITHOUT Text Data
rf_NT_cv_score = cross_val_score(rf_NT, combined1, y_train, cv=20, scoring="balanced_accuracy")
print("=== All Accuracy Scores ===")
print(rf_NT_cv_score)
print('\n')
print("=== Mean Accuracy Score ===")
print("Mean Accuracy Score - WITHOUT Text: ",rf_NT_cv_score.mean())
print('\n')


=== All Accuracy Scores ===
[0.84436702 0.87979094 0.87069299 0.91153697 0.878125   0.9015625
 0.83125    0.803125   0.8796875  0.89375    0.92619048 0.90119048
 0.93075397 0.82857143 0.91031746 0.79781746 0.91488095 0.93869048
 0.87738095 0.91825397]


=== Mean Accuracy Score ===
Mean Accuracy Score - WITHOUT Text:  0.8818967770034843




In [20]:
#Customer Info One-Hot Encoded
DF_Combined1= pd.DataFrame(combined1)
export_csv= DF_Combined1.to_csv(r'/gdrive/My Drive/CIS508-FALL2020/PT5A/CustInfo_Onehot_encoded.csv')

In [21]:
#Do feature selection using a classification model
#clf = ExtraTreesClassifier(n_estimators=50)
#clf = GradientBoostingClassifier(n_estimators=50)
clf = DecisionTreeClassifier()
clf = clf.fit(DF_Combined1,y_train)
print(clf.feature_importances_)
#model = SelectFromModel(clf, prefit=True)
model = SelectFromModel(clf, prefit=True, max_features=7, threshold=-np.inf)
#model = SelectFromModel(clf, prefit=True)
X_new = model.transform(DF_Combined1)
X_new_SelectedFeatures= pd.DataFrame(X_new)
export_csv= X_new_SelectedFeatures.to_csv(r'/gdrive/My Drive/CIS508-FALL2020/PT5A/X_new_SelectedFeatures.csv')

print(model.get_support())
print(X_new_SelectedFeatures)
#print(X_new_SelectedFeatures.shape)
#print(X_new_SelectedFeatures.head())


[0.13019219 0.14573791 0.05263397 0.13041509 0.0821037  0.09947122
 0.04198099 0.07487738 0.00203612 0.03098145 0.0281816  0.00038269
 0.00805136 0.07379821 0.00996289 0.00353483 0.04542684 0.00541686
 0.01242187 0.00293277 0.00830286 0.00848848 0.00266871]
[ True  True False  True  True  True False  True False False False False
 False  True False False False False False False False False False]
        0         1          2    3      4       5    6
0     1.0  38000.00  24.393333  3.0  23.56  206.08  1.0
1     2.0  29616.00  49.426667  2.0  29.78   45.50  0.0
2     0.0  19732.80  50.673333  3.0  24.81   22.44  0.0
3     2.0     96.33  56.473333  1.0  26.13   32.88  1.0
4     2.0  52004.80  25.140000  1.0   5.03   23.11  0.0
...   ...       ...        ...  ...    ...     ...  ...
2065  0.0  78851.30  48.373333  4.0   0.37   28.66  1.0
2066  1.0  17540.70  62.786667  1.0  22.17   13.45  1.0
2067  0.0  83891.90  61.020000  4.0  28.92   45.47  0.0
2068  2.0  28220.80  38.766667  4.0  26.4

In [46]:
#Sequential Forward Search
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

clf = DecisionTreeClassifier()

sfs1 = SFS(clf, 
           k_features=7, 
           forward=True, 
           floating=False, 
           verbose=2,
           scoring='accuracy',
           cv=0)

sfs1 = sfs1.fit(DF_Combined1,y_train)
sfs1.subsets_


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  23 out of  23 | elapsed:    0.1s finished

[2020-10-22 05:44:13] Features: 1/7 -- score: 0.8985507246376812[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  22 out of  22 | elapsed:    0.1s finished

[2020-10-22 05:44:13] Features: 2/7 -- score: 0.9338164251207729[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  21 out of  21 | elapsed:    0.2s finished

[2020-10-22 05:44:14] Features: 3/7 -- score: 0.9502415458937198[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 

{1: {'avg_score': 0.8985507246376812,
  'cv_scores': array([0.89855072]),
  'feature_idx': (2,),
  'feature_names': ('Usage',)},
 2: {'avg_score': 0.9338164251207729,
  'cv_scores': array([0.93381643]),
  'feature_idx': (2, 4),
  'feature_names': ('Usage', 'RatePlan')},
 3: {'avg_score': 0.9502415458937198,
  'cv_scores': array([0.95024155]),
  'feature_idx': (2, 3, 4),
  'feature_names': ('Usage', 'Age', 'RatePlan')},
 4: {'avg_score': 0.9541062801932367,
  'cv_scores': array([0.95410628]),
  'feature_idx': (2, 3, 4, 5),
  'feature_names': ('Usage', 'Age', 'RatePlan', 'LongDistance')},
 5: {'avg_score': 0.9565217391304348,
  'cv_scores': array([0.95652174]),
  'feature_idx': (0, 2, 3, 4, 5),
  'feature_names': ('Children', 'Usage', 'Age', 'RatePlan', 'LongDistance')},
 6: {'avg_score': 0.9570048309178744,
  'cv_scores': array([0.95700483]),
  'feature_idx': (0, 1, 2, 3, 4, 5),
  'feature_names': ('Children',
   'Est_Income',
   'Usage',
   'Age',
   'RatePlan',
   'LongDistance')},
 7

In [47]:
#Sequential forward search result
print(sfs1.k_feature_names_)
print(sfs1.k_score_)

('Children', 'Est_Income', 'Usage', 'Age', 'RatePlan', 'LongDistance', 'Paymethod_CC')
0.957487922705314
