In [0]:
import pandas as pd
import numpy as np

import nltk
nltk.download('punkt')
# word tokenize
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
#stemmers
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer
#lematize
from nltk.stem.wordnet import WordNetLemmatizer
nltk.download('wordnet')
#term vector and tfidf
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# Import libraries for feature selection - Filter method
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

#traintest split
from sklearn.model_selection import train_test_split

#Models
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix,classification_report
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score,balanced_accuracy_score
from sklearn import metrics

pd.set_option('display.max_colwidth', -2)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [0]:
from google.colab import drive
drive.mount('/gdrive')
#Change current working directory to gdrive
%cd /gdrive

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).
/gdrive


In [0]:
#Read files
textfile = r'/gdrive/My Drive/CIS 508 Python/Assignment-6/Comments.csv'
textData = pd.read_csv(textfile) #creates a dataframe

CustInfofile = r'/gdrive/My Drive/CIS 508 Python/Assignment-6/Customers.csv'
CustInfoData = pd.read_csv(CustInfofile)  #creates a dataframe

print(textData.shape)
print(CustInfoData.shape)
textData

(2070, 2)
(2070, 17)


Unnamed: 0,ID,Comments
0,1309,Does not like the way the phone works. It is to difficult compared to his last phone.
1,3556,Wanted to know the nearest store location. Wants to buy aditional accessories.
2,2230,Wants to know how to do text messaging. Referred him to website.
3,2312,Asked how to disable call waiting. referred him to web site.
4,3327,Needs help learning how to use the phone. I suggested he go back to the store and have the rep teach him.
...,...,...
2065,3034,Needed help figuring out his bill. I explained our minute charges.
2066,271,He lost his phone and called to cancel service. I told him we would suspend until we hear back from him. He will contact us soon.
2067,783,Lost the directions to phone and wants another manual. I referred him to web site.
2068,1295,Wants to change address.


In [0]:
#Extract target column from Customer Info file
y_train = CustInfoData["TARGET"]
X_train = CustInfoData.drop(columns=["TARGET"]) #extracting training data without the target column
                     
print(X_train.shape)
print(textData.shape)
textData.head()
print(y_train)

(2070, 16)
(2070, 2)
0       Cancelled
1       Current  
2       Current  
3       Current  
4       Cancelled
          ...    
2065    Cancelled
2066    Cancelled
2067    Cancelled
2068    Cancelled
2069    Cancelled
Name: TARGET, Length: 2070, dtype: object


# Word Tokenize

In [0]:
#tokenise the sentences into words
textData['CommentsTokenized'] = textData['Comments'].apply(word_tokenize)
print(textData['CommentsTokenized'])

0       [Does, not, like, the, way, the, phone, works, ., It, is, to, difficult, compared, to, his, last, phone, .]                                                       
1       [Wanted, to, know, the, nearest, store, location, ., Wants, to, buy, aditional, accessories, .]                                                                   
2       [Wants, to, know, how, to, do, text, messaging, ., Referred, him, to, website, .]                                                                                 
3       [Asked, how, to, disable, call, waiting, ., referred, him, to, web, site, .]                                                                                      
4       [Needs, help, learning, how, to, use, the, phone, ., I, suggested, he, go, back, to, the, store, and, have, the, rep, teach, him, .]                              
                                                                        ...                                                                      

# Snowball Stemming

In [0]:
#Snowball stemming
# Use English stemmer.
stemmer = SnowballStemmer("english")
newTextData=pd.DataFrame()
newTextData=textData.drop(columns=["CommentsTokenized","Comments"])
newTextData['CommentsTokenizedStemmedSnowball'] = textData['CommentsTokenized'].apply(lambda x: [stemmer.stem(y) for y in x])
print('-------------Snowball Stemming------------------')
newTextData['CommentsTokenizedStemmedSnowball']
#Join stemmed strings
newTextData['CommentsTokenizedStemmedSnowball'] = newTextData['CommentsTokenizedStemmedSnowball'].apply(lambda x: " ".join(x))
newTextData['CommentsTokenizedStemmedSnowball']

-------------Snowball Stemming------------------


0       doe not like the way the phone work . it is to difficult compar to his last phone .                                              
1       want to know the nearest store locat . want to buy adit accessori .                                                              
2       want to know how to do text messag . refer him to websit .                                                                       
3       ask how to disabl call wait . refer him to web site .                                                                            
4       need help learn how to use the phone . i suggest he go back to the store and have the rep teach him .                            
                                                        ...                                                                              
2065    need help figur out his bill . i explain our minut charg .                                                                       
2066    he lost his phone and call

# Creating Vector - Snowball

In [0]:
#Do Bag-Of-Words model - Term - Document Matrix - Snowball
#Learn the vocabulary dictionary and return term-document matrix.
#count_vect = CountVectorizer(stop_words=None)
count_vect = CountVectorizer(stop_words='english',lowercase=False)
count_vect
TD_counts = count_vect.fit_transform(newTextData.CommentsTokenizedStemmedSnowball)
print("Vocabulary ",(count_vect.vocabulary_))
print("Number of words in vocabulary",len(count_vect.vocabulary_))
print("Shape of text vector",TD_counts.shape)
DF_TD_Counts=pd.DataFrame(TD_counts.toarray(), columns= count_vect.get_feature_names())
print(DF_TD_Counts)

Vocabulary  {'doe': 98, 'like': 170, 'way': 337, 'phone': 220, 'work': 347, 'difficult': 94, 'compar': 60, 'want': 335, 'know': 164, 'nearest': 198, 'store': 282, 'locat': 174, 'buy': 39, 'adit': 10, 'accessori': 4, 'text': 301, 'messag': 188, 'refer': 243, 'websit': 340, 'ask': 23, 'disabl': 97, 'wait': 334, 'web': 339, 'site': 271, 'need': 199, 'help': 141, 'learn': 168, 'use': 328, 'suggest': 288, 'rep': 245, 'teach': 296, 'new': 201, 'plan': 222, 'switch': 295, 'soon': 276, 'minut': 190, 'addit': 7, 'access': 3, 'ori': 211, 'said': 256, 'batteri': 27, 'asap': 22, 'claim': 55, 'charger': 51, 'realli': 237, 'veri': 331, 'result': 248, 'alway': 14, 'die': 92, 'current': 81, 'contract': 71, 'bateri': 26, 'chang': 49, 'ring': 251, 'tone': 311, 'lost': 178, 'direct': 96, 'anoth': 18, 'manual': 185, 'number': 204, 'becaus': 28, 'mr': 195, 'napeleon': 196, 'leroy': 169, 'expect': 113, 'signific': 267, 'better': 30, 'technic': 297, 'support': 290, 'pleas': 223, 'outbound': 213, 'list': 172,

# TF-IDF Matrix - Snowball

In [0]:
#Compute TF-IDF Matrix
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(DF_TD_Counts)
print(X_train_tfidf.shape)
DF_TF_IDF=pd.DataFrame(X_train_tfidf.toarray(),columns= DF_TD_Counts.columns)
print(DF_TF_IDF)

(2070, 354)
      3399   3g  abysm  access  accessori  ...  worst  wrong  xvyx  year  york
0     0.0   0.0  0.0    0.0     0.00000    ...  0.0    0.0    0.0   0.0   0.0 
1     0.0   0.0  0.0    0.0     0.27568    ...  0.0    0.0    0.0   0.0   0.0 
2     0.0   0.0  0.0    0.0     0.00000    ...  0.0    0.0    0.0   0.0   0.0 
3     0.0   0.0  0.0    0.0     0.00000    ...  0.0    0.0    0.0   0.0   0.0 
4     0.0   0.0  0.0    0.0     0.00000    ...  0.0    0.0    0.0   0.0   0.0 
...   ...   ...  ...    ...         ...    ...  ...    ...    ...   ...   ... 
2065  0.0   0.0  0.0    0.0     0.00000    ...  0.0    0.0    0.0   0.0   0.0 
2066  0.0   0.0  0.0    0.0     0.00000    ...  0.0    0.0    0.0   0.0   0.0 
2067  0.0   0.0  0.0    0.0     0.00000    ...  0.0    0.0    0.0   0.0   0.0 
2068  0.0   0.0  0.0    0.0     0.00000    ...  0.0    0.0    0.0   0.0   0.0 
2069  0.0   0.0  0.0    0.0     0.00000    ...  0.0    0.0    0.0   0.0   0.0 

[2070 rows x 354 columns]


# Feature Selection - Filter - Snowball

In [0]:
#Feature selection
selector = SelectKBest(score_func=chi2, k=30)
selector_fit = selector.fit(DF_TF_IDF,y_train)
#get scores along with features
names = DF_TF_IDF.columns.values[selector.get_support()]
scores = selector.scores_[selector.get_support()]
names_scores = list(zip(names, scores))
ns_df = pd.DataFrame(data = names_scores, columns= ['Feat_names','F_Scores'])
ns_df_sorted = ns_df.sort_values(['F_Scores','Feat_names'], ascending = [False, True])
print(ns_df_sorted)


new_DF_TF_IDF = selector.fit_transform(DF_TF_IDF,y_train)
print("Shape of TF-IDF after feature selection", new_DF_TF_IDF.shape)

DF_TF_IDF_SelectedFeatures= pd.DataFrame(new_DF_TF_IDF, columns = list(DF_TF_IDF.columns[selector.get_support(indices=True)]) )
print(DF_TF_IDF_SelectedFeatures)

   Feat_names  F_Scores
19  receiv     4.025158
26  transeff   3.563271
1   alway      3.464947
23  screen     3.174275
16  ot         2.574465
27  turn       2.574465
5   continu    2.468678
9   figur      2.306164
7   explain    2.162567
3   charg      2.145596
8   famili     1.837066
28  unlimit    1.828956
13  market     1.818156
17  peopl      1.818156
22  rid        1.818156
25  sold       1.818156
10  hochi      1.556822
15  momma      1.556822
14  minut      1.547129
20  relat      1.489672
0   adress     1.446067
6   current    1.435394
21  result     1.435394
2   chang      1.404096
18  plan       1.382037
12  manag      1.381503
24  signal     1.379083
29  weak       1.379083
4   charger    1.297766
11  internet   1.294745
Shape of TF-IDF after feature selection (2070, 30)
      adress  alway     chang     charg  ...  transeff  turn  unlimit  weak
0     0.0     0.0    0.000000  0.000000  ...  0.0       0.0   0.0      0.0 
1     0.0     0.0    0.000000  0.000000  ...  0.0    

In [0]:
#split data into training and test
#split the entire training data in training and test data
X_Train, X_Test, Y_Train, Y_Test = train_test_split( DF_TF_IDF_SelectedFeatures, y_train, test_size=0.20, random_state=42)
print("Initial shape for entire data:",DF_TF_IDF_SelectedFeatures.shape)
print("Shape of new training data:", X_Train.shape)
print("Shape of new test split data:", X_Test.shape)

Initial shape for entire data: (2070, 30)
Shape of new training data: (1656, 30)
Shape of new test split data: (414, 30)


# Classification model on only comments - Random Forest

In [0]:
#Construct a Random Forest Classifier on text data
rfc=RandomForestClassifier()
rfc.fit(X_Train,Y_Train)

#Prediction on training data
pred_rf=pd.DataFrame(rfc.predict(X_Train),columns=["Prediction"])
print("----------------------------Random Forest: Training Data------------------------------------------------\n")
print("Accuracy Score on training data using Random Forest:",accuracy_score(Y_Train,pred_rf["Prediction"]))
print("Confusion Matrix on training data using Random Forest\n", confusion_matrix(Y_Train,pred_rf["Prediction"]))

# prediction on test data
pred_rf=pd.DataFrame(rfc.predict(X_Test),columns=["Prediction"])
print("----------------------------Random Forest: Test Data------------------------------------------------\n")
print("Accuracy Score on test data using Random Forest:",accuracy_score(Y_Test,pred_rf["Prediction"]))
print("Balanced Accuracy Score on test data using Random Forest:",balanced_accuracy_score(Y_Test,pred_rf["Prediction"]))
print("Confusion Matrix on test data using Random Forest\n", confusion_matrix(Y_Test,pred_rf["Prediction"]))
print("Classification report on test data using Random Forest\n", classification_report(Y_Test,pred_rf["Prediction"]))


----------------------------Random Forest: Training Data------------------------------------------------

Accuracy Score on training data using Random Forest: 0.6280193236714976
Confusion Matrix on training data using Random Forest
 [[ 74 573]
 [ 43 966]]
----------------------------Random Forest: Test Data------------------------------------------------

Accuracy Score on test data using Random Forest: 0.6231884057971014
Balanced Accuracy Score on test data using Random Forest: 0.5217725346353069
Confusion Matrix on test data using Random Forest
 [[ 16 141]
 [ 15 242]]
Classification report on test data using Random Forest
               precision    recall  f1-score   support

   Cancelled       0.52      0.10      0.17       157
     Current       0.63      0.94      0.76       257

    accuracy                           0.62       414
   macro avg       0.57      0.52      0.46       414
weighted avg       0.59      0.62      0.53       414





#Classification model on only customer data - Random Forest

In [0]:
CustInfoData = CustInfoData.drop(columns=["TARGET"])
#Do one Hot encoding for categorical features
X_cat = ["Sex","Status","Car_Owner","Paymethod","LocalBilltype","LongDistanceBilltype"]
#X_cat = CustInfoData.select_dtypes(include=['object'])
customer_one_hot = pd.get_dummies(CustInfoData,columns=X_cat)

customer_one_hot = pd.DataFrame(customer_one_hot)
print(customer_one_hot.head())

   ID  ...  LongDistanceBilltype_Standard
0  1   ...  0                            
1  6   ...  1                            
2  8   ...  1                            
3  11  ...  1                            
4  14  ...  0                            

[5 rows x 24 columns]


In [0]:
#split data into training and test
#split the entire training data in training and test data
X_Train, X_Test, Y_Train, Y_Test = train_test_split(customer_one_hot, y_train, test_size=0.20, random_state=42)
print("Initial shape for entire data:",DF_TF_IDF_SelectedFeatures.shape)
print("Shape of new training data:", X_Train.shape)
print("Shape of new test split data:", X_Test.shape)

Initial shape for entire data: (2070, 30)
Shape of new training data: (1656, 24)
Shape of new test split data: (414, 24)


In [0]:
#Construct a Random Forest Classifier on text data
rfc=RandomForestClassifier()
rfc.fit(X_Train,Y_Train)

#Prediction on training data
pred_rf=pd.DataFrame(rfc.predict(X_Train),columns=["Prediction"])
print("----------------------------Random Forest: Training Data------------------------------------------------\n")
print("Accuracy Score on training data using Random Forest:",accuracy_score(Y_Train,pred_rf["Prediction"]))
print("Confusion Matrix on training data using Random Forest\n", confusion_matrix(Y_Train,pred_rf["Prediction"]))

# prediction on test data
pred_rf=pd.DataFrame(rfc.predict(X_Test),columns=["Prediction"])
print("----------------------------Random Forest: Test Data------------------------------------------------\n")
print("Accuracy Score on test data using Random Forest:",accuracy_score(Y_Test,pred_rf["Prediction"]))
print("Balanced Accuracy Score on test data using Random Forest:",balanced_accuracy_score(Y_Test,pred_rf["Prediction"]))
print("Confusion Matrix on test data using Random Forest\n", confusion_matrix(Y_Test,pred_rf["Prediction"]))
print("Classification report on test data using Random Forest\n", classification_report(Y_Test,pred_rf["Prediction"]))

----------------------------Random Forest: Training Data------------------------------------------------

Accuracy Score on training data using Random Forest: 0.9897342995169082
Confusion Matrix on training data using Random Forest
 [[643   4]
 [ 13 996]]
----------------------------Random Forest: Test Data------------------------------------------------

Accuracy Score on test data using Random Forest: 0.8599033816425121
Balanced Accuracy Score on test data using Random Forest: 0.854940642890778
Confusion Matrix on test data using Random Forest
 [[131  26]
 [ 32 225]]
Classification report on test data using Random Forest
               precision    recall  f1-score   support

   Cancelled       0.80      0.83      0.82       157
     Current       0.90      0.88      0.89       257

    accuracy                           0.86       414
   macro avg       0.85      0.85      0.85       414
weighted avg       0.86      0.86      0.86       414





# Classification model on comments + customer data - Random Forest

In [0]:
#Merge files comments and customer data
combined=pd.concat([customer_one_hot, DF_TF_IDF_SelectedFeatures], axis=1)
print(combined.shape)
print(combined)

(2070, 54)
        ID  Children  Est_Income   Usage  ...  transeff  turn  unlimit  weak
0     1     1         38000.00    229.64  ...  0.0       0.0   0.0      0.0 
1     6     2         29616.00    75.29   ...  0.0       0.0   0.0      0.0 
2     8     0         19732.80    47.25   ...  0.0       0.0   0.0      0.0 
3     11    2         96.33       59.01   ...  0.0       0.0   0.0      0.0 
4     14    2         52004.80    28.14   ...  0.0       0.0   0.0      0.0 
...   ..   ..              ...      ...   ...  ...       ...   ...      ... 
2065  3821  0         78851.30    29.04   ...  0.0       0.0   0.0      0.0 
2066  3822  1         17540.70    36.20   ...  0.0       0.0   0.0      0.0 
2067  3823  0         83891.90    74.40   ...  0.0       0.0   0.0      0.0 
2068  3824  2         28220.80    38.95   ...  0.0       0.0   0.0      0.0 
2069  3825  0         28589.10    100.28  ...  0.0       0.0   0.0      0.0 

[2070 rows x 54 columns]


In [0]:
#split data into training and test
#split the entire training data in training and test data
X_Train, X_Test, Y_Train, Y_Test = train_test_split( combined, y_train, test_size=0.20, random_state=42)
print("Initial shape for entire data:",DF_TF_IDF_SelectedFeatures.shape)
print("Shape of new training data:", X_Train.shape)
print("Shape of new test split data:", X_Test.shape)

Initial shape for entire data: (2070, 30)
Shape of new training data: (1656, 54)
Shape of new test split data: (414, 54)


In [0]:
#Construct a Random Forest Classifier on combined data
rfc=RandomForestClassifier()
rfc.fit(X_Train,Y_Train)

#Prediction on training data
pred_rf=pd.DataFrame(rfc.predict(X_Train),columns=["Prediction"])
print("----------------------------Random Forest: Training Data------------------------------------------------\n")
print("Accuracy Score on training data using Random Forest:",accuracy_score(Y_Train,pred_rf["Prediction"]))
print("Confusion Matrix on training data using Random Forest\n", confusion_matrix(Y_Train,pred_rf["Prediction"]))

# prediction on test data
pred_rf=pd.DataFrame(rfc.predict(X_Test),columns=["Prediction"])
print("----------------------------Random Forest: Test Data------------------------------------------------\n")
print("Accuracy Score on test data using Random Forest:",accuracy_score(Y_Test,pred_rf["Prediction"]))
print("Balanced Accuracy Score on test data using Random Forest:",balanced_accuracy_score(Y_Test,pred_rf["Prediction"]))
print("Confusion Matrix on test data using Random Forest\n", confusion_matrix(Y_Test,pred_rf["Prediction"]))
print("Classification report on test data using Random Forest\n", classification_report(Y_Test,pred_rf["Prediction"]))

----------------------------Random Forest: Training Data------------------------------------------------

Accuracy Score on training data using Random Forest: 0.9963768115942029
Confusion Matrix on training data using Random Forest
 [[ 645    2]
 [   4 1005]]
----------------------------Random Forest: Test Data------------------------------------------------

Accuracy Score on test data using Random Forest: 0.8671497584541062
Balanced Accuracy Score on test data using Random Forest: 0.8582988425983296
Confusion Matrix on test data using Random Forest
 [[129  28]
 [ 27 230]]
Classification report on test data using Random Forest
               precision    recall  f1-score   support

   Cancelled       0.83      0.82      0.82       157
     Current       0.89      0.89      0.89       257

    accuracy                           0.87       414
   macro avg       0.86      0.86      0.86       414
weighted avg       0.87      0.87      0.87       414





# Feature selection - Wrapper method - Snowball

In [0]:
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
#Construct a Random Forest Classifier on text data
rfc=RandomForestClassifier()

sfs1 = SFS(rfc, 
           k_features=20, 
           forward=True, 
           floating=False, 
           verbose=2,
           scoring='accuracy',
           cv=0)

sfs1 = sfs1.fit(DF_TF_IDF,y_train, custom_feature_names= DF_TF_IDF.columns)
print("Top 20 feature", sfs1.k_feature_names_)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 354 out of 354 | elapsed:    7.7s finished

[2019-12-11 01:19:47] Features: 1/20 -- score: 0.6236714975845411[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 353 out of 353 | elapsed:    8.9s finished

[2019-12-11 01:19:56] Features: 2/20 -- score: 0.6275362318840579[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 352 out of 352 | elapsed:    9.0s finished

[2019-12-11 01:20:05] Features: 3/20 -- score: 0.6318840579710145[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  

Top 20 feature ('3399', '3g', 'abysm', 'access', 'accessori', 'adapt', 'add', 'addit', 'additon', 'address', 'advertis', 'afraid', 'alway', 'angel', 'ani', 'complain', 'expect', 'hochi', 'phone', 'want')


[Parallel(n_jobs=1)]: Done 335 out of 335 | elapsed:   10.7s finished

[2019-12-11 01:22:56] Features: 20/20 -- score: 0.6352657004830918

In [0]:
#top 20 features after Forward
print("Top 20 feature", sfs1.k_feature_names_)

DF_TF_IDF_SelectedFeatures = DF_TF_IDF.loc[:, DF_TF_IDF.columns.isin(list(sfs1.k_feature_names_))]
print("Shape of TF-IDF after feature selection", DF_TF_IDF_SelectedFeatures.shape)
print(DF_TF_IDF_SelectedFeatures)

Top 20 feature ('3399', '3g', 'abysm', 'access', 'accessori', 'adapt', 'add', 'addit', 'additon', 'address', 'advertis', 'afraid', 'alway', 'angel', 'ani', 'complain', 'expect', 'hochi', 'phone', 'want')
Shape of TF-IDF after feature selection (2070, 20)
      3399   3g  abysm  access  ...  expect  hochi     phone      want
0     0.0   0.0  0.0    0.0     ...  0.0     0.0    0.311374  0.000000
1     0.0   0.0  0.0    0.0     ...  0.0     0.0    0.000000  0.312065
2     0.0   0.0  0.0    0.0     ...  0.0     0.0    0.000000  0.195324
3     0.0   0.0  0.0    0.0     ...  0.0     0.0    0.000000  0.000000
4     0.0   0.0  0.0    0.0     ...  0.0     0.0    0.243227  0.000000
...   ...   ...  ...    ...     ...  ...     ...         ...       ...
2065  0.0   0.0  0.0    0.0     ...  0.0     0.0    0.000000  0.000000
2066  0.0   0.0  0.0    0.0     ...  0.0     0.0    0.180489  0.000000
2067  0.0   0.0  0.0    0.0     ...  0.0     0.0    0.178295  0.144882
2068  0.0   0.0  0.0    0.0     ...

In [0]:
#split data into training and test
#split the entire training data in training and test data
X_Train, X_Test, Y_Train, Y_Test = train_test_split( DF_TF_IDF_SelectedFeatures, y_train, test_size=0.20, random_state=42)
print("Initial shape for entire data:",DF_TF_IDF_SelectedFeatures.shape)
print("Shape of new training data:", X_Train.shape)
print("Shape of new test split data:", X_Test.shape)

Initial shape for entire data: (2070, 20)
Shape of new training data: (1656, 20)
Shape of new test split data: (414, 20)


# Classification on comments - Wrapper method

In [0]:
#Construct a Random Forest Classifier on text data
rfc=RandomForestClassifier()
rfc.fit(X_Train,Y_Train)

#Prediction on training data
pred_rf=pd.DataFrame(rfc.predict(X_Train),columns=["Prediction"])
print("----------------------------Random Forest: Training Data------------------------------------------------\n")
print("Accuracy Score on training data using Random Forest:",accuracy_score(Y_Train,pred_rf["Prediction"]))
print("Confusion Matrix on training data using Random Forest\n", confusion_matrix(Y_Train,pred_rf["Prediction"]))

# prediction on test data
pred_rf=pd.DataFrame(rfc.predict(X_Test),columns=["Prediction"])
print("----------------------------Random Forest: Test Data------------------------------------------------\n")
print("Accuracy Score on test data using Random Forest:",accuracy_score(Y_Test,pred_rf["Prediction"]))
print("Balanced Accuracy Score on test data using Random Forest:",balanced_accuracy_score(Y_Test,pred_rf["Prediction"]))
print("Confusion Matrix on test data using Random Forest\n", confusion_matrix(Y_Test,pred_rf["Prediction"]))
print("Classification report on test data using Random Forest\n", classification_report(Y_Test,pred_rf["Prediction"]))

----------------------------Random Forest: Training Data------------------------------------------------

Accuracy Score on training data using Random Forest: 0.6364734299516909
Confusion Matrix on training data using Random Forest
 [[ 93 554]
 [ 48 961]]
----------------------------Random Forest: Test Data------------------------------------------------

Accuracy Score on test data using Random Forest: 0.6135265700483091
Balanced Accuracy Score on test data using Random Forest: 0.5152296215519592
Confusion Matrix on test data using Random Forest
 [[ 17 140]
 [ 20 237]]
Classification report on test data using Random Forest
               precision    recall  f1-score   support

   Cancelled       0.46      0.11      0.18       157
     Current       0.63      0.92      0.75       257

    accuracy                           0.61       414
   macro avg       0.54      0.52      0.46       414
weighted avg       0.56      0.61      0.53       414





# Classification on customers + comments - Wrapper method

In [0]:
#Merge files comments and customer data
combined=pd.concat([customer_one_hot, DF_TF_IDF_SelectedFeatures], axis=1)
print(combined.shape)
print(combined)

(2070, 44)
        ID  Children  Est_Income   Usage  ...  expect  hochi     phone      want
0     1     1         38000.00    229.64  ...  0.0     0.0    0.311374  0.000000
1     6     2         29616.00    75.29   ...  0.0     0.0    0.000000  0.312065
2     8     0         19732.80    47.25   ...  0.0     0.0    0.000000  0.195324
3     11    2         96.33       59.01   ...  0.0     0.0    0.000000  0.000000
4     14    2         52004.80    28.14   ...  0.0     0.0    0.243227  0.000000
...   ..   ..              ...      ...   ...  ...     ...         ...       ...
2065  3821  0         78851.30    29.04   ...  0.0     0.0    0.000000  0.000000
2066  3822  1         17540.70    36.20   ...  0.0     0.0    0.180489  0.000000
2067  3823  0         83891.90    74.40   ...  0.0     0.0    0.178295  0.144882
2068  3824  2         28220.80    38.95   ...  0.0     0.0    0.000000  0.324251
2069  3825  0         28589.10    100.28  ...  0.0     0.0    0.180489  0.000000

[2070 rows x 44 

In [0]:
#split data into training and test
#split the entire training data in training and test data
X_Train, X_Test, Y_Train, Y_Test = train_test_split( combined, y_train, test_size=0.20, random_state=42)
print("Initial shape for entire data:",DF_TF_IDF_SelectedFeatures.shape)
print("Shape of new training data:", X_Train.shape)
print("Shape of new test split data:", X_Test.shape)

Initial shape for entire data: (2070, 20)
Shape of new training data: (1656, 44)
Shape of new test split data: (414, 44)


In [0]:
#Construct a Random Forest Classifier on combined data
rfc=RandomForestClassifier()
rfc.fit(X_Train,Y_Train)

#Prediction on training data
pred_rf=pd.DataFrame(rfc.predict(X_Train),columns=["Prediction"])
print("----------------------------Random Forest: Training Data------------------------------------------------\n")
print("Accuracy Score on training data using Random Forest:",accuracy_score(Y_Train,pred_rf["Prediction"]))
print("Confusion Matrix on training data using Random Forest\n", confusion_matrix(Y_Train,pred_rf["Prediction"]))

# prediction on test data
pred_rf=pd.DataFrame(rfc.predict(X_Test),columns=["Prediction"])
print("----------------------------Random Forest: Test Data------------------------------------------------\n")
print("Accuracy Score on test data using Random Forest:",accuracy_score(Y_Test,pred_rf["Prediction"]))
print("Balanced Accuracy Score on test data using Random Forest:",balanced_accuracy_score(Y_Test,pred_rf["Prediction"]))
print("Confusion Matrix on test data using Random Forest\n", confusion_matrix(Y_Test,pred_rf["Prediction"]))
print("Classification report on test data using Random Forest\n", classification_report(Y_Test,pred_rf["Prediction"]))


----------------------------Random Forest: Training Data------------------------------------------------

Accuracy Score on training data using Random Forest: 0.9897342995169082
Confusion Matrix on training data using Random Forest
 [[641   6]
 [ 11 998]]
----------------------------Random Forest: Test Data------------------------------------------------

Accuracy Score on test data using Random Forest: 0.8695652173913043
Balanced Accuracy Score on test data using Random Forest: 0.8627227440580931
Confusion Matrix on test data using Random Forest
 [[131  26]
 [ 28 229]]
Classification report on test data using Random Forest
               precision    recall  f1-score   support

   Cancelled       0.82      0.83      0.83       157
     Current       0.90      0.89      0.89       257

    accuracy                           0.87       414
   macro avg       0.86      0.86      0.86       414
weighted avg       0.87      0.87      0.87       414



