In [0]:
import pandas as pd
import numpy as np

import nltk
nltk.download('punkt')
# word tokenize
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
#stemmers
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer
#lematize
from nltk.stem.wordnet import WordNetLemmatizer
nltk.download('wordnet')
#term vector and tfidf
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# Import libraries for feature selection - Filter method
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

#traintest split
from sklearn.model_selection import train_test_split

#Models
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix,classification_report
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score,balanced_accuracy_score
from sklearn import metrics

pd.set_option('display.max_colwidth', -2)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [0]:
from google.colab import drive
drive.mount('/gdrive')
#Change current working directory to gdrive
%cd /gdrive

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /gdrive
/gdrive


In [0]:
#Read files
textfile = r'/gdrive/My Drive/CIS 508 Python/Assignment-6/Comments.csv'
textData = pd.read_csv(textfile) #creates a dataframe

CustInfofile = r'/gdrive/My Drive/CIS 508 Python/Assignment-6/Customers.csv'
CustInfoData = pd.read_csv(CustInfofile)  #creates a dataframe

print(textData.shape)
print(CustInfoData.shape)
textData

(2070, 2)
(2070, 17)


Unnamed: 0,ID,Comments
0,1309,Does not like the way the phone works. It is to difficult compared to his last phone.
1,3556,Wanted to know the nearest store location. Wants to buy aditional accessories.
2,2230,Wants to know how to do text messaging. Referred him to website.
3,2312,Asked how to disable call waiting. referred him to web site.
4,3327,Needs help learning how to use the phone. I suggested he go back to the store and have the rep teach him.
...,...,...
2065,3034,Needed help figuring out his bill. I explained our minute charges.
2066,271,He lost his phone and called to cancel service. I told him we would suspend until we hear back from him. He will contact us soon.
2067,783,Lost the directions to phone and wants another manual. I referred him to web site.
2068,1295,Wants to change address.


In [0]:
#Extract target column from Customer Info file
y_train = CustInfoData["TARGET"]
X_train = CustInfoData.drop(columns=["TARGET"]) #extracting training data without the target column
                     
print(X_train.shape)
print(textData.shape)
textData.head()
print(y_train)

(2070, 16)
(2070, 2)
0       Cancelled
1       Current  
2       Current  
3       Current  
4       Cancelled
          ...    
2065    Cancelled
2066    Cancelled
2067    Cancelled
2068    Cancelled
2069    Cancelled
Name: TARGET, Length: 2070, dtype: object


# Word Tokenize

In [0]:
#tokenise the sentences into words
textData['CommentsTokenized'] = textData['Comments'].apply(word_tokenize)
print(textData['CommentsTokenized'])

0       [Does, not, like, the, way, the, phone, works, ., It, is, to, difficult, compared, to, his, last, phone, .]                                                       
1       [Wanted, to, know, the, nearest, store, location, ., Wants, to, buy, aditional, accessories, .]                                                                   
2       [Wants, to, know, how, to, do, text, messaging, ., Referred, him, to, website, .]                                                                                 
3       [Asked, how, to, disable, call, waiting, ., referred, him, to, web, site, .]                                                                                      
4       [Needs, help, learning, how, to, use, the, phone, ., I, suggested, he, go, back, to, the, store, and, have, the, rep, teach, him, .]                              
                                                                        ...                                                                      

# Porter Stemming

In [0]:
#Porter stemming
# Use English stemmer.
porter = PorterStemmer()
newTextData=pd.DataFrame()
newTextData=textData.drop(columns=["CommentsTokenized","Comments"])
newTextData['CommentsTokenizedStemmedPorter'] = textData['CommentsTokenized'].apply(lambda x: [porter.stem(y) for y in x])
print('-------------Porter Stemming------------------')
newTextData['CommentsTokenizedStemmedPorter']
#Join stemmed strings
newTextData['CommentsTokenizedStemmedPorter'] = newTextData['CommentsTokenizedStemmedPorter'].apply(lambda x: " ".join(x))
newTextData['CommentsTokenizedStemmedPorter']

-------------Porter Stemming------------------


0       doe not like the way the phone work . It is to difficult compar to hi last phone .                                              
1       want to know the nearest store locat . want to buy adit accessori .                                                             
2       want to know how to do text messag . refer him to websit .                                                                      
3       ask how to disabl call wait . refer him to web site .                                                                           
4       need help learn how to use the phone . I suggest he go back to the store and have the rep teach him .                           
                                                        ...                                                                             
2065    need help figur out hi bill . I explain our minut charg .                                                                       
2066    He lost hi phone and call to canc

# Creating Vector - Porter

In [0]:
#Do Bag-Of-Words model - Term - Document Matrix - Snowball
#Learn the vocabulary dictionary and return term-document matrix.
#count_vect = CountVectorizer(stop_words=None)
count_vect = CountVectorizer(stop_words='english',lowercase=False)
count_vect
TD_counts = count_vect.fit_transform(newTextData.CommentsTokenizedStemmedPorter)
print("Vocabulary ",(count_vect.vocabulary_))
print("Number of words in vocabulary",len(count_vect.vocabulary_))
print("Shape of text vector",TD_counts.shape)
DF_TD_Counts=pd.DataFrame(TD_counts.toarray(), columns= count_vect.get_feature_names())
print(DF_TD_Counts)

Vocabulary  {'doe': 106, 'like': 181, 'way': 349, 'phone': 231, 'work': 359, 'It': 8, 'difficult': 102, 'compar': 68, 'hi': 152, 'want': 347, 'know': 175, 'nearest': 209, 'store': 293, 'locat': 186, 'buy': 47, 'adit': 18, 'accessori': 12, 'text': 312, 'messag': 199, 'refer': 254, 'websit': 352, 'ask': 31, 'disabl': 105, 'wait': 346, 'web': 351, 'site': 282, 'need': 210, 'help': 151, 'learn': 179, 'use': 340, 'suggest': 299, 'rep': 256, 'teach': 307, 'new': 212, 'plan': 233, 'switch': 306, 'soon': 287, 'minut': 201, 'addit': 15, 'access': 11, 'ori': 222, 'said': 267, 'batteri': 35, 'ha': 143, 'asap': 30, 'He': 4, 'claim': 63, 'charger': 59, 'realli': 248, 'veri': 343, 'As': 2, 'result': 259, 'wa': 345, 'alway': 22, 'die': 100, 'current': 89, 'contract': 79, 'bateri': 34, 'chang': 57, 'ring': 262, 'tone': 323, 'lost': 189, 'direct': 104, 'anoth': 26, 'manual': 196, 'number': 215, 'becaus': 36, 'mr': 206, 'napeleon': 207, 'leroy': 180, 'expect': 121, 'significantli': 278, 'better': 38, 't

# TF-IDF Matrix 

In [0]:
#Compute TF-IDF Matrix
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(DF_TD_Counts)
print(X_train_tfidf.shape)
DF_TF_IDF=pd.DataFrame(X_train_tfidf.toarray(),columns= DF_TD_Counts.columns)
print(DF_TF_IDF)

(2070, 366)
      3399   3g   As   CC        He   If  ...  wors  worst  wrong  xvyx  year  york
0     0.0   0.0  0.0  0.0  0.000000  0.0  ...  0.0   0.0    0.0    0.0   0.0   0.0 
1     0.0   0.0  0.0  0.0  0.000000  0.0  ...  0.0   0.0    0.0    0.0   0.0   0.0 
2     0.0   0.0  0.0  0.0  0.000000  0.0  ...  0.0   0.0    0.0    0.0   0.0   0.0 
3     0.0   0.0  0.0  0.0  0.000000  0.0  ...  0.0   0.0    0.0    0.0   0.0   0.0 
4     0.0   0.0  0.0  0.0  0.000000  0.0  ...  0.0   0.0    0.0    0.0   0.0   0.0 
...   ...   ...  ...  ...       ...  ...  ...  ...   ...    ...    ...   ...   ... 
2065  0.0   0.0  0.0  0.0  0.000000  0.0  ...  0.0   0.0    0.0    0.0   0.0   0.0 
2066  0.0   0.0  0.0  0.0  0.407251  0.0  ...  0.0   0.0    0.0    0.0   0.0   0.0 
2067  0.0   0.0  0.0  0.0  0.000000  0.0  ...  0.0   0.0    0.0    0.0   0.0   0.0 
2068  0.0   0.0  0.0  0.0  0.000000  0.0  ...  0.0   0.0    0.0    0.0   0.0   0.0 
2069  0.0   0.0  0.0  0.0  0.407251  0.0  ...  0.0   0.0    0.0 

# Feature Selection - Filter 

In [0]:
#Feature selection
selector = SelectKBest(score_func=chi2, k=30)
selector_fit = selector.fit(DF_TF_IDF,y_train)
#get scores along with features
names = DF_TF_IDF.columns.values[selector.get_support()]
scores = selector.scores_[selector.get_support()]
names_scores = list(zip(names, scores))
ns_df = pd.DataFrame(data = names_scores, columns= ['Feat_names','F_Scores'])
ns_df_sorted = ns_df.sort_values(['F_Scores','Feat_names'], ascending = [False, True])
print(ns_df_sorted)


new_DF_TF_IDF = selector.fit_transform(DF_TF_IDF,y_train)
print("Shape of TF-IDF after feature selection", new_DF_TF_IDF.shape)

DF_TF_IDF_SelectedFeatures= pd.DataFrame(new_DF_TF_IDF, columns = list(DF_TF_IDF.columns[selector.get_support(indices=True)]) )
print(DF_TF_IDF_SelectedFeatures)

   Feat_names  F_Scores
19  receiv     3.660474
26  transeff   3.563271
2   alway      3.203456
23  screen     3.174275
3   cc         2.320970
16  ot         2.320970
27  turn       2.320970
6   continu    2.318220
9   figur      2.242673
7   explain    2.102758
5   charg      2.052376
8   famili     1.837066
28  unlimit    1.828956
13  market     1.673674
17  peopl      1.673674
22  rid        1.673674
25  sold       1.673674
10  hochi      1.556822
15  momma      1.556822
14  minut      1.496702
1   adress     1.446067
20  relat      1.437909
4   chang      1.414206
18  plan       1.379106
0   address    1.322337
24  signal     1.321208
29  weak       1.321208
12  manag      1.305570
11  internet   1.296248
21  result     1.252664
Shape of TF-IDF after feature selection (2070, 30)
       address  adress  alway   cc  ...  transeff  turn  unlimit  weak
0     0.000000  0.0     0.0    0.0  ...  0.0       0.0   0.0      0.0 
1     0.000000  0.0     0.0    0.0  ...  0.0       0.0   0.0   

In [0]:
#split data into training and test
#split the entire training data in training and test data
X_Train, X_Test, Y_Train, Y_Test = train_test_split( DF_TF_IDF_SelectedFeatures, y_train, test_size=0.20, random_state=42)
print("Initial shape for entire data:",DF_TF_IDF_SelectedFeatures.shape)
print("Shape of new training data:", X_Train.shape)
print("Shape of new test split data:", X_Test.shape)

Initial shape for entire data: (2070, 30)
Shape of new training data: (1656, 30)
Shape of new test split data: (414, 30)


# Classification model on only comments - Random Forest - Filter

In [0]:
#Construct a Random Forest Classifier on text data
rfc=RandomForestClassifier()
rfc.fit(X_Train,Y_Train)

#Prediction on training data
pred_rf=pd.DataFrame(rfc.predict(X_Train),columns=["Prediction"])
print("----------------------------Random Forest: Training Data------------------------------------------------\n")
print("Accuracy Score on training data using Random Forest:",accuracy_score(Y_Train,pred_rf["Prediction"]))
print("Confusion Matrix on training data using Random Forest\n", confusion_matrix(Y_Train,pred_rf["Prediction"]))

# prediction on test data
pred_rf=pd.DataFrame(rfc.predict(X_Test),columns=["Prediction"])
print("----------------------------Random Forest: Test Data------------------------------------------------\n")
print("Accuracy Score on test data using Random Forest:",accuracy_score(Y_Test,pred_rf["Prediction"]))
print("Balanced Accuracy Score on test data using Random Forest:",balanced_accuracy_score(Y_Test,pred_rf["Prediction"]))
print("Confusion Matrix on test data using Random Forest\n", confusion_matrix(Y_Test,pred_rf["Prediction"]))
print("Classification report on test data using Random Forest\n", classification_report(Y_Test,pred_rf["Prediction"]))


----------------------------Random Forest: Training Data------------------------------------------------

Accuracy Score on training data using Random Forest: 0.6268115942028986
Confusion Matrix on training data using Random Forest
 [[ 63 584]
 [ 34 975]]
----------------------------Random Forest: Test Data------------------------------------------------

Accuracy Score on test data using Random Forest: 0.6207729468599034
Balanced Accuracy Score on test data using Random Forest: 0.5148702570076087
Confusion Matrix on test data using Random Forest
 [[ 12 145]
 [ 12 245]]
Classification report on test data using Random Forest
               precision    recall  f1-score   support

   Cancelled       0.50      0.08      0.13       157
     Current       0.63      0.95      0.76       257

    accuracy                           0.62       414
   macro avg       0.56      0.51      0.44       414
weighted avg       0.58      0.62      0.52       414





#customer data 

In [0]:
CustInfoData = CustInfoData.drop(columns=["TARGET"])
#Do one Hot encoding for categorical features
X_cat = ["Sex","Status","Car_Owner","Paymethod","LocalBilltype","LongDistanceBilltype"]
#X_cat = CustInfoData.select_dtypes(include=['object'])
customer_one_hot = pd.get_dummies(CustInfoData,columns=X_cat)

customer_one_hot = pd.DataFrame(customer_one_hot)
print(customer_one_hot.head())

   ID  ...  LongDistanceBilltype_Standard
0  1   ...  0                            
1  6   ...  1                            
2  8   ...  1                            
3  11  ...  1                            
4  14  ...  0                            

[5 rows x 24 columns]


# Classification model on comments + customer data - Random Forest - Filter

In [0]:
#Merge files comments and customer data
combined=pd.concat([customer_one_hot, DF_TF_IDF_SelectedFeatures], axis=1)
print(combined.shape)
print(combined)

(2070, 54)
        ID  Children  Est_Income   Usage  ...  transeff  turn  unlimit  weak
0     1     1         38000.00    229.64  ...  0.0       0.0   0.0      0.0 
1     6     2         29616.00    75.29   ...  0.0       0.0   0.0      0.0 
2     8     0         19732.80    47.25   ...  0.0       0.0   0.0      0.0 
3     11    2         96.33       59.01   ...  0.0       0.0   0.0      0.0 
4     14    2         52004.80    28.14   ...  0.0       0.0   0.0      0.0 
...   ..   ..              ...      ...   ...  ...       ...   ...      ... 
2065  3821  0         78851.30    29.04   ...  0.0       0.0   0.0      0.0 
2066  3822  1         17540.70    36.20   ...  0.0       0.0   0.0      0.0 
2067  3823  0         83891.90    74.40   ...  0.0       0.0   0.0      0.0 
2068  3824  2         28220.80    38.95   ...  0.0       0.0   0.0      0.0 
2069  3825  0         28589.10    100.28  ...  0.0       0.0   0.0      0.0 

[2070 rows x 54 columns]


In [0]:
#split data into training and test
#split the entire training data in training and test data
X_Train, X_Test, Y_Train, Y_Test = train_test_split( combined, y_train, test_size=0.20, random_state=42)
print("Initial shape for entire data:",DF_TF_IDF_SelectedFeatures.shape)
print("Shape of new training data:", X_Train.shape)
print("Shape of new test split data:", X_Test.shape)

Initial shape for entire data: (2070, 30)
Shape of new training data: (1656, 54)
Shape of new test split data: (414, 54)


In [0]:
#Construct a Random Forest Classifier on combined data
rfc=RandomForestClassifier()
rfc.fit(X_Train,Y_Train)

#Prediction on training data
pred_rf=pd.DataFrame(rfc.predict(X_Train),columns=["Prediction"])
print("----------------------------Random Forest: Training Data------------------------------------------------\n")
print("Accuracy Score on training data using Random Forest:",accuracy_score(Y_Train,pred_rf["Prediction"]))
print("Confusion Matrix on training data using Random Forest\n", confusion_matrix(Y_Train,pred_rf["Prediction"]))

# prediction on test data
pred_rf=pd.DataFrame(rfc.predict(X_Test),columns=["Prediction"])
print("----------------------------Random Forest: Test Data------------------------------------------------\n")
print("Accuracy Score on test data using Random Forest:",accuracy_score(Y_Test,pred_rf["Prediction"]))
print("Balanced Accuracy Score on test data using Random Forest:",balanced_accuracy_score(Y_Test,pred_rf["Prediction"]))
print("Confusion Matrix on test data using Random Forest\n", confusion_matrix(Y_Test,pred_rf["Prediction"]))
print("Classification report on test data using Random Forest\n", classification_report(Y_Test,pred_rf["Prediction"]))

----------------------------Random Forest: Training Data------------------------------------------------

Accuracy Score on training data using Random Forest: 0.9903381642512077
Confusion Matrix on training data using Random Forest
 [[645   2]
 [ 14 995]]
----------------------------Random Forest: Test Data------------------------------------------------

Accuracy Score on test data using Random Forest: 0.8671497584541062
Balanced Accuracy Score on test data using Random Forest: 0.8607772187662643
Confusion Matrix on test data using Random Forest
 [[131  26]
 [ 29 228]]
Classification report on test data using Random Forest
               precision    recall  f1-score   support

   Cancelled       0.82      0.83      0.83       157
     Current       0.90      0.89      0.89       257

    accuracy                           0.87       414
   macro avg       0.86      0.86      0.86       414
weighted avg       0.87      0.87      0.87       414





# Feature selection - Wrapper method 

In [0]:
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
#Construct a Random Forest Classifier on text data
rfc=RandomForestClassifier()

sfs1 = SFS(rfc, 
           k_features=20, 
           forward=True, 
           floating=False, 
           verbose=2,
           scoring='accuracy',
           cv=0)

sfs1 = sfs1.fit(DF_TF_IDF,y_train, custom_feature_names= DF_TF_IDF.columns)
print("Top 20 feature", sfs1.k_feature_names_)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 366 out of 366 | elapsed:    7.6s finished

[2019-12-11 02:10:13] Features: 1/20 -- score: 0.6246376811594203[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 365 out of 365 | elapsed:    8.5s finished

[2019-12-11 02:10:21] Features: 2/20 -- score: 0.6304347826086957[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 364 out of 364 | elapsed:    9.2s finished

[2019-12-11 02:10:30] Features: 3/20 -- score: 0.6318840579710145[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  

Top 20 feature ('3399', '3g', 'As', 'He', 'In', 'Is', 'We', 'abysm', 'accessori', 'add', 'addit', 'additon', 'adress', 'afraid', 'alway', 'hi', 'hochi', 'support', 'want', 'work')


[Parallel(n_jobs=1)]: Done 347 out of 347 | elapsed:   10.5s finished

[2019-12-11 02:13:21] Features: 20/20 -- score: 0.6352657004830918

In [0]:
#top 20 features after Forward
print("Top 20 feature", sfs1.k_feature_names_)

DF_TF_IDF_SelectedFeatures = DF_TF_IDF.loc[:, DF_TF_IDF.columns.isin(list(sfs1.k_feature_names_))]
print("Shape of TF-IDF after feature selection", DF_TF_IDF_SelectedFeatures.shape)
print(DF_TF_IDF_SelectedFeatures)

Top 20 feature ('3399', '3g', 'As', 'He', 'In', 'Is', 'We', 'abysm', 'accessori', 'add', 'addit', 'additon', 'adress', 'afraid', 'alway', 'hi', 'hochi', 'support', 'want', 'work')
Shape of TF-IDF after feature selection (2070, 20)
      3399   3g   As        He  ...  hochi  support      want      work
0     0.0   0.0  0.0  0.000000  ...  0.0    0.0      0.000000  0.192632
1     0.0   0.0  0.0  0.000000  ...  0.0    0.0      0.312065  0.000000
2     0.0   0.0  0.0  0.000000  ...  0.0    0.0      0.195324  0.000000
3     0.0   0.0  0.0  0.000000  ...  0.0    0.0      0.000000  0.000000
4     0.0   0.0  0.0  0.000000  ...  0.0    0.0      0.000000  0.000000
...   ...   ...  ...       ...  ...  ...    ...           ...       ...
2065  0.0   0.0  0.0  0.000000  ...  0.0    0.0      0.000000  0.000000
2066  0.0   0.0  0.0  0.407251  ...  0.0    0.0      0.000000  0.000000
2067  0.0   0.0  0.0  0.000000  ...  0.0    0.0      0.144882  0.000000
2068  0.0   0.0  0.0  0.000000  ...  0.0    0.0  

In [0]:
#split data into training and test
#split the entire training data in training and test data
X_Train, X_Test, Y_Train, Y_Test = train_test_split( DF_TF_IDF_SelectedFeatures, y_train, test_size=0.20, random_state=42)
print("Initial shape for entire data:",DF_TF_IDF_SelectedFeatures.shape)
print("Shape of new training data:", X_Train.shape)
print("Shape of new test split data:", X_Test.shape)

Initial shape for entire data: (2070, 20)
Shape of new training data: (1656, 20)
Shape of new test split data: (414, 20)


# Classification on comments - Wrapper method

In [0]:
#Construct a Random Forest Classifier on text data
rfc=RandomForestClassifier()
rfc.fit(X_Train,Y_Train)

#Prediction on training data
pred_rf=pd.DataFrame(rfc.predict(X_Train),columns=["Prediction"])
print("----------------------------Random Forest: Training Data------------------------------------------------\n")
print("Accuracy Score on training data using Random Forest:",accuracy_score(Y_Train,pred_rf["Prediction"]))
print("Confusion Matrix on training data using Random Forest\n", confusion_matrix(Y_Train,pred_rf["Prediction"]))

# prediction on test data
pred_rf=pd.DataFrame(rfc.predict(X_Test),columns=["Prediction"])
print("----------------------------Random Forest: Test Data------------------------------------------------\n")
print("Accuracy Score on test data using Random Forest:",accuracy_score(Y_Test,pred_rf["Prediction"]))
print("Balanced Accuracy Score on test data using Random Forest:",balanced_accuracy_score(Y_Test,pred_rf["Prediction"]))
print("Confusion Matrix on test data using Random Forest\n", confusion_matrix(Y_Test,pred_rf["Prediction"]))
print("Classification report on test data using Random Forest\n", classification_report(Y_Test,pred_rf["Prediction"]))

----------------------------Random Forest: Training Data------------------------------------------------

Accuracy Score on training data using Random Forest: 0.6346618357487923
Confusion Matrix on training data using Random Forest
 [[ 82 565]
 [ 40 969]]
----------------------------Random Forest: Test Data------------------------------------------------

Accuracy Score on test data using Random Forest: 0.606280193236715
Balanced Accuracy Score on test data using Random Forest: 0.4994795410047337
Confusion Matrix on test data using Random Forest
 [[  9 148]
 [ 15 242]]
Classification report on test data using Random Forest
               precision    recall  f1-score   support

   Cancelled       0.38      0.06      0.10       157
     Current       0.62      0.94      0.75       257

    accuracy                           0.61       414
   macro avg       0.50      0.50      0.42       414
weighted avg       0.53      0.61      0.50       414





# Classification on customers + comments - Wrapper method

In [0]:
#Merge files comments and customer data
combined=pd.concat([customer_one_hot, DF_TF_IDF_SelectedFeatures], axis=1)
print(combined.shape)
print(combined)

(2070, 44)
        ID  Children  Est_Income   Usage  ...  hochi  support      want      work
0     1     1         38000.00    229.64  ...  0.0    0.0      0.000000  0.192632
1     6     2         29616.00    75.29   ...  0.0    0.0      0.312065  0.000000
2     8     0         19732.80    47.25   ...  0.0    0.0      0.195324  0.000000
3     11    2         96.33       59.01   ...  0.0    0.0      0.000000  0.000000
4     14    2         52004.80    28.14   ...  0.0    0.0      0.000000  0.000000
...   ..   ..              ...      ...   ...  ...    ...           ...       ...
2065  3821  0         78851.30    29.04   ...  0.0    0.0      0.000000  0.000000
2066  3822  1         17540.70    36.20   ...  0.0    0.0      0.000000  0.000000
2067  3823  0         83891.90    74.40   ...  0.0    0.0      0.144882  0.000000
2068  3824  2         28220.80    38.95   ...  0.0    0.0      0.324251  0.000000
2069  3825  0         28589.10    100.28  ...  0.0    0.0      0.000000  0.000000

[207

In [0]:
#split data into training and test
#split the entire training data in training and test data
X_Train, X_Test, Y_Train, Y_Test = train_test_split( combined, y_train, test_size=0.20, random_state=42)
print("Initial shape for entire data:",DF_TF_IDF_SelectedFeatures.shape)
print("Shape of new training data:", X_Train.shape)
print("Shape of new test split data:", X_Test.shape)

Initial shape for entire data: (2070, 20)
Shape of new training data: (1656, 44)
Shape of new test split data: (414, 44)


In [0]:
#Construct a Random Forest Classifier on combined data
rfc=RandomForestClassifier()
rfc.fit(X_Train,Y_Train)

#Prediction on training data
pred_rf=pd.DataFrame(rfc.predict(X_Train),columns=["Prediction"])
print("----------------------------Random Forest: Training Data------------------------------------------------\n")
print("Accuracy Score on training data using Random Forest:",accuracy_score(Y_Train,pred_rf["Prediction"]))
print("Confusion Matrix on training data using Random Forest\n", confusion_matrix(Y_Train,pred_rf["Prediction"]))

# prediction on test data
pred_rf=pd.DataFrame(rfc.predict(X_Test),columns=["Prediction"])
print("----------------------------Random Forest: Test Data------------------------------------------------\n")
print("Accuracy Score on test data using Random Forest:",accuracy_score(Y_Test,pred_rf["Prediction"]))
print("Balanced Accuracy Score on test data using Random Forest:",balanced_accuracy_score(Y_Test,pred_rf["Prediction"]))
print("Confusion Matrix on test data using Random Forest\n", confusion_matrix(Y_Test,pred_rf["Prediction"]))
print("Classification report on test data using Random Forest\n", classification_report(Y_Test,pred_rf["Prediction"]))


----------------------------Random Forest: Training Data------------------------------------------------

Accuracy Score on training data using Random Forest: 0.9903381642512077
Confusion Matrix on training data using Random Forest
 [[643   4]
 [ 12 997]]
----------------------------Random Forest: Test Data------------------------------------------------

Accuracy Score on test data using Random Forest: 0.855072463768116
Balanced Accuracy Score on test data using Random Forest: 0.8448536518872835
Confusion Matrix on test data using Random Forest
 [[126  31]
 [ 29 228]]
Classification report on test data using Random Forest
               precision    recall  f1-score   support

   Cancelled       0.81      0.80      0.81       157
     Current       0.88      0.89      0.88       257

    accuracy                           0.86       414
   macro avg       0.85      0.84      0.85       414
weighted avg       0.85      0.86      0.85       414



