In [0]:
import pandas as pd
import numpy as np

import nltk
nltk.download('punkt')
# word tokenize
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
#stemmers
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer
#lematize
from nltk.stem.wordnet import WordNetLemmatizer
nltk.download('wordnet')
#term vector and tfidf
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# Import libraries for feature selection - Filter method
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

#traintest split
from sklearn.model_selection import train_test_split

#Models
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix,classification_report
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score,balanced_accuracy_score
from sklearn import metrics

pd.set_option('display.max_colwidth', -2)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [0]:
from google.colab import drive
drive.mount('/gdrive')
#Change current working directory to gdrive
%cd /gdrive

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).
/gdrive


In [0]:
#Read files
textfile = r'/gdrive/My Drive/CIS 508 Python/Assignment-6/Comments.csv'
textData = pd.read_csv(textfile) #creates a dataframe

CustInfofile = r'/gdrive/My Drive/CIS 508 Python/Assignment-6/Customers.csv'
CustInfoData = pd.read_csv(CustInfofile)  #creates a dataframe

print(textData.shape)
print(CustInfoData.shape)
textData

(2070, 2)
(2070, 17)


Unnamed: 0,ID,Comments
0,1309,Does not like the way the phone works. It is to difficult compared to his last phone.
1,3556,Wanted to know the nearest store location. Wants to buy aditional accessories.
2,2230,Wants to know how to do text messaging. Referred him to website.
3,2312,Asked how to disable call waiting. referred him to web site.
4,3327,Needs help learning how to use the phone. I suggested he go back to the store and have the rep teach him.
...,...,...
2065,3034,Needed help figuring out his bill. I explained our minute charges.
2066,271,He lost his phone and called to cancel service. I told him we would suspend until we hear back from him. He will contact us soon.
2067,783,Lost the directions to phone and wants another manual. I referred him to web site.
2068,1295,Wants to change address.


In [0]:
#Extract target column from Customer Info file
y_train = CustInfoData["TARGET"]
X_train = CustInfoData.drop(columns=["TARGET"]) #extracting training data without the target column
                     
print(X_train.shape)
print(textData.shape)
textData.head()
print(y_train)

(2070, 16)
(2070, 2)
0       Cancelled
1       Current  
2       Current  
3       Current  
4       Cancelled
          ...    
2065    Cancelled
2066    Cancelled
2067    Cancelled
2068    Cancelled
2069    Cancelled
Name: TARGET, Length: 2070, dtype: object


# Word Tokenize

In [0]:
#tokenise the sentences into words
textData['CommentsTokenized'] = textData['Comments'].apply(word_tokenize)
print(textData['CommentsTokenized'])

0       [Does, not, like, the, way, the, phone, works, ., It, is, to, difficult, compared, to, his, last, phone, .]                                                       
1       [Wanted, to, know, the, nearest, store, location, ., Wants, to, buy, aditional, accessories, .]                                                                   
2       [Wants, to, know, how, to, do, text, messaging, ., Referred, him, to, website, .]                                                                                 
3       [Asked, how, to, disable, call, waiting, ., referred, him, to, web, site, .]                                                                                      
4       [Needs, help, learning, how, to, use, the, phone, ., I, suggested, he, go, back, to, the, store, and, have, the, rep, teach, him, .]                              
                                                                        ...                                                                      

#Lematize

In [0]:
lem = WordNetLemmatizer()
newTextData=pd.DataFrame()
newTextData=textData.drop(columns=["CommentsTokenized","Comments"])
newTextData['CommentsTokenizedLematize'] = textData['CommentsTokenized'].apply(lambda x: [lem.lemmatize(y) for y in x])
newTextData['CommentsTokenizedLematize']
#Join stemmed strings
newTextData['CommentsTokenizedLematize'] = newTextData['CommentsTokenizedLematize'].apply(lambda x: " ".join(x))
newTextData['CommentsTokenizedLematize']

0       Does not like the way the phone work . It is to difficult compared to his last phone .                                             
1       Wanted to know the nearest store location . Wants to buy aditional accessory .                                                     
2       Wants to know how to do text messaging . Referred him to website .                                                                 
3       Asked how to disable call waiting . referred him to web site .                                                                     
4       Needs help learning how to use the phone . I suggested he go back to the store and have the rep teach him .                        
                                                           ...                                                                             
2065    Needed help figuring out his bill . I explained our minute charge .                                                                
2066    He lost his 

# Creating Vector - Lematize

In [0]:
#Do Bag-Of-Words model - Term - Document Matrix - Lematize
#Learn the vocabulary dictionary and return term-document matrix.
#count_vect = CountVectorizer(stop_words=None)
count_vect = CountVectorizer(stop_words='english',lowercase=False)
count_vect
TD_counts = count_vect.fit_transform(newTextData.CommentsTokenizedLematize)
print("Vocabulary ",(count_vect.vocabulary_))
print("Number of words in vocabulary",len(count_vect.vocabulary_))
print("Shape of text vector",TD_counts.shape)
DF_TD_Counts=pd.DataFrame(TD_counts.toarray(), columns= count_vect.get_feature_names())
print(DF_TD_Counts)

Vocabulary  {'Does': 17, 'like': 247, 'way': 420, 'phone': 295, 'work': 430, 'It': 34, 'difficult': 165, 'compared': 131, 'Wanted': 61, 'know': 242, 'nearest': 271, 'store': 360, 'location': 251, 'Wants': 62, 'buy': 106, 'aditional': 81, 'accessory': 74, 'text': 381, 'messaging': 263, 'Referred': 49, 'website': 423, 'Asked': 6, 'disable': 168, 'waiting': 416, 'referred': 320, 'web': 422, 'site': 351, 'Needs': 44, 'help': 222, 'learning': 246, 'use': 409, 'suggested': 367, 'rep': 322, 'teach': 377, 'Called': 9, 'new': 274, 'plan': 297, 'Might': 39, 'switch': 375, 'soon': 354, 'minute': 265, 'additional': 78, 'access': 73, 'ories': 285, 'Said': 50, 'battery': 94, 'ha': 212, 'worked': 431, 'ASAP': 2, 'He': 26, 'claimed': 126, 'charger': 121, 'really': 313, 'As': 5, 'result': 326, 'wa': 414, 'dying': 176, 'wait': 415, 'current': 154, 'contract': 144, 'batery': 93, 'change': 117, 'ring': 329, 'tone': 391, 'Lost': 38, 'direction': 167, 'want': 417, 'manual': 260, 'number': 277, 'getting': 20

# TF-IDF Matrix 

In [0]:
#Compute TF-IDF Matrix
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(DF_TD_Counts)
print(X_train_tfidf.shape)
DF_TF_IDF=pd.DataFrame(X_train_tfidf.toarray(),columns= DF_TD_Counts.columns)
print(DF_TF_IDF)

(2070, 437)
      3399   3g  ASAP  Also  Angeles  ...  working  worse  worst  wrong  year
0     0.0   0.0  0.0   0.0   0.0      ...  0.0      0.0    0.0    0.0    0.0 
1     0.0   0.0  0.0   0.0   0.0      ...  0.0      0.0    0.0    0.0    0.0 
2     0.0   0.0  0.0   0.0   0.0      ...  0.0      0.0    0.0    0.0    0.0 
3     0.0   0.0  0.0   0.0   0.0      ...  0.0      0.0    0.0    0.0    0.0 
4     0.0   0.0  0.0   0.0   0.0      ...  0.0      0.0    0.0    0.0    0.0 
...   ...   ...  ...   ...   ...      ...  ...      ...    ...    ...    ... 
2065  0.0   0.0  0.0   0.0   0.0      ...  0.0      0.0    0.0    0.0    0.0 
2066  0.0   0.0  0.0   0.0   0.0      ...  0.0      0.0    0.0    0.0    0.0 
2067  0.0   0.0  0.0   0.0   0.0      ...  0.0      0.0    0.0    0.0    0.0 
2068  0.0   0.0  0.0   0.0   0.0      ...  0.0      0.0    0.0    0.0    0.0 
2069  0.0   0.0  0.0   0.0   0.0      ...  0.0      0.0    0.0    0.0    0.0 

[2070 rows x 437 columns]


# Feature Selection - Filter 

In [0]:
#Feature selection
selector = SelectKBest(score_func=chi2, k=30)
selector_fit = selector.fit(DF_TF_IDF,y_train)
#get scores along with features
names = DF_TF_IDF.columns.values[selector.get_support()]
scores = selector.scores_[selector.get_support()]
names_scores = list(zip(names, scores))
ns_df = pd.DataFrame(data = names_scores, columns= ['Feat_names','F_Scores'])
ns_df_sorted = ns_df.sort_values(['F_Scores','Feat_names'], ascending = [False, True])
print(ns_df_sorted)


new_DF_TF_IDF = selector.fit_transform(DF_TF_IDF,y_train)
print("Shape of TF-IDF after feature selection", new_DF_TF_IDF.shape)

DF_TF_IDF_SelectedFeatures= pd.DataFrame(new_DF_TF_IDF, columns = list(DF_TF_IDF.columns[selector.get_support(indices=True)]) )
print(DF_TF_IDF_SelectedFeatures)

      Feat_names  F_Scores
0   Asking        3.050427
7   Transeffered  3.050427
26  screening     2.645482
15  explained     2.528603
5   Needed        2.338845
17  figuring      2.338845
8   Was           2.213835
11  cc            2.213835
20  ot            2.213835
23  received      2.213835
28  turn          2.213835
13  charge        2.130651
10  asked         1.937881
1   Forwarded     1.924960
29  unlimited     1.815572
14  continued     1.683043
18  marketing     1.683043
21  people        1.683043
22  personal      1.683043
24  receiving     1.683043
25  rid           1.683043
27  sold          1.683043
16  family        1.679973
6   She           1.636522
2   Hochie        1.556822
4   Momma         1.556822
19  minute        1.531675
12  change        1.483840
3   Internet      1.440138
9   adress        1.437847
Shape of TF-IDF after feature selection (2070, 30)
      Asking  Forwarded  Hochie  Internet  ...  screening  sold  turn  unlimited
0     0.0     0.0        0.0   

In [0]:
#split data into training and test
#split the entire training data in training and test data
X_Train, X_Test, Y_Train, Y_Test = train_test_split( DF_TF_IDF_SelectedFeatures, y_train, test_size=0.20, random_state=42)
print("Initial shape for entire data:",DF_TF_IDF_SelectedFeatures.shape)
print("Shape of new training data:", X_Train.shape)
print("Shape of new test split data:", X_Test.shape)

Initial shape for entire data: (2070, 30)
Shape of new training data: (1656, 30)
Shape of new test split data: (414, 30)


# Classification model on only comments - Random Forest - Filter

In [0]:
#Construct a Random Forest Classifier on text data
rfc=RandomForestClassifier()
rfc.fit(X_Train,Y_Train)

#Prediction on training data
pred_rf=pd.DataFrame(rfc.predict(X_Train),columns=["Prediction"])
print("----------------------------Random Forest: Training Data------------------------------------------------\n")
print("Accuracy Score on training data using Random Forest:",accuracy_score(Y_Train,pred_rf["Prediction"]))
print("Confusion Matrix on training data using Random Forest\n", confusion_matrix(Y_Train,pred_rf["Prediction"]))

# prediction on test data
pred_rf=pd.DataFrame(rfc.predict(X_Test),columns=["Prediction"])
print("----------------------------Random Forest: Test Data------------------------------------------------\n")
print("Accuracy Score on test data using Random Forest:",accuracy_score(Y_Test,pred_rf["Prediction"]))
print("Balanced Accuracy Score on test data using Random Forest:",balanced_accuracy_score(Y_Test,pred_rf["Prediction"]))
print("Confusion Matrix on test data using Random Forest\n", confusion_matrix(Y_Test,pred_rf["Prediction"]))
print("Classification report on test data using Random Forest\n", classification_report(Y_Test,pred_rf["Prediction"]))


----------------------------Random Forest: Training Data------------------------------------------------

Accuracy Score on training data using Random Forest: 0.6231884057971014
Confusion Matrix on training data using Random Forest
 [[ 41 606]
 [ 18 991]]
----------------------------Random Forest: Test Data------------------------------------------------

Accuracy Score on test data using Random Forest: 0.6256038647342995
Balanced Accuracy Score on test data using Random Forest: 0.5162829314233315
Confusion Matrix on test data using Random Forest
 [[ 10 147]
 [  8 249]]
Classification report on test data using Random Forest
               precision    recall  f1-score   support

   Cancelled       0.56      0.06      0.11       157
     Current       0.63      0.97      0.76       257

    accuracy                           0.63       414
   macro avg       0.59      0.52      0.44       414
weighted avg       0.60      0.63      0.52       414





#Customer data 

In [0]:
CustInfoData = CustInfoData.drop(columns=["TARGET"])
#Do one Hot encoding for categorical features
X_cat = ["Sex","Status","Car_Owner","Paymethod","LocalBilltype","LongDistanceBilltype"]
#X_cat = CustInfoData.select_dtypes(include=['object'])
customer_one_hot = pd.get_dummies(CustInfoData,columns=X_cat)

customer_one_hot = pd.DataFrame(customer_one_hot)
print(customer_one_hot.head())

   ID  ...  LongDistanceBilltype_Standard
0  1   ...  0                            
1  6   ...  1                            
2  8   ...  1                            
3  11  ...  1                            
4  14  ...  0                            

[5 rows x 24 columns]


# Classification model on comments + customer data - Random Forest - Filter

In [0]:
#Merge files comments and customer data
combined=pd.concat([customer_one_hot, DF_TF_IDF_SelectedFeatures], axis=1)
print(combined.shape)
print(combined)

(2070, 54)
        ID  Children  Est_Income   Usage  ...  screening  sold  turn  unlimited
0     1     1         38000.00    229.64  ...  0.0        0.0   0.0   0.0      
1     6     2         29616.00    75.29   ...  0.0        0.0   0.0   0.0      
2     8     0         19732.80    47.25   ...  0.0        0.0   0.0   0.0      
3     11    2         96.33       59.01   ...  0.0        0.0   0.0   0.0      
4     14    2         52004.80    28.14   ...  0.0        0.0   0.0   0.0      
...   ..   ..              ...      ...   ...  ...        ...   ...   ...      
2065  3821  0         78851.30    29.04   ...  0.0        0.0   0.0   0.0      
2066  3822  1         17540.70    36.20   ...  0.0        0.0   0.0   0.0      
2067  3823  0         83891.90    74.40   ...  0.0        0.0   0.0   0.0      
2068  3824  2         28220.80    38.95   ...  0.0        0.0   0.0   0.0      
2069  3825  0         28589.10    100.28  ...  0.0        0.0   0.0   0.0      

[2070 rows x 54 columns]


In [0]:
#split data into training and test
#split the entire training data in training and test data
X_Train, X_Test, Y_Train, Y_Test = train_test_split( combined, y_train, test_size=0.20, random_state=42)
print("Initial shape for entire data:",DF_TF_IDF_SelectedFeatures.shape)
print("Shape of new training data:", X_Train.shape)
print("Shape of new test split data:", X_Test.shape)

Initial shape for entire data: (2070, 30)
Shape of new training data: (1656, 54)
Shape of new test split data: (414, 54)


In [0]:
#Construct a Random Forest Classifier on combined data
rfc=RandomForestClassifier()
rfc.fit(X_Train,Y_Train)

#Prediction on training data
pred_rf=pd.DataFrame(rfc.predict(X_Train),columns=["Prediction"])
print("----------------------------Random Forest: Training Data------------------------------------------------\n")
print("Accuracy Score on training data using Random Forest:",accuracy_score(Y_Train,pred_rf["Prediction"]))
print("Confusion Matrix on training data using Random Forest\n", confusion_matrix(Y_Train,pred_rf["Prediction"]))

# prediction on test data
pred_rf=pd.DataFrame(rfc.predict(X_Test),columns=["Prediction"])
print("----------------------------Random Forest: Test Data------------------------------------------------\n")
print("Accuracy Score on test data using Random Forest:",accuracy_score(Y_Test,pred_rf["Prediction"]))
print("Balanced Accuracy Score on test data using Random Forest:",balanced_accuracy_score(Y_Test,pred_rf["Prediction"]))
print("Confusion Matrix on test data using Random Forest\n", confusion_matrix(Y_Test,pred_rf["Prediction"]))
print("Classification report on test data using Random Forest\n", classification_report(Y_Test,pred_rf["Prediction"]))

----------------------------Random Forest: Training Data------------------------------------------------

Accuracy Score on training data using Random Forest: 0.9885265700483091
Confusion Matrix on training data using Random Forest
 [[641   6]
 [ 13 996]]
----------------------------Random Forest: Test Data------------------------------------------------

Accuracy Score on test data using Random Forest: 0.8719806763285024
Balanced Accuracy Score on test data using Random Forest: 0.8659074574338894
Confusion Matrix on test data using Random Forest
 [[132  25]
 [ 28 229]]
Classification report on test data using Random Forest
               precision    recall  f1-score   support

   Cancelled       0.82      0.84      0.83       157
     Current       0.90      0.89      0.90       257

    accuracy                           0.87       414
   macro avg       0.86      0.87      0.86       414
weighted avg       0.87      0.87      0.87       414





# Feature selection - Wrapper method 

In [0]:
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
#Construct a Random Forest Classifier on text data
rfc=RandomForestClassifier()

sfs1 = SFS(rfc, 
           k_features=20, 
           forward=True, 
           floating=False, 
           verbose=2,
           scoring='accuracy',
           cv=0)

sfs1 = sfs1.fit(DF_TF_IDF,y_train, custom_feature_names= DF_TF_IDF.columns)
print("Top 20 feature", sfs1.k_feature_names_)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 437 out of 437 | elapsed:    9.2s finished

[2019-12-11 02:38:41] Features: 1/20 -- score: 0.6227053140096618[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 436 out of 436 | elapsed:   10.3s finished

[2019-12-11 02:38:52] Features: 2/20 -- score: 0.6270531400966184[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 435 out of 435 | elapsed:   10.4s finished

[2019-12-11 02:39:02] Features: 3/20 -- score: 0.6294685990338165[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  

Top 20 feature ('3399', '3g', 'ASAP', 'Also', 'Angeles', 'As', 'Asking', 'CC', 'Called', 'Can', 'Cant', 'Contacting', 'Deos', 'Even', 'He', 'Hochie', 'Wanted', 'change', 'end', 'work')


[Parallel(n_jobs=1)]: Done 418 out of 418 | elapsed:   12.9s finished

[2019-12-11 02:42:27] Features: 20/20 -- score: 0.6352657004830918

In [0]:
#top 20 features after Forward
print("Top 20 feature", sfs1.k_feature_names_)

DF_TF_IDF_SelectedFeatures = DF_TF_IDF.loc[:, DF_TF_IDF.columns.isin(list(sfs1.k_feature_names_))]
print("Shape of TF-IDF after feature selection", DF_TF_IDF_SelectedFeatures.shape)
print(DF_TF_IDF_SelectedFeatures)

Top 20 feature ('3399', '3g', 'ASAP', 'Also', 'Angeles', 'As', 'Asking', 'CC', 'Called', 'Can', 'Cant', 'Contacting', 'Deos', 'Even', 'He', 'Hochie', 'Wanted', 'change', 'end', 'work')
Shape of TF-IDF after feature selection (2070, 20)
      3399   3g  ASAP  Also  Angeles  ...  Hochie    Wanted    change  end      work
0     0.0   0.0  0.0   0.0   0.0      ...  0.0     0.000000  0.000000  0.0  0.239865
1     0.0   0.0  0.0   0.0   0.0      ...  0.0     0.265243  0.000000  0.0  0.000000
2     0.0   0.0  0.0   0.0   0.0      ...  0.0     0.000000  0.000000  0.0  0.000000
3     0.0   0.0  0.0   0.0   0.0      ...  0.0     0.000000  0.000000  0.0  0.000000
4     0.0   0.0  0.0   0.0   0.0      ...  0.0     0.000000  0.000000  0.0  0.000000
...   ...   ...  ...   ...   ...      ...  ...          ...       ...  ...       ...
2065  0.0   0.0  0.0   0.0   0.0      ...  0.0     0.000000  0.000000  0.0  0.000000
2066  0.0   0.0  0.0   0.0   0.0      ...  0.0     0.000000  0.000000  0.0  0.000000

In [0]:
#split data into training and test
#split the entire training data in training and test data
X_Train, X_Test, Y_Train, Y_Test = train_test_split( DF_TF_IDF_SelectedFeatures, y_train, test_size=0.20, random_state=42)
print("Initial shape for entire data:",DF_TF_IDF_SelectedFeatures.shape)
print("Shape of new training data:", X_Train.shape)
print("Shape of new test split data:", X_Test.shape)

Initial shape for entire data: (2070, 20)
Shape of new training data: (1656, 20)
Shape of new test split data: (414, 20)


# Classification on comments - Wrapper method

In [0]:
#Construct a Random Forest Classifier on text data
rfc=RandomForestClassifier()
rfc.fit(X_Train,Y_Train)

#Prediction on training data
pred_rf=pd.DataFrame(rfc.predict(X_Train),columns=["Prediction"])
print("----------------------------Random Forest: Training Data------------------------------------------------\n")
print("Accuracy Score on training data using Random Forest:",accuracy_score(Y_Train,pred_rf["Prediction"]))
print("Confusion Matrix on training data using Random Forest\n", confusion_matrix(Y_Train,pred_rf["Prediction"]))

# prediction on test data
pred_rf=pd.DataFrame(rfc.predict(X_Test),columns=["Prediction"])
print("----------------------------Random Forest: Test Data------------------------------------------------\n")
print("Accuracy Score on test data using Random Forest:",accuracy_score(Y_Test,pred_rf["Prediction"]))
print("Balanced Accuracy Score on test data using Random Forest:",balanced_accuracy_score(Y_Test,pred_rf["Prediction"]))
print("Confusion Matrix on test data using Random Forest\n", confusion_matrix(Y_Test,pred_rf["Prediction"]))
print("Classification report on test data using Random Forest\n", classification_report(Y_Test,pred_rf["Prediction"]))

----------------------------Random Forest: Training Data------------------------------------------------

Accuracy Score on training data using Random Forest: 0.6328502415458938
Confusion Matrix on training data using Random Forest
 [[ 86 561]
 [ 47 962]]
----------------------------Random Forest: Test Data------------------------------------------------

Accuracy Score on test data using Random Forest: 0.6159420289855072
Balanced Accuracy Score on test data using Random Forest: 0.517175146843788
Confusion Matrix on test data using Random Forest
 [[ 17 140]
 [ 19 238]]
Classification report on test data using Random Forest
               precision    recall  f1-score   support

   Cancelled       0.47      0.11      0.18       157
     Current       0.63      0.93      0.75       257

    accuracy                           0.62       414
   macro avg       0.55      0.52      0.46       414
weighted avg       0.57      0.62      0.53       414





# Classification on customers + comments - Wrapper method

In [0]:
#Merge files comments and customer data
combined=pd.concat([customer_one_hot, DF_TF_IDF_SelectedFeatures], axis=1)
print(combined.shape)
print(combined)

(2070, 44)
        ID  Children  Est_Income   Usage  ...    Wanted    change  end      work
0     1     1         38000.00    229.64  ...  0.000000  0.000000  0.0  0.239865
1     6     2         29616.00    75.29   ...  0.265243  0.000000  0.0  0.000000
2     8     0         19732.80    47.25   ...  0.000000  0.000000  0.0  0.000000
3     11    2         96.33       59.01   ...  0.000000  0.000000  0.0  0.000000
4     14    2         52004.80    28.14   ...  0.000000  0.000000  0.0  0.000000
...   ..   ..              ...      ...   ...       ...       ...  ...       ...
2065  3821  0         78851.30    29.04   ...  0.000000  0.000000  0.0  0.000000
2066  3822  1         17540.70    36.20   ...  0.000000  0.000000  0.0  0.000000
2067  3823  0         83891.90    74.40   ...  0.000000  0.000000  0.0  0.000000
2068  3824  2         28220.80    38.95   ...  0.000000  0.546959  0.0  0.000000
2069  3825  0         28589.10    100.28  ...  0.000000  0.000000  0.0  0.000000

[2070 rows x 44 

In [0]:
#split data into training and test
#split the entire training data in training and test data
X_Train, X_Test, Y_Train, Y_Test = train_test_split( combined, y_train, test_size=0.20, random_state=42)
print("Initial shape for entire data:",DF_TF_IDF_SelectedFeatures.shape)
print("Shape of new training data:", X_Train.shape)
print("Shape of new test split data:", X_Test.shape)

Initial shape for entire data: (2070, 20)
Shape of new training data: (1656, 44)
Shape of new test split data: (414, 44)


In [0]:
#Construct a Random Forest Classifier on combined data
rfc=RandomForestClassifier()
rfc.fit(X_Train,Y_Train)

#Prediction on training data
pred_rf=pd.DataFrame(rfc.predict(X_Train),columns=["Prediction"])
print("----------------------------Random Forest: Training Data------------------------------------------------\n")
print("Accuracy Score on training data using Random Forest:",accuracy_score(Y_Train,pred_rf["Prediction"]))
print("Confusion Matrix on training data using Random Forest\n", confusion_matrix(Y_Train,pred_rf["Prediction"]))

# prediction on test data
pred_rf=pd.DataFrame(rfc.predict(X_Test),columns=["Prediction"])
print("----------------------------Random Forest: Test Data------------------------------------------------\n")
print("Accuracy Score on test data using Random Forest:",accuracy_score(Y_Test,pred_rf["Prediction"]))
print("Balanced Accuracy Score on test data using Random Forest:",balanced_accuracy_score(Y_Test,pred_rf["Prediction"]))
print("Confusion Matrix on test data using Random Forest\n", confusion_matrix(Y_Test,pred_rf["Prediction"]))
print("Classification report on test data using Random Forest\n", classification_report(Y_Test,pred_rf["Prediction"]))


----------------------------Random Forest: Training Data------------------------------------------------

Accuracy Score on training data using Random Forest: 0.9903381642512077
Confusion Matrix on training data using Random Forest
 [[642   5]
 [ 11 998]]
----------------------------Random Forest: Test Data------------------------------------------------

Accuracy Score on test data using Random Forest: 0.8526570048309179
Balanced Accuracy Score on test data using Random Forest: 0.8466256908473568
Confusion Matrix on test data using Random Forest
 [[129  28]
 [ 33 224]]
Classification report on test data using Random Forest
               precision    recall  f1-score   support

   Cancelled       0.80      0.82      0.81       157
     Current       0.89      0.87      0.88       257

    accuracy                           0.85       414
   macro avg       0.84      0.85      0.84       414
weighted avg       0.85      0.85      0.85       414



