In [1]:
from google.colab import drive
drive.mount('/gdrive')
#Change current working directory to gdrive
%cd /gdrive


Mounted at /gdrive
/gdrive


In [2]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel


#NLTK-------------------------------
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# Import libraries for feature 
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

from sklearn.model_selection import train_test_split


from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix,classification_report
from sklearn import metrics
from sklearn.model_selection import cross_val_score

import warnings
warnings.filterwarnings("ignore")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [3]:
#Read files
textfile = r'/gdrive/My Drive/TextMining/Comments.csv'
textData = pd.read_csv(textfile) #creates a dataframe

CustInfofile = r'/gdrive/My Drive/TextMining/Customers.csv'
CustInfoData = pd.read_csv(CustInfofile)  #creates a dataframe

print(textData.shape)
print(CustInfoData.shape)


(2070, 2)
(2070, 17)


In [4]:
print(textData.head)
print(CustInfoData.head)

<bound method NDFrame.head of         ID                                           Comments
0     1309  Does not like the way the phone works. It is t...
1     3556  Wanted to know the nearest store location. Wan...
2     2230  Wants to know how to do text messaging. Referr...
3     2312  Asked how to disable call waiting. referred hi...
4     3327  Needs help learning how to use the phone. I su...
...    ...                                                ...
2065  3034  Needed help figuring out his bill. I explained...
2066   271  He lost his phone and called to cancel service...
2067   783  Lost the directions to phone and wants another...
2068  1295                           Wants to change address.
2069  1807  He lost his phone and called to cancel service...

[2070 rows x 2 columns]>
<bound method NDFrame.head of         ID Sex Status  Children  Est_Income Car_Owner   Usage        Age  \
0        1   F      S         1    38000.00         N  229.64  24.393333   
1        6   M    

In [5]:
#Extract target column from Customer Info file
y_train = CustInfoData["TARGET"]
X_train = CustInfoData.drop(columns=["TARGET"]) #extracting training data without the target column
                     
print(X_train.shape)
print(textData.shape)
print(textData.head())
print(y_train)

(2070, 16)
(2070, 2)
     ID                                           Comments
0  1309  Does not like the way the phone works. It is t...
1  3556  Wanted to know the nearest store location. Wan...
2  2230  Wants to know how to do text messaging. Referr...
3  2312  Asked how to disable call waiting. referred hi...
4  3327  Needs help learning how to use the phone. I su...
0       Cancelled
1         Current
2         Current
3         Current
4       Cancelled
          ...    
2065    Cancelled
2066    Cancelled
2067    Cancelled
2068    Cancelled
2069    Cancelled
Name: TARGET, Length: 2070, dtype: object


In [6]:
#Tokenize - Split the sentences to lists of words
textData['CommentsTokenized'] = textData['Comments'].apply(word_tokenize)

export_csv = textData.to_csv(r'/gdrive/My Drive/TextMining/TextDataTokenized1.csv')




In [7]:
# Use English stemmer.
stemmer = SnowballStemmer("english")

#Now do stemming - create a new dataframe to store stemmed version
newTextData=pd.DataFrame()
newTextData=textData.drop(columns=["CommentsTokenized","Comments"])
newTextData['CommentsTokenizedStemmed'] = textData['CommentsTokenized'].apply(lambda x: [stemmer.stem(y) for y in x]) # Stem every word.
# we get a column where words have been stemmed, saving it to new file where words are tokenized and stemmed
export_csv = newTextData.to_csv(r'/gdrive/My Drive/TextMining/newTextDataTS_SNOWBALL.csv')

##comment on 'disable' being 'disabl' in the stemmed words list, misspelled words "batteri" "bateri" etc
## we end up with lot of garbage words


In [8]:
## use another stemmer now - snowball used earlier 
# use porter stemmer 

ps = PorterStemmer()

newTextData=pd.DataFrame()
newTextData=textData.drop(columns=["CommentsTokenized","Comments"])
newTextData['CommentsTokenizedStemmed'] = textData['CommentsTokenized'].apply(lambda x: [ps.stem(y) for y in x]) # Stem every word.
# we get a column where words have been stemmed, saving it to new file where words are tokenized and stemmed
export_csv = newTextData.to_csv(r'/gdrive/My Drive/TextMining/newTextDataTS_PORTER.csv')

There is only a little difference in the working of these two. Words like 'fairly' and 'sportingly' were stemmed to 'fair' and 'sport' in the snowball stemmer but when you use the porter stemmer they are stemmed to 'fairli' and 'sportingli'.O

Snowbal stemmer is basically a better version of the porter stemmer
so we go ahead with snowball stemmer stemmed strings


In [9]:
#Join stemmed strings
newTextData['CommentsTokenizedStemmed'] = newTextData['CommentsTokenizedStemmed'].apply(lambda x: " ".join(x))

export_csv = newTextData.to_csv(r'/gdrive/My Drive/TextMining/newTextData-Joined.csv')

In [10]:
# term document matrix has been constructed here

#Do Bag-Of-Words model - Term - Document Matrix
#Learn the vocabulary dictionary and return term-document matrix.
#count_vect = CountVectorizer(stop_words=None)

#stop words have been eliminated here

count_vect = CountVectorizer(stop_words='english',lowercase=False) # gets rid of stopwords here as well as does counting #point 2 of todo
TD_counts = count_vect.fit_transform(newTextData.CommentsTokenizedStemmed) 
#after we drop stop words we end up 354 words and the words as shown below, some misspelled etc 
# we can clean things up on our own if we want to
print(TD_counts.shape)
print(TD_counts.dtype)
print(count_vect.get_feature_names())
#print(TD_counts)
## does counting for us and now once we get here we get structured data
DF_TD_Counts=pd.DataFrame(TD_counts.toarray())
print(DF_TD_Counts)
export_csv = DF_TD_Counts.to_csv(r'/gdrive/My Drive/TextMining/TD_counts-TokenizedStemmed.csv')

## TD_counts-TokenizedStemmed is now in structured form and we got count


(2070, 358)
int64
['3399', '3g', 'abysm', 'access', 'accessori', 'adapt', 'add', 'addit', 'additon', 'address', 'adit', 'adress', 'advertis', 'afraid', 'alway', 'angel', 'angri', 'ani', 'anoth', 'anyth', 'anytim', 'area', 'asap', 'ask', 'bad', 'basic', 'bateri', 'batteri', 'becaus', 'believ', 'better', 'bigger', 'book', 'bought', 'brain', 'bring', 'built', 'busi', 'button', 'buy', 'cancel', 'cancer', 'car', 'care', 'carrier', 'caus', 'cc', 'cell', 'certain', 'chang', 'charg', 'charger', 'check', 'chip', 'citi', 'claim', 'cleariti', 'cold', 'comapr', 'compani', 'compar', 'competit', 'complain', 'complaint', 'concept', 'connect', 'consisit', 'consist', 'constanli', 'contact', 'continu', 'contract', 'correct', 'cost', 'coupl', 'cover', 'coverag', 'creat', 'credit', 'cstmer', 'cstmr', 'current', 'cust', 'custom', 'customr', 'date', 'day', 'dead', 'decent', 'defect', 'deo', 'did', 'die', 'differ', 'difficult', 'digiti', 'direct', 'disabl', 'doe', 'don', 'dont', 'drop', 'dure', 'easier', 'ef

In [11]:
#Compute TF-IDF Matrix
# TFIDF matrix has been computed here
tfidf_transformer = TfidfTransformer() #weighting of words is done here
X_train_tfidf = tfidf_transformer.fit_transform(TD_counts) 
print(X_train_tfidf.shape)
DF_TF_IDF=pd.DataFrame(X_train_tfidf.toarray())
print(DF_TF_IDF)
export_csv= DF_TF_IDF.to_csv(r'/gdrive/My Drive/TextMining/TFIDF_counts-TokenizedStemmed.csv')
# still 354 features and we get word weightings, earlier it was purely counts

(2070, 358)
      0    1    2    3        4    5    6    7    8         9    ...  348  \
0     0.0  0.0  0.0  0.0  0.00000  0.0  0.0  0.0  0.0  0.000000  ...  0.0   
1     0.0  0.0  0.0  0.0  0.27568  0.0  0.0  0.0  0.0  0.000000  ...  0.0   
2     0.0  0.0  0.0  0.0  0.00000  0.0  0.0  0.0  0.0  0.000000  ...  0.0   
3     0.0  0.0  0.0  0.0  0.00000  0.0  0.0  0.0  0.0  0.000000  ...  0.0   
4     0.0  0.0  0.0  0.0  0.00000  0.0  0.0  0.0  0.0  0.000000  ...  0.0   
...   ...  ...  ...  ...      ...  ...  ...  ...  ...       ...  ...  ...   
2065  0.0  0.0  0.0  0.0  0.00000  0.0  0.0  0.0  0.0  0.000000  ...  0.0   
2066  0.0  0.0  0.0  0.0  0.00000  0.0  0.0  0.0  0.0  0.000000  ...  0.0   
2067  0.0  0.0  0.0  0.0  0.00000  0.0  0.0  0.0  0.0  0.000000  ...  0.0   
2068  0.0  0.0  0.0  0.0  0.00000  0.0  0.0  0.0  0.0  0.772949  ...  0.0   
2069  0.0  0.0  0.0  0.0  0.00000  0.0  0.0  0.0  0.0  0.000000  ...  0.0   

      349  350       351  352  353  354  355  356  357  
0     

In [12]:
#Now combine the TF-IDF matrix with Customer data. Then do one-hot encoding on the categorical variables.
combined_DF_TF_IDF=pd.concat([CustInfoData,DF_TF_IDF], axis=1)

In [13]:
#one hot encoding

#Do one Hot encoding for categorical features
## One hot encoding 
X_cat = ["Sex","Status","Car_Owner","Paymethod","LocalBilltype","LongDistanceBilltype"]
#X_cat = combined.select_dtypes(exclude=['int','float64'])
print(X_cat)
combined_one_hot = pd.get_dummies(combined_DF_TF_IDF,columns=X_cat)
print(combined_one_hot.shape)
export_csv= combined_one_hot.to_csv(r'/gdrive/My Drive/TextMining/combined_one_hot.csv')

['Sex', 'Status', 'Car_Owner', 'Paymethod', 'LocalBilltype', 'LongDistanceBilltype']
(2070, 383)


In [14]:

X_trainf = combined_one_hot.drop(columns=["TARGET"]) #extracting training data without the target column
y_trainf = combined_one_hot["TARGET"]
                     
print(X_trainf.shape)
print(y_trainf)

(2070, 382)
0       Cancelled
1         Current
2         Current
3         Current
4       Cancelled
          ...    
2065    Cancelled
2066    Cancelled
2067    Cancelled
2068    Cancelled
2069    Cancelled
Name: TARGET, Length: 2070, dtype: object


In [15]:
#Feature selection FILTER METHOD
#Suppose, we select 25 features with top 25 Fisher scores
selector = SelectKBest(k=25)  ## try with top 50 and so son
#selector = SelectKBest(score_func=chi2, k=50)

#new_DF_TF_IDF = SelectKBest(score_func=chi2, k=50).fit_transform(DF_TF_IDF,y_train)
new_DF_TF_IDF = selector.fit_transform(X_trainf,y_trainf) #pulling top 25 features
print(new_DF_TF_IDF.shape)

feature_names_out = selector.get_support(indices=True)
print(feature_names_out)

DF_TF_IDF_SelectedFeatures= pd.DataFrame(new_DF_TF_IDF)
print(DF_TF_IDF_SelectedFeatures)

export_csv= DF_TF_IDF_SelectedFeatures.to_csv(r'/gdrive/My Drive/TextMining/TFIDF_counts-Selected Features.csv')


(2070, 25)
[  1   2   3   6   7   8  24  80 131 199 225 230 252 262 286 327 333 368
 369 371 372 375 376 380 381]
       0         1       2      3     4       5    6    7        8    9   ...  \
0     1.0  38000.00  229.64  23.56  0.00  206.08  0.0  0.0  0.00000  0.0  ...   
1     2.0  29616.00   75.29  29.78  0.00   45.50  0.0  0.0  0.00000  0.0  ...   
2     0.0  19732.80   47.25  24.81  0.00   22.44  0.0  0.0  0.00000  0.0  ...   
3     2.0     96.33   59.01  26.13  0.00   32.88  0.0  0.0  0.00000  0.0  ...   
4     2.0  52004.80   28.14   5.03  0.00   23.11  0.0  0.0  0.00000  0.0  ...   
...   ...       ...     ...    ...   ...     ...  ...  ...      ...  ...  ...   
2065  0.0  78851.30   29.04   0.37  0.00   28.66  0.0  0.0  0.44341  0.0  ...   
2066  1.0  17540.70   36.20  22.17  0.57   13.45  0.0  0.0  0.00000  0.0  ...   
2067  0.0  83891.90   74.40  28.92  0.00   45.47  0.0  0.0  0.00000  0.0  ...   
2068  2.0  28220.80   38.95  26.49  0.00   12.46  0.0  0.0  0.00000  0.0  ..

In [16]:
feature_index = selector.get_support(True)
print ("feature index =", feature_index)

feature index = [  1   2   3   6   7   8  24  80 131 199 225 230 252 262 286 327 333 368
 369 371 372 375 376 380 381]


In [17]:
#Feature selection 50 features
#Suppose, we select 50 features with top 50 Fisher scores
selector = SelectKBest(k=50)
#selector = SelectKBest(score_func=chi2, k=30)

#new_DF_TF_IDF = SelectKBest(score_func=chi2, k=50).fit_transform(DF_TF_IDF,y_train)
new_DF = selector.fit_transform(X_trainf,y_trainf)
print(new_DF.shape)

feature_names_out = selector.get_support(indices=True)
print(feature_names_out)

DF_TF_IDF_SelectedFeatures50= pd.DataFrame(new_DF)
print(DF_TF_IDF_SelectedFeatures50)


export_csv= DF_TF_IDF_SelectedFeatures50.to_csv(r'/gdrive/My Drive/TextMining/TFIDF_counts-Selected Features_50.csv')

(2070, 50)
[  1   2   3   6   7   8   9  10  24  45  59  60  61  72  80  91 125 128
 131 140 185 199 203 225 226 230 234 235 248 252 257 261 262 267 272 286
 321 327 329 333 352 356 368 369 371 372 375 376 380 381]
       0         1       2      3     4       5    6    7    8    9   ...  \
0     1.0  38000.00  229.64  23.56  0.00  206.08  0.0  0.0  0.0  0.0  ...   
1     2.0  29616.00   75.29  29.78  0.00   45.50  0.0  0.0  0.0  0.0  ...   
2     0.0  19732.80   47.25  24.81  0.00   22.44  0.0  0.0  0.0  0.0  ...   
3     2.0     96.33   59.01  26.13  0.00   32.88  1.0  0.0  0.0  0.0  ...   
4     2.0  52004.80   28.14   5.03  0.00   23.11  0.0  0.0  0.0  0.0  ...   
...   ...       ...     ...    ...   ...     ...  ...  ...  ...  ...  ...   
2065  0.0  78851.30   29.04   0.37  0.00   28.66  0.0  0.0  0.0  0.0  ...   
2066  1.0  17540.70   36.20  22.17  0.57   13.45  0.0  0.0  0.0  0.0  ...   
2067  0.0  83891.90   74.40  28.92  0.00   45.47  0.0  0.0  0.0  0.0  ...   
2068  2.0  2822

In [18]:
feature_index = selector.get_support(True)
print ("feature index =", feature_index)

feature index = [  1   2   3   6   7   8   9  10  24  45  59  60  61  72  80  91 125 128
 131 140 185 199 203 225 226 230 234 235 248 252 257 261 262 267 272 286
 321 327 329 333 352 356 368 369 371 372 375 376 380 381]


In [20]:
#Do feature selection - Wrapper using a Decision Tree classification model

dt = DecisionTreeClassifier(max_depth=5, random_state=0)

dt = dt.fit(X_trainf,y_trainf)
#print(clf.feature_importances_)

model = SelectFromModel(dt, prefit=True, max_features=10, threshold=-np.inf)
#model = SelectFromModel(clf, prefit=True)
X_new2 = model.transform(X_trainf)
X_new_SelectedFeaturesdt= pd.DataFrame(X_new2)
export_csv= X_new_SelectedFeaturesdt.to_csv(r'/gdrive/My Drive/TextMining/dt_selectedfeatures_wrapper.csv')
#print(model.get_support())
print(X_new_SelectedFeaturesdt)

           0    1         2          3      4     5       6    7    8    9
0        1.0  1.0  38000.00  24.393333  23.56  0.00  206.08  0.0  1.0  0.0
1        6.0  2.0  29616.00  49.426667  29.78  0.00   45.50  1.0  0.0  0.0
2        8.0  0.0  19732.80  50.673333  24.81  0.00   22.44  1.0  0.0  0.0
3       11.0  2.0     96.33  56.473333  26.13  0.00   32.88  0.0  1.0  0.0
4       14.0  2.0  52004.80  25.140000   5.03  0.00   23.11  1.0  0.0  0.0
...      ...  ...       ...        ...    ...   ...     ...  ...  ...  ...
2065  3821.0  0.0  78851.30  48.373333   0.37  0.00   28.66  0.0  1.0  0.0
2066  3822.0  1.0  17540.70  62.786667  22.17  0.57   13.45  0.0  1.0  1.0
2067  3823.0  0.0  83891.90  61.020000  28.92  0.00   45.47  1.0  0.0  0.0
2068  3824.0  2.0  28220.80  38.766667  26.49  0.00   12.46  1.0  0.0  0.0
2069  3825.0  0.0  28589.10  15.600000  13.19  0.00   87.09  0.0  1.0  0.0

[2070 rows x 10 columns]


In [21]:
#Do feature selection - Wrapper using a RF classification model

rf = RandomForestClassifier(max_depth=5, random_state=0)

rf = rf.fit(X_trainf,y_trainf)
#print(clf.feature_importances_)

model = SelectFromModel(rf, prefit=True, max_features=10, threshold=-np.inf)
#model = SelectFromModel(clf, prefit=True)
X_new = model.transform(X_trainf)
X_new_SelectedFeaturesRF= pd.DataFrame(X_new)
export_csv= X_new_SelectedFeaturesRF.to_csv(r'/gdrive/My Drive/TextMining/rf_selectedfeatures_wrapper.csv')
#print(model.get_support())
print(X_new_SelectedFeaturesRF)

        0         1       2          3      4     5       6    7    8    9
0     1.0  38000.00  229.64  24.393333  23.56  0.00  206.08  0.0  0.0  1.0
1     2.0  29616.00   75.29  49.426667  29.78  0.00   45.50  1.0  1.0  0.0
2     0.0  19732.80   47.25  50.673333  24.81  0.00   22.44  1.0  1.0  0.0
3     2.0     96.33   59.01  56.473333  26.13  0.00   32.88  1.0  0.0  1.0
4     2.0  52004.80   28.14  25.140000   5.03  0.00   23.11  0.0  1.0  0.0
...   ...       ...     ...        ...    ...   ...     ...  ...  ...  ...
2065  0.0  78851.30   29.04  48.373333   0.37  0.00   28.66  0.0  0.0  1.0
2066  1.0  17540.70   36.20  62.786667  22.17  0.57   13.45  0.0  0.0  1.0
2067  0.0  83891.90   74.40  61.020000  28.92  0.00   45.47  0.0  1.0  0.0
2068  2.0  28220.80   38.95  38.766667  26.49  0.00   12.46  0.0  1.0  0.0
2069  0.0  28589.10  100.28  15.600000  13.19  0.00   87.09  0.0  0.0  1.0

[2070 rows x 10 columns]


In [22]:
model.get_support()
#Get column names
cols = model.get_support(indices=True) #get column indices
print("\n cols = ", cols, "\n")


 cols =  [  1   2   3   4   6   7   8 369 371 372] 



In [23]:
#Model 1
#feature selection k=25
from sklearn.model_selection import train_test_split
X_train25, X_test25, y_train25, y_test25 = train_test_split(DF_TF_IDF_SelectedFeatures,y_trainf, test_size=0.20, random_state=1)

In [24]:
rf = RandomForestClassifier()
rf.fit(X_train25, y_train25)
rf_predict = rf.predict(X_test25)

print("Test Accuracy:", metrics.accuracy_score(y_test25,rf_predict))
print("Confusion Matrix for Random Forest:")
print(confusion_matrix(y_test25,rf_predict))
print('Printing the precision and recall, among other metrics')
print(metrics.classification_report(y_test25,rf_predict))

Test Accuracy: 0.8647342995169082
Confusion Matrix for Random Forest:
[[124  26]
 [ 30 234]]
Printing the precision and recall, among other metrics
              precision    recall  f1-score   support

   Cancelled       0.81      0.83      0.82       150
     Current       0.90      0.89      0.89       264

    accuracy                           0.86       414
   macro avg       0.85      0.86      0.85       414
weighted avg       0.87      0.86      0.87       414



In [25]:
dt = DecisionTreeClassifier()
dt.fit(X_train25, y_train25)
dt_predict = dt.predict(X_test25)

print("Test Accuracy:", metrics.accuracy_score(y_test25,dt_predict))
print("Confusion Matrix for Random Forest:")
print(confusion_matrix(y_test25,dt_predict))
print('Printing the precision and recall, among other metrics')
print(metrics.classification_report(y_test25,dt_predict))

Test Accuracy: 0.8599033816425121
Confusion Matrix for Random Forest:
[[131  19]
 [ 39 225]]
Printing the precision and recall, among other metrics
              precision    recall  f1-score   support

   Cancelled       0.77      0.87      0.82       150
     Current       0.92      0.85      0.89       264

    accuracy                           0.86       414
   macro avg       0.85      0.86      0.85       414
weighted avg       0.87      0.86      0.86       414



In [26]:
# with 50 features
X_train50, X_test50, y_train50, y_test50 = train_test_split(DF_TF_IDF_SelectedFeatures50,y_trainf, test_size=0.20, random_state=1)

In [27]:
rf = RandomForestClassifier()
rf.fit(X_train50, y_train50)
rf_predict = rf.predict(X_test50)

print("Test Accuracy:", metrics.accuracy_score(y_test50,rf_predict))
print("Confusion Matrix for Decision Tree:")
print(confusion_matrix(y_test50,rf_predict))
print('Printing the precision and recall, among other metrics')
print(metrics.classification_report(y_test50,rf_predict))

Test Accuracy: 0.8623188405797102
Confusion Matrix for Decision Tree:
[[124  26]
 [ 31 233]]
Printing the precision and recall, among other metrics
              precision    recall  f1-score   support

   Cancelled       0.80      0.83      0.81       150
     Current       0.90      0.88      0.89       264

    accuracy                           0.86       414
   macro avg       0.85      0.85      0.85       414
weighted avg       0.86      0.86      0.86       414



In [28]:
dtc = DecisionTreeClassifier()
dtc.fit(X_train50, y_train50)
dtc_predict =dtc.predict(X_test50)

print("Test Accuracy:", metrics.accuracy_score(y_test50,dtc_predict))
print("Confusion Matrix for Decision Tree:")
print(confusion_matrix(y_test50,dtc_predict))
print('Printing the precision and recall, among other metrics')
print(metrics.classification_report(y_test50,dtc_predict))

Test Accuracy: 0.8429951690821256
Confusion Matrix for Decision Tree:
[[126  24]
 [ 41 223]]
Printing the precision and recall, among other metrics
              precision    recall  f1-score   support

   Cancelled       0.75      0.84      0.79       150
     Current       0.90      0.84      0.87       264

    accuracy                           0.84       414
   macro avg       0.83      0.84      0.83       414
weighted avg       0.85      0.84      0.84       414



In [29]:
## WRAPPER METHOD Random Forest 
X_trainRF, X_testRF, y_trainRF, y_testRF = train_test_split(X_new_SelectedFeaturesRF,y_train, test_size=0.20, random_state=1)

In [30]:
rf = RandomForestClassifier()
rf.fit(X_trainRF, y_trainRF)
rf_predict = rf.predict(X_testRF)

print("Test Accuracy:", metrics.accuracy_score(y_testRF,rf_predict))
print("Confusion Matrix for Decision Tree:")
print(confusion_matrix(y_testRF,rf_predict))
print('Printing the precision and recall, among other metrics')
print(metrics.classification_report(y_testRF,rf_predict))

Test Accuracy: 0.8743961352657005
Confusion Matrix for Decision Tree:
[[126  24]
 [ 28 236]]
Printing the precision and recall, among other metrics
              precision    recall  f1-score   support

   Cancelled       0.82      0.84      0.83       150
     Current       0.91      0.89      0.90       264

    accuracy                           0.87       414
   macro avg       0.86      0.87      0.86       414
weighted avg       0.88      0.87      0.87       414



In [31]:
dt = DecisionTreeClassifier()
dt.fit(X_trainRF, y_trainRF)
dt_predict = dt.predict(X_testRF)

print("Test Accuracy:", metrics.accuracy_score(y_testRF,dt_predict))
print("Confusion Matrix for Decision Tree:")
print(confusion_matrix(y_testRF,rf_predict))
print('Printing the precision and recall, among other metrics')
print(metrics.classification_report(y_testRF,rf_predict))

Test Accuracy: 0.8647342995169082
Confusion Matrix for Decision Tree:
[[126  24]
 [ 28 236]]
Printing the precision and recall, among other metrics
              precision    recall  f1-score   support

   Cancelled       0.82      0.84      0.83       150
     Current       0.91      0.89      0.90       264

    accuracy                           0.87       414
   macro avg       0.86      0.87      0.86       414
weighted avg       0.88      0.87      0.87       414



In [32]:
## WRAPPER METHOD DT
X_trainDT, X_testDT, y_trainDT, y_testDT = train_test_split(X_new_SelectedFeaturesdt,y_train, test_size=0.20, random_state=1)

In [33]:
dtc = DecisionTreeClassifier()
dtc.fit(X_trainDT, y_trainDT)
rf_predict = rf.predict(X_testDT)

print("Test Accuracy:", metrics.accuracy_score(y_testDT,rf_predict))
print("Confusion Matrix for Decision Tree:")
print(confusion_matrix(y_testDT,rf_predict))
print('Printing the precision and recall, among other metrics')
print(metrics.classification_report(y_testDT,rf_predict))

Test Accuracy: 0.6328502415458938
Confusion Matrix for Decision Tree:
[[ 19 131]
 [ 21 243]]
Printing the precision and recall, among other metrics
              precision    recall  f1-score   support

   Cancelled       0.47      0.13      0.20       150
     Current       0.65      0.92      0.76       264

    accuracy                           0.63       414
   macro avg       0.56      0.52      0.48       414
weighted avg       0.59      0.63      0.56       414



In [34]:
rf = RandomForestClassifier()
rf.fit(X_trainDT, y_train25)
rf_predict = rf.predict(X_testRF)

print("Test Accuracy:", metrics.accuracy_score(y_testDT,rf_predict))
print("Confusion Matrix for Decision Tree:")
print(confusion_matrix(y_testDT,rf_predict))
print('Printing the precision and recall, among other metrics')
print(metrics.classification_report(y_testDT,rf_predict))

Test Accuracy: 0.6304347826086957
Confusion Matrix for Decision Tree:
[[  3 147]
 [  6 258]]
Printing the precision and recall, among other metrics
              precision    recall  f1-score   support

   Cancelled       0.33      0.02      0.04       150
     Current       0.64      0.98      0.77       264

    accuracy                           0.63       414
   macro avg       0.49      0.50      0.40       414
weighted avg       0.53      0.63      0.51       414

