In [2]:

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel

#NLTK-------------------------------
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# Import libraries for feature 
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2


from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix,classification_report
from sklearn import metrics
from sklearn.model_selection import cross_val_score

import warnings
warnings.filterwarnings("ignore")


from google.colab import drive
drive.mount('/gdrive')
#Change current working directory to gdrive
%cd /gdrive


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
Mounted at /gdrive
/gdrive


In [3]:
#Read files
textfile = r'/gdrive/My Drive/Textmining/Comments.csv'
textData = pd.read_csv(textfile) #creates a dataframe

CustInfofile = r'/gdrive/My Drive/Textmining/Customers.csv'
CustInfoData = pd.read_csv(CustInfofile)  #creates a dataframe

print(textData.shape)
print(CustInfoData.shape)


(2070, 2)
(2070, 17)


In [4]:
#Extract target column from Customer Info file
y_train = CustInfoData["TARGET"]
X_train = CustInfoData.drop(columns=["TARGET"]) #extracting training data without the target column
                     
print(X_train.shape)
print(textData.shape)
print(textData.head())
print(y_train)

(2070, 16)
(2070, 2)
     ID                                           Comments
0  1309  Does not like the way the phone works. It is t...
1  3556  Wanted to know the nearest store location. Wan...
2  2230  Wants to know how to do text messaging. Referr...
3  2312  Asked how to disable call waiting. referred hi...
4  3327  Needs help learning how to use the phone. I su...
0       Cancelled
1         Current
2         Current
3         Current
4       Cancelled
          ...    
2065    Cancelled
2066    Cancelled
2067    Cancelled
2068    Cancelled
2069    Cancelled
Name: TARGET, Length: 2070, dtype: object


In [5]:
#Tokenize - Split the sentences to lists of words
textData['CommentsTokenized'] = textData['Comments'].apply(word_tokenize)

export_csv = textData.to_csv(r'/gdrive/My Drive/Textmining/TextDataTokenized_sou.csv')




In [107]:
# Use English stemmer.
stemmer = SnowballStemmer("english")

#Now do stemming - create a new dataframe to store stemmed version
newTextData=pd.DataFrame()
newTextData=textData.drop(columns=["CommentsTokenized","Comments"])
newTextData['CommentsTokenizedStemmed'] = textData['CommentsTokenized'].apply(lambda x: [stemmer.stem(y) for y in x]) # Stem every word.

export_csv = newTextData.to_csv(r'/gdrive/My Drive/Textmining/snowball.csv')


In [108]:
stemmer1 = PorterStemmer()

#Now do stemming - create a new dataframe to store stemmed version
newTextData1=pd.DataFrame()
newTextData1=textData.drop(columns=["CommentsTokenized","Comments"])
newTextData1['CommentsTokenizedStemmed'] = textData['CommentsTokenized'].apply(lambda x: [stemmer1.stem(y) for y in x]) # Stem every word.

export_csv = newTextData1.to_csv(r'/gdrive/My Drive/Textmining/porter.csv')

In [109]:

#using snowball stemmer for further proceedings
#Join stemmed strings
newTextData['CommentsTokenizedStemmed'] = newTextData['CommentsTokenizedStemmed'].apply(lambda x: " ".join(x))

export_csv = newTextData.to_csv(r'/gdrive/My Drive/Textmining/newTextData-Joined_sou.csv')

In [110]:
#Do Bag-Of-Words model - Term - Document Matrix
#Learn the vocabulary dictionary and return term-document matrix.
#count_vect = CountVectorizer(stop_words=None)
count_vect = CountVectorizer(stop_words='english',lowercase=False)
TD_counts = count_vect.fit_transform(newTextData.CommentsTokenizedStemmed)
print(TD_counts.shape)
print(TD_counts.dtype)
print(count_vect.get_feature_names())
#print(TD_counts)
DF_TD_Counts=pd.DataFrame(TD_counts.toarray())
print(DF_TD_Counts)
export_csv = DF_TD_Counts.to_csv(r'/gdrive/My Drive/Textmining/TD_counts-TokenizedStemmed_sou.csv')


(2070, 354)
int64
['3399', '3g', 'abysm', 'access', 'accessori', 'adapt', 'add', 'addit', 'additon', 'address', 'adit', 'adress', 'advertis', 'afraid', 'alway', 'angel', 'angri', 'ani', 'anoth', 'anyth', 'anytim', 'area', 'asap', 'ask', 'bad', 'basic', 'bateri', 'batteri', 'becaus', 'believ', 'better', 'bigger', 'book', 'bought', 'brain', 'bring', 'built', 'busi', 'button', 'buy', 'cancel', 'cancer', 'car', 'care', 'carrier', 'caus', 'cc', 'cell', 'certain', 'chang', 'charg', 'charger', 'check', 'chip', 'citi', 'claim', 'cleariti', 'cold', 'comapr', 'compani', 'compar', 'competit', 'complain', 'complaint', 'concept', 'connect', 'consisit', 'consist', 'constan', 'contact', 'continu', 'contract', 'correct', 'cost', 'coupl', 'cover', 'coverag', 'creat', 'credit', 'cstmer', 'cstmr', 'current', 'cust', 'custom', 'customr', 'date', 'day', 'dead', 'decent', 'defect', 'deo', 'did', 'die', 'differ', 'difficult', 'digiti', 'direct', 'disabl', 'doe', 'don', 'dont', 'drop', 'dure', 'easier', 'effe

In [111]:
#Compute TF-IDF Matrix
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(TD_counts)
print(X_train_tfidf.shape)
DF_TF_IDF=pd.DataFrame(X_train_tfidf.toarray())
print(DF_TF_IDF)
export_csv= DF_TF_IDF.to_csv(r'/gdrive/My Drive/Textmining/TFIDF_counts-TokenizedStemmed_sou.csv')


(2070, 354)
      0    1    2    3        4    5    ...  348  349  350  351  352  353
0     0.0  0.0  0.0  0.0  0.00000  0.0  ...  0.0  0.0  0.0  0.0  0.0  0.0
1     0.0  0.0  0.0  0.0  0.27568  0.0  ...  0.0  0.0  0.0  0.0  0.0  0.0
2     0.0  0.0  0.0  0.0  0.00000  0.0  ...  0.0  0.0  0.0  0.0  0.0  0.0
3     0.0  0.0  0.0  0.0  0.00000  0.0  ...  0.0  0.0  0.0  0.0  0.0  0.0
4     0.0  0.0  0.0  0.0  0.00000  0.0  ...  0.0  0.0  0.0  0.0  0.0  0.0
...   ...  ...  ...  ...      ...  ...  ...  ...  ...  ...  ...  ...  ...
2065  0.0  0.0  0.0  0.0  0.00000  0.0  ...  0.0  0.0  0.0  0.0  0.0  0.0
2066  0.0  0.0  0.0  0.0  0.00000  0.0  ...  0.0  0.0  0.0  0.0  0.0  0.0
2067  0.0  0.0  0.0  0.0  0.00000  0.0  ...  0.0  0.0  0.0  0.0  0.0  0.0
2068  0.0  0.0  0.0  0.0  0.00000  0.0  ...  0.0  0.0  0.0  0.0  0.0  0.0
2069  0.0  0.0  0.0  0.0  0.00000  0.0  ...  0.0  0.0  0.0  0.0  0.0  0.0

[2070 rows x 354 columns]


In [115]:
combined=pd.concat([DF_TF_IDF,CustInfoData], axis=1)
print(combined.shape)

(2070, 371)


In [116]:
#Do one Hot encoding for categorical features
X_cat = ["Sex","Status","Car_Owner","Paymethod","LocalBilltype","LongDistanceBilltype"]
#X_cat = combined.select_dtypes(exclude=['int','float64'])
print(X_cat)
combined_one_hot = pd.get_dummies(combined,columns=X_cat)
print(combined_one_hot.shape)
export_csv= combined_one_hot.to_csv(r'/gdrive/My Drive/Textmining/combined_one_hot_sou.csv')

['Sex', 'Status', 'Car_Owner', 'Paymethod', 'LocalBilltype', 'LongDistanceBilltype']
(2070, 379)


In [117]:
#Feature selection
y = combined_one_hot["TARGET"]
x = combined_one_hot.drop(columns=["TARGET"])
combined_features= SelectKBest(score_func=chi2,k=40).fit_transform(x,y)
print(combined_features.shape)

combined_features =pd.DataFrame(combined_features)
print(combined_features)



(2070, 40)
       0    1         2    3    4         5   ...   34   35   36   37   38   39
0     0.0  0.0  0.000000  0.0  0.0  0.000000  ...  0.0  1.0  0.0  1.0  1.0  0.0
1     0.0  0.0  0.000000  0.0  0.0  0.000000  ...  1.0  0.0  0.0  0.0  0.0  1.0
2     0.0  0.0  0.000000  0.0  0.0  0.000000  ...  1.0  0.0  0.0  1.0  0.0  1.0
3     0.0  0.0  0.000000  0.0  0.0  0.000000  ...  0.0  1.0  0.0  1.0  0.0  1.0
4     0.0  0.0  0.000000  0.0  0.0  0.000000  ...  1.0  0.0  0.0  0.0  1.0  0.0
...   ...  ...       ...  ...  ...       ...  ...  ...  ...  ...  ...  ...  ...
2065  0.0  0.0  0.446161  0.0  0.0  0.460113  ...  0.0  1.0  0.0  1.0  0.0  1.0
2066  0.0  0.0  0.000000  0.0  0.0  0.000000  ...  0.0  1.0  1.0  0.0  0.0  1.0
2067  0.0  0.0  0.000000  0.0  0.0  0.000000  ...  1.0  0.0  0.0  0.0  0.0  1.0
2068  0.0  0.0  0.000000  0.0  0.0  0.000000  ...  1.0  0.0  0.0  1.0  0.0  1.0
2069  0.0  0.0  0.000000  0.0  0.0  0.000000  ...  0.0  1.0  0.0  1.0  0.0  1.0

[2070 rows x 40 columns]


In [118]:
#Construct a Random Forest Classifier on text data
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
x_train,x_test,y_train,y_test=train_test_split(combined_features,y,test_size=0.2)

clf=RandomForestClassifier()
RF_text = clf.fit(x_train,y_train)

rf_predictions = clf.predict(x_test)

print("Accuracy score :",accuracy_score(y_test, rf_predictions))
print("Confusion Matrix:")
print(confusion_matrix(y_test, rf_predictions))
print("Classification Report")
print(classification_report(y_test, rf_predictions))


Accuracy score : 0.8671497584541062
Confusion Matrix:
[[126  29]
 [ 26 233]]
Classification Report
              precision    recall  f1-score   support

   Cancelled       0.83      0.81      0.82       155
     Current       0.89      0.90      0.89       259

    accuracy                           0.87       414
   macro avg       0.86      0.86      0.86       414
weighted avg       0.87      0.87      0.87       414



In [119]:
#Do feature selection using a classification model
#clf = ExtraTreesClassifier(n_estimators=50)
clf = GradientBoostingClassifier(n_estimators=50)
#clf = DecisionTreeClassifier()
clf = clf.fit(x_train,y_train)
clf_predictions = clf.predict(x_test)

print("Accuracy score :",accuracy_score(y_test, clf_predictions))
print("Confusion Matrix:")
print(confusion_matrix(y_test, clf_predictions))
print("Classification Report")
print(classification_report(y_test, clf_predictions))


print(clf.feature_importances_)
#model = SelectFromModel(clf, prefit=True)
model = SelectFromModel(clf, prefit=True, max_features=7, threshold=-np.inf)
#model = SelectFromModel(clf, prefit=True)
X_new = model.transform(x_train)
X_new_SelectedFeatures= pd.DataFrame(X_new)
export_csv= X_new_SelectedFeatures.to_csv(r'/gdrive/My Drive/Textmining/X_new_SelectedFeatures_sou.csv')
print(model.get_support())
print(X_new_SelectedFeatures)
#print(X_new_SelectedFeatures.shape)
#print(X_new_SelectedFeatures.head())


Accuracy score : 0.8743961352657005
Confusion Matrix:
[[125  30]
 [ 22 237]]
Classification Report
              precision    recall  f1-score   support

   Cancelled       0.85      0.81      0.83       155
     Current       0.89      0.92      0.90       259

    accuracy                           0.87       414
   macro avg       0.87      0.86      0.86       414
weighted avg       0.87      0.87      0.87       414

[0.00000000e+00 0.00000000e+00 1.28036138e-03 6.52765131e-04
 0.00000000e+00 0.00000000e+00 1.68837434e-03 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 1.29173806e-03 0.00000000e+00
 0.00000000e+00 0.00000000e+00 1.01772141e-03 0.00000000e+00
 1.89364536e-03 0.00000000e+00 0.00000000e+00 1.97498101e-02
 1.77646512e-01 2.04782851e-01 4.64813417e-02 1.17047956e-01
 8.84903427e-02 8.15469429e-02 3.25648349e-02 3.61068472e-03
 1.80248616e-02 2.18169771e-02 5.80271757e-02 6.70940252e-02
 5.08136113e-02 1.5674257

In [120]:
y = combined_one_hot["TARGET"]
x = combined_one_hot.drop(columns=["TARGET"])
combined_features1= SelectKBest(score_func=chi2,k=50).fit_transform(x,y)
print(combined_features1.shape)

combined_features1 =pd.DataFrame(combined_features1)
print(combined_features1)

(2070, 50)
            0    1    2    3         4         5   ...   44   45   46   47   48   49
0     0.000000  0.0  0.0  0.0  0.000000  0.000000  ...  0.0  1.0  0.0  1.0  1.0  0.0
1     0.000000  0.0  0.0  0.0  0.000000  0.000000  ...  1.0  0.0  0.0  0.0  0.0  1.0
2     0.000000  0.0  0.0  0.0  0.000000  0.000000  ...  1.0  0.0  0.0  1.0  0.0  1.0
3     0.000000  0.0  0.0  0.0  0.000000  0.000000  ...  0.0  1.0  0.0  1.0  0.0  1.0
4     0.000000  0.0  0.0  0.0  0.000000  0.000000  ...  1.0  0.0  0.0  0.0  1.0  0.0
...        ...  ...  ...  ...       ...       ...  ...  ...  ...  ...  ...  ...  ...
2065  0.000000  0.0  0.0  0.0  0.000000  0.446161  ...  0.0  1.0  0.0  1.0  0.0  1.0
2066  0.000000  0.0  0.0  0.0  0.000000  0.000000  ...  0.0  1.0  1.0  0.0  0.0  1.0
2067  0.000000  0.0  0.0  0.0  0.000000  0.000000  ...  1.0  0.0  0.0  0.0  0.0  1.0
2068  0.772949  0.0  0.0  0.0  0.545354  0.000000  ...  1.0  0.0  0.0  1.0  0.0  1.0
2069  0.000000  0.0  0.0  0.0  0.000000  0.000000  ...

In [121]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
x_train,x_test,y_train,y_test=train_test_split(combined_features1,y,test_size=0.2)

clf=RandomForestClassifier()
RF_text = clf.fit(x_train,y_train)

rf_predictions = clf.predict(x_test)

print("Accuracy score :",accuracy_score(y_test, rf_predictions))
print("Confusion Matrix:")
print(confusion_matrix(y_test, rf_predictions))
print("Classification Report")
print(classification_report(y_test, rf_predictions))


Accuracy score : 0.8623188405797102
Confusion Matrix:
[[124  30]
 [ 27 233]]
Classification Report
              precision    recall  f1-score   support

   Cancelled       0.82      0.81      0.81       154
     Current       0.89      0.90      0.89       260

    accuracy                           0.86       414
   macro avg       0.85      0.85      0.85       414
weighted avg       0.86      0.86      0.86       414



In [122]:
#Do feature selection using a classification model
#clf = ExtraTreesClassifier(n_estimators=50)
clf = GradientBoostingClassifier(n_estimators=50)
#clf = DecisionTreeClassifier()
clf = clf.fit(x_train,y_train)
clf_predictions = clf.predict(x_test)

print("Accuracy score :",accuracy_score(y_test, clf_predictions))
print("Confusion Matrix:")
print(confusion_matrix(y_test, clf_predictions))
print("Classification Report")
print(classification_report(y_test, clf_predictions))


print(clf.feature_importances_)
#model = SelectFromModel(clf, prefit=True)
model = SelectFromModel(clf, prefit=True, max_features=7, threshold=-np.inf)
#model = SelectFromModel(clf, prefit=True)
X_new = model.transform(x_train)
X_new_SelectedFeatures= pd.DataFrame(X_new)
export_csv= X_new_SelectedFeatures.to_csv(r'/gdrive/My Drive/Textmining/X_new_SelectedFeatures_sou.csv')
print(model.get_support())
print(X_new_SelectedFeatures)
#print(X_new_SelectedFeatures.shape)
#print(X_new_SelectedFeatures.head())


Accuracy score : 0.8405797101449275
Confusion Matrix:
[[124  30]
 [ 36 224]]
Classification Report
              precision    recall  f1-score   support

   Cancelled       0.78      0.81      0.79       154
     Current       0.88      0.86      0.87       260

    accuracy                           0.84       414
   macro avg       0.83      0.83      0.83       414
weighted avg       0.84      0.84      0.84       414

[0.00000000e+00 0.00000000e+00 4.92709038e-04 0.00000000e+00
 3.78547102e-04 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 7.78835919e-04 5.26286697e-04
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 1.85182889e-06 1.83580453e-03 0.00000000e+00 0.00000000e+00
 6.50691515e-04 1.35030663e-03 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 1.27541739e-03 7.43087099e-04 0.00000000e+00 0.00000000e+00
 0.00000000e+00 1.48457314e-02 1.73984747e-01 2.03224963e-01
 4.10967395e-02 1.4084286

In [123]:
y = combined_one_hot["TARGET"]
x = combined_one_hot.drop(columns=["TARGET"])
combined_features2= SelectKBest(score_func=chi2,k=60).fit_transform(x,y)
print(combined_features2.shape)

combined_features2 =pd.DataFrame(combined_features2)
print(combined_features2)

(2070, 60)
            0    1    2    3         4         5   ...   54   55   56   57   58   59
0     0.000000  0.0  0.0  0.0  0.000000  0.000000  ...  0.0  1.0  0.0  1.0  1.0  0.0
1     0.000000  0.0  0.0  0.0  0.000000  0.000000  ...  1.0  0.0  0.0  0.0  0.0  1.0
2     0.000000  0.0  0.0  0.0  0.000000  0.000000  ...  1.0  0.0  0.0  1.0  0.0  1.0
3     0.000000  0.0  0.0  0.0  0.000000  0.000000  ...  0.0  1.0  0.0  1.0  0.0  1.0
4     0.000000  0.0  0.0  0.0  0.000000  0.000000  ...  1.0  0.0  0.0  0.0  1.0  0.0
...        ...  ...  ...  ...       ...       ...  ...  ...  ...  ...  ...  ...  ...
2065  0.000000  0.0  0.0  0.0  0.000000  0.446161  ...  0.0  1.0  0.0  1.0  0.0  1.0
2066  0.000000  0.0  0.0  0.0  0.000000  0.000000  ...  0.0  1.0  1.0  0.0  0.0  1.0
2067  0.000000  0.0  0.0  0.0  0.000000  0.000000  ...  1.0  0.0  0.0  0.0  0.0  1.0
2068  0.772949  0.0  0.0  0.0  0.545354  0.000000  ...  1.0  0.0  0.0  1.0  0.0  1.0
2069  0.000000  0.0  0.0  0.0  0.000000  0.000000  ...

In [124]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
x_train,x_test,y_train,y_test=train_test_split(combined_features2,y,test_size=0.2)

clf=RandomForestClassifier()
RF_text = clf.fit(x_train,y_train)

rf_predictions = clf.predict(x_test)

print("Accuracy score :",accuracy_score(y_test, rf_predictions))
print("Confusion Matrix:")
print(confusion_matrix(y_test, rf_predictions))
print("Classification Report")
print(classification_report(y_test, rf_predictions))


Accuracy score : 0.8792270531400966
Confusion Matrix:
[[146  23]
 [ 27 218]]
Classification Report
              precision    recall  f1-score   support

   Cancelled       0.84      0.86      0.85       169
     Current       0.90      0.89      0.90       245

    accuracy                           0.88       414
   macro avg       0.87      0.88      0.88       414
weighted avg       0.88      0.88      0.88       414



In [125]:
#Do feature selection using a classification model
#clf = ExtraTreesClassifier(n_estimators=50)
clf = GradientBoostingClassifier(n_estimators=50)
#clf = DecisionTreeClassifier()
clf = clf.fit(x_train,y_train)
clf_predictions = clf.predict(x_test)

print("Accuracy score :",accuracy_score(y_test, clf_predictions))
print("Confusion Matrix:")
print(confusion_matrix(y_test, clf_predictions))
print("Classification Report")
print(classification_report(y_test, clf_predictions))


print(clf.feature_importances_)
#model = SelectFromModel(clf, prefit=True)
model = SelectFromModel(clf, prefit=True, max_features=7, threshold=-np.inf)
#model = SelectFromModel(clf, prefit=True)
X_new = model.transform(x_train)
X_new_SelectedFeatures= pd.DataFrame(X_new)
export_csv= X_new_SelectedFeatures.to_csv(r'/gdrive/My Drive/Textmining/X_new_SelectedFeatures_sou.csv')
print(model.get_support())
print(X_new_SelectedFeatures)
#print(X_new_SelectedFeatures.shape)
#print(X_new_SelectedFeatures.head())


Accuracy score : 0.855072463768116
Confusion Matrix:
[[133  36]
 [ 24 221]]
Classification Report
              precision    recall  f1-score   support

   Cancelled       0.85      0.79      0.82       169
     Current       0.86      0.90      0.88       245

    accuracy                           0.86       414
   macro avg       0.85      0.84      0.85       414
weighted avg       0.85      0.86      0.85       414

[0.         0.         0.         0.         0.00112204 0.
 0.         0.00054626 0.         0.00060791 0.         0.
 0.         0.         0.         0.         0.00063862 0.00035792
 0.         0.         0.         0.         0.00229245 0.
 0.         0.         0.         0.00196249 0.         0.
 0.         0.         0.         0.         0.         0.00049521
 0.         0.         0.         0.         0.         0.
 0.00027538 0.01704995 0.15553072 0.14983737 0.04452745 0.14341817
 0.10249767 0.07999459 0.01890756 0.00037148 0.03405387 0.02969745
 0.02255326 

In [126]:
#Sequential Forward Search
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

y = combined_one_hot["TARGET"]
x = combined_one_hot.drop(columns=["TARGET"])

clf = DecisionTreeClassifier()
sfs1 = SFS(clf, 
           k_features=7, 
           forward=True, 
           floating=False, 
           verbose=2,
           scoring='accuracy',
           cv=0)

sfs1 = sfs1.fit(x,y)
sfs1.subsets_


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 378 out of 378 | elapsed:    1.9s finished

[2020-10-31 04:10:47] Features: 1/7 -- score: 1.0[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 377 out of 377 | elapsed:    3.3s finished

[2020-10-31 04:10:50] Features: 2/7 -- score: 1.0[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 376 out of 376 | elapsed:    3.3s finished

[2020-10-31 04:10:53] Features: 3/7 -- score: 1.0[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0

{1: {'avg_score': 1.0,
  'cv_scores': array([1.]),
  'feature_idx': (354,),
  'feature_names': ('ID',)},
 2: {'avg_score': 1.0,
  'cv_scores': array([1.]),
  'feature_idx': (0, 354),
  'feature_names': (0, 'ID')},
 3: {'avg_score': 1.0,
  'cv_scores': array([1.]),
  'feature_idx': (0, 1, 354),
  'feature_names': (0, 1, 'ID')},
 4: {'avg_score': 1.0,
  'cv_scores': array([1.]),
  'feature_idx': (0, 1, 2, 354),
  'feature_names': (0, 1, 2, 'ID')},
 5: {'avg_score': 1.0,
  'cv_scores': array([1.]),
  'feature_idx': (0, 1, 2, 3, 354),
  'feature_names': (0, 1, 2, 3, 'ID')},
 6: {'avg_score': 1.0,
  'cv_scores': array([1.]),
  'feature_idx': (0, 1, 2, 3, 4, 354),
  'feature_names': (0, 1, 2, 3, 4, 'ID')},
 7: {'avg_score': 1.0,
  'cv_scores': array([1.]),
  'feature_idx': (0, 1, 2, 3, 4, 5, 354),
  'feature_names': (0, 1, 2, 3, 4, 5, 'ID')}}

In [127]:
#Sequential forward search result
print(sfs1.k_feature_names_)
print(sfs1.k_score_)

(0, 1, 2, 3, 4, 5, 'ID')
1.0


In [128]:
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

y = combined_one_hot["TARGET"]
x = combined_one_hot.drop(columns=["TARGET"])

rf = RandomForestClassifier()
sfs2 = SFS(rf, 
           k_features=7, 
           forward=True, 
           floating=False, 
           verbose=2,
           scoring='accuracy',
           cv=0)

sfs2 = sfs2.fit(x,y)
sfs2.subsets_

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 378 out of 378 | elapsed:   54.8s finished

[2020-10-31 04:13:18] Features: 1/7 -- score: 0.9995169082125603[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.4s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 377 out of 377 | elapsed:  2.4min finished

[2020-10-31 04:15:41] Features: 2/7 -- score: 1.0[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.4s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 376 out of 376 | elapsed:  2.4min finished

[2020-10-31 04:18:04] Features: 3/7 -- score: 1.0[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.4s 

{1: {'avg_score': 0.9995169082125603,
  'cv_scores': array([0.99951691]),
  'feature_idx': (354,),
  'feature_names': ('ID',)},
 2: {'avg_score': 1.0,
  'cv_scores': array([1.]),
  'feature_idx': (0, 354),
  'feature_names': (0, 'ID')},
 3: {'avg_score': 1.0,
  'cv_scores': array([1.]),
  'feature_idx': (0, 1, 354),
  'feature_names': (0, 1, 'ID')},
 4: {'avg_score': 1.0,
  'cv_scores': array([1.]),
  'feature_idx': (0, 1, 3, 354),
  'feature_names': (0, 1, 3, 'ID')},
 5: {'avg_score': 1.0,
  'cv_scores': array([1.]),
  'feature_idx': (0, 1, 3, 4, 354),
  'feature_names': (0, 1, 3, 4, 'ID')},
 6: {'avg_score': 1.0,
  'cv_scores': array([1.]),
  'feature_idx': (0, 1, 3, 4, 5, 354),
  'feature_names': (0, 1, 3, 4, 5, 'ID')},
 7: {'avg_score': 1.0,
  'cv_scores': array([1.]),
  'feature_idx': (0, 1, 2, 3, 4, 5, 354),
  'feature_names': (0, 1, 2, 3, 4, 5, 'ID')}}

In [129]:
new_features= list(sfs2.k_feature_names_)
x=combined_one_hot[new_features].values
y= combined_one_hot["TARGET"].values

x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2)

clf=RandomForestClassifier()
RF_text = clf.fit(x_train,y_train)

rf_predictions = clf.predict(x_test)

print("Accuracy score :",accuracy_score(y_test, rf_predictions))
print("Confusion Matrix:")
print(confusion_matrix(y_test, rf_predictions))
print("Classification Report")
print(classification_report(y_test, rf_predictions))

clf1=DecisionTreeClassifier()
DF_text = clf1.fit(x_train,y_train)

df_predictions = clf1.predict(x_test)

print("Accuracy score :",accuracy_score(y_test, df_predictions))
print("Confusion Matrix:")
print(confusion_matrix(y_test, df_predictions))
print("Classification Report")
print(classification_report(y_test, df_predictions))



Accuracy score : 0.5797101449275363
Confusion Matrix:
[[ 63  84]
 [ 90 177]]
Classification Report
              precision    recall  f1-score   support

   Cancelled       0.41      0.43      0.42       147
     Current       0.68      0.66      0.67       267

    accuracy                           0.58       414
   macro avg       0.54      0.55      0.55       414
weighted avg       0.58      0.58      0.58       414

Accuracy score : 0.5700483091787439
Confusion Matrix:
[[ 63  84]
 [ 94 173]]
Classification Report
              precision    recall  f1-score   support

   Cancelled       0.40      0.43      0.41       147
     Current       0.67      0.65      0.66       267

    accuracy                           0.57       414
   macro avg       0.54      0.54      0.54       414
weighted avg       0.58      0.57      0.57       414

