In [80]:
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostRegressor
from sklearn.feature_selection import RFE, SelectFromModel, SelectKBest, chi2

In [81]:
data = pd.read_csv("data/SMS.tsv", delimiter="\t")

In [82]:
vectorizer = TfidfVectorizer(stop_words="english", max_features=50)
X = vectorizer.fit_transform(data["text"].drop(columns=["class"]))
X, Y = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out()), data["class"].apply(str).apply(
    lambda x: int(x == "spam"))

In [83]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

### Фильтрующий метод

In [84]:
words = {i: abs(Y_train.corr(X_train[i])) for i in X_train.columns}
filter_result = [key for key, _ in sorted(words.items(), key=lambda item: item[1], reverse=True)][:30]
print("\n".join(filter_result))

txt
free
claim
mobile
reply
stop
text
new
ur
week
msg
send
gt
lt
ok
phone
ll
come
home
lor
got
later
da
sorry
oh
love
good
going
wat
did


### Метод-обертка

In [85]:
knn = KNeighborsClassifier(n_neighbors=5)
wrapper_result = []
for i in range(30):
    scores = {}
    for chunk in X_train:
        train, test = pd.DataFrame(X_train, columns=wrapper_result), pd.DataFrame(X_test, columns=wrapper_result)
        train[chunk], test[chunk] = X_train[chunk], X_test[chunk]

        knn.fit(train.values, Y_train)

        scores[chunk] = accuracy_score(Y_test, knn.predict(test.values))
    for key, _ in sorted(scores.items(), key=lambda item: item[1], reverse=True):
        if key not in wrapper_result:
            wrapper_result.append(key)
            break

In [86]:
print("\n".join(wrapper_result))

txt
claim
mobile
text
hope
msg
come
gt
stop
going
ll
da
dear
did
free
day
good
great
hey
lor
dont
happy
home
later
love
lt
need
night
oh
ok


### Встроенный метод

In [87]:
rfc = RandomForestClassifier()
rfc.fit(X_train, Y_train)
res_pairs = sorted([(rfc.feature_importances_[i], X.columns[i]) for i in range(len(X.columns))], key=lambda x: x[0],
                   reverse=True)

In [88]:
random_forest_result = [j for _, j in res_pairs][:30]
print("\n".join(random_forest_result))

claim
txt
free
mobile
reply
stop
text
new
send
ur
msg
phone
week
just
time
got
ll
ok
know
pls
like
day
hi
love
today
good
want
lt
gt
don


### Библиотечные методы

In [89]:
rfe = RFE(
    RandomForestClassifier(n_estimators=10, random_state=42),
    n_features_to_select=30,
    step=2,
)
rfe.fit_transform(X_train, Y_train)

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.66833948, 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [90]:
rfe_result = X_train.iloc[:, rfe.get_support()].columns.tolist()[:30]
print("\n".join(rfe_result))

claim
come
day
free
good
got
gt
hey
hi
just
know
like
ll
lt
mobile
msg
new
ok
phone
pls
reply
send
stop
text
time
today
txt
ur
want
week


In [91]:
sfm = SelectFromModel(AdaBoostRegressor(random_state=42, n_estimators=30))
sfm.fit_transform(X_train, Y_train)

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [92]:
sfm_result = X_train.iloc[:, sfm.get_support()].columns.tolist()[:30]
print("\n".join(sfm_result))

claim
free
ll
mobile
new
reply
send
stop
text
txt
ur


In [93]:
skb = SelectKBest(score_func=chi2, k=30)
skb.fit_transform(X_train, Y_train)

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.66833948, 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [94]:
skb_result = X_train.iloc[:, skb.get_support()].columns.tolist()[:30]
print("\n".join(skb_result))

claim
come
da
did
free
going
good
got
gt
home
later
ll
lor
love
lt
mobile
msg
new
oh
ok
phone
reply
send
sorry
stop
text
txt
ur
wat
week


### Сравнение

In [95]:
results = [filter_result, wrapper_result, random_forest_result, rfe_result, sfm_result, skb_result]
names = ["Фильтрующий метод", "Метод-обертка", "Встроенный метод", "RFE", "SelectFromModel", "SelectKBest"]
for i in range(6):
    print(names[i])
    for j in range(6):
        if i != j:
            print(f"\t{names[j]}: {len(set(results[i]).intersection(results[j]))}")
    print()

Фильтрующий метод
	Метод-обертка: 21
	Встроенный метод: 20
	RFE: 20
	SelectFromModel: 11
	SelectKBest: 30

Метод-обертка
	Фильтрующий метод: 21
	Встроенный метод: 14
	RFE: 15
	SelectFromModel: 7
	SelectKBest: 21

Встроенный метод
	Фильтрующий метод: 20
	Метод-обертка: 14
	RFE: 28
	SelectFromModel: 11
	SelectKBest: 20

RFE
	Фильтрующий метод: 20
	Метод-обертка: 15
	Встроенный метод: 28
	SelectFromModel: 11
	SelectKBest: 20

SelectFromModel
	Фильтрующий метод: 11
	Метод-обертка: 7
	Встроенный метод: 11
	RFE: 11
	SelectKBest: 11

SelectKBest
	Фильтрующий метод: 30
	Метод-обертка: 21
	Встроенный метод: 20
	RFE: 20
	SelectFromModel: 11



In [102]:
def calculate_accuracy(clf, res):
    clf.fit(pd.DataFrame(X_train, columns=res).values, Y_train)
    return accuracy_score(Y_test, clf.predict(pd.DataFrame(X_test, columns=res).values))

In [103]:
for i in range(6):
    print(f"{names[i]}: {calculate_accuracy(KNeighborsClassifier(n_neighbors=5), results[i])}")

Фильтрующий метод: 0.9390134529147982
Метод-обертка: 0.9452914798206278
Встроенный метод: 0.9336322869955157
RFE: 0.9345291479820628
SelectFromModel: 0.9354260089686098
SelectKBest: 0.9390134529147982


In [104]:
for i in range(6):
    print(f"{names[i]}: {calculate_accuracy(RandomForestClassifier(n_estimators=10, random_state=42), results[i])}")

Фильтрующий метод: 0.9372197309417041
Метод-обертка: 0.9390134529147982
Встроенный метод: 0.9354260089686098
RFE: 0.9345291479820628
SelectFromModel: 0.9345291479820628
SelectKBest: 0.9372197309417041


In [106]:
for i in range(6):
    print(f"{names[i]}: {calculate_accuracy(SVC(), results[i])}")

Фильтрующий метод: 0.9399103139013453
Метод-обертка: 0.9381165919282511
Встроенный метод: 0.9354260089686098
RFE: 0.9363228699551569
SelectFromModel: 0.9363228699551569
SelectKBest: 0.9399103139013453
