# Sales Feature Selection 

In [62]:
import pandas as pd
from sklearn.feature_selection import SelectKBest, SelectPercentile, SequentialFeatureSelector
from sklearn.feature_selection import f_classif, mutual_info_classif, chi2, f_regression, mutual_info_regression
from sklearn.neighbors import KNeighborsClassifier

## Read the input file

In [63]:
df = pd.read_csv("../data/sales.csv",delimiter=",")
display(df)

Unnamed: 0,division,level of education,training level,work experience,salary,sales
0,computer software,some college,1,5,92766,283647
1,printers,high school,3,7,101828,490163
2,printers,associate's degree,0,10,105433,396790
3,printers,some college,2,6,86490,404898
4,computer hardware,high school,1,7,90531,385136
...,...,...,...,...,...,...
995,computer software,associate's degree,2,10,125808,508916
996,office supplies,some college,1,7,99818,370399
997,office supplies,associate's degree,0,7,82815,232670
998,printers,some college,1,1,59829,204084


## Data Preparation for Regression and Classification 

In [64]:
# Feature Selection for regression problem
df_num = df.select_dtypes(include=['number'])
X=df_num.drop("sales", axis=1)
y=df_num["sales"]

In [65]:
# Feature Selection for classification problem
X_c=df_num
y_c=df["division"]

## Feature Selection based on feature importance

In [66]:
# only for regression
feature_selector = SelectKBest(f_regression, k=2).set_output(transform="pandas")
X_new = feature_selector.fit_transform(X, y)
selected_features = X_new.columns.tolist()
print("Selected features KBest:", selected_features)
feature_selector= SelectPercentile(chi2, percentile=75).set_output(transform="pandas")
X_new = feature_selector.fit_transform(X, y)   
selected_features = X_new.columns.tolist()
print("Selected features percentile chi2:", selected_features)
feature_selector = SelectKBest(mutual_info_regression, k=2).set_output(transform="pandas")
X_new = feature_selector.fit_transform(X, y)
selected_features = X_new.columns.tolist()
print("Selected features KBest info:", selected_features)

Selected features KBest: ['work experience', 'salary']
Selected features percentile chi2: ['work experience', 'salary']
Selected features KBest info: ['work experience', 'salary']


## Wrapper Methods

In [67]:
# Only for classification 
knn = KNeighborsClassifier(n_neighbors=3)

sfs = SequentialFeatureSelector(knn, n_features_to_select=3, direction='forward')
sfs.fit(X_c, y_c)

selected_features = X_c.columns[sfs.get_support()].tolist()
print("Selected features SFS forward:", selected_features)

sfs2 = SequentialFeatureSelector(estimator=KNeighborsClassifier(n_neighbors=3),
                          n_features_to_select=3, direction='backward')
sfs2.fit(X_c, y_c)
selected_features = X_c.columns[sfs2.get_support()].tolist()
print("Selected features SFS backward:", selected_features)

Selected features SFS forward: ['training level', 'work experience', 'sales']
Selected features SFS backward: ['work experience', 'salary', 'sales']
