In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import chi2, mutual_info_classif, f_classif

# Toy dataset: one categorical, two numeric, one datetime and one free-text
# column, plus a binary target.
df = pd.DataFrame(
    {
        "category": ["A", "B", "A", "C", "B"],
        "age": [25, 32, 47, 51, 23],
        "salary": [30000, 70000, 90000, 110000, 28000],
        "date": pd.to_datetime(
            ["2020-03-01", "2021-05-02", "2018-07-15", "2019-11-30", "2022-01-01"]
        ),
        "review": ["good product", "bad quality", "excellent", "not good", "worth it"],
        "target": [1, 0, 1, 0, 1],
    }
)

# Separate the label from the predictors.
y = df["target"]

# Build every engineered column in a single chained expression. Keyword order
# determines the order in which the new columns are appended.
X = df.drop(columns="target").assign(
    # Numeric transforms.
    salary_div_1000=lambda frame: frame["salary"] / 1000,
    age_plus_salary=lambda frame: frame["age"] + frame["salary"],
    age_group=lambda frame: (frame["age"] > 30).astype(int),
    # Integer code per distinct category label.
    category_le=lambda frame: LabelEncoder().fit_transform(frame["category"]),
    # Calendar parts of the datetime column.
    year=lambda frame: frame["date"].dt.year,
    month=lambda frame: frame["date"].dt.month,
)

# TF-IDF over the review text, limited to a 3-term vocabulary.
tfidf = TfidfVectorizer(max_features=3)
text_feats = tfidf.fit_transform(X["review"]).toarray()
tfidf_df = pd.DataFrame(text_feats, columns=tfidf.get_feature_names_out())

# Append the text features column-wise; both frames share a fresh 0..n-1 index.
X = pd.concat([X.reset_index(drop=True), tfidf_df], axis="columns")

print("\nFinal Feature DataFrame:", X.head(), sep="\n")

# Discard the raw inputs and the intermediate features that are not passed on,
# leaving age, salary, age_group, category_le and the TF-IDF term columns.
X = X.drop(
    columns=["review", "date", "category", "year", "month", "salary_div_1000", "age_plus_salary"]
)



Final Feature DataFrame:
  category  age  salary       date        review  salary_div_1000  \
0        A   25   30000 2020-03-01  good product             30.0   
1        B   32   70000 2021-05-02   bad quality             70.0   
2        A   47   90000 2018-07-15     excellent             90.0   
3        C   51  110000 2019-11-30      not good            110.0   
4        B   23   28000 2022-01-01      worth it             28.0   

   age_plus_salary  age_group  category_le  year  month  bad  excellent  good  
0            30025          0            0  2020      3  0.0        0.0   1.0  
1            70032          1            1  2021      5  1.0        0.0   0.0  
2            90047          1            0  2018      7  0.0        1.0   0.0  
3           110051          1            2  2019     11  0.0        0.0   1.0  
4            28023          0            1  2022      1  0.0        0.0   0.0  


In [4]:
# Feature Selection
# Scores every column of X against the target y with four techniques and
# prints each result labelled by feature name.
# NOTE(review): the frame has only five rows, so treat all scores below as
# illustrative rather than statistically meaningful.

# Seed for mutual_info_classif, which adds random noise to continuous features;
# without it the scores differ on every run.
MI_RANDOM_STATE = 42

# Correlation
print("\nCorrelation Matrix:")
print(X.corr())

# Chi-Square
# NOTE(review): chi2 is designed for non-negative, count-like features. The
# statistic is scale-dependent, so the raw `salary` column dominates the
# scores; consider scaling or binning continuous columns before relying on it.
# The p-values get their own name so the F-test below does not overwrite them.
chi_scores, chi_p_values = chi2(X, y)
print("\nChi-Square Scores:")
print(pd.Series(chi_scores, index=X.columns))

# Mutual Info
mi_scores = mutual_info_classif(X, y, random_state=MI_RANDOM_STATE)
print("\nMutual Information Scores:")
print(pd.Series(mi_scores, index=X.columns))

# ANOVA / F-Test
# `p_values` keeps holding the F-test p-values, as it did at the end of this
# cell before.
f_scores, p_values = f_classif(X, y)
print("\nANOVA F-Scores:")
print(pd.Series(f_scores, index=X.columns))



Correlation Matrix:
                  age    salary  age_group  category_le       bad  excellent  \
age          1.000000  0.970397   0.829928     0.341914 -0.157725   0.499462   
salary       0.970397  1.000000   0.920726     0.457832  0.067783   0.375885   
age_group    0.829928  0.920726   1.000000     0.327327  0.408248   0.408248   
category_le  0.341914  0.457832   0.327327     1.000000  0.133631  -0.534522   
bad         -0.157725  0.067783   0.408248     0.133631  1.000000  -0.250000   
excellent    0.499462  0.375885   0.408248    -0.534522 -0.250000   1.000000   
good         0.171709  0.110688  -0.166667     0.218218 -0.408248  -0.408248   

                 good  
age          0.171709  
salary       0.110688  
age_group   -0.166667  
category_le  0.218218  
bad         -0.408248  
excellent   -0.408248  
good         1.000000  

Chi-Square Scores: [3.25936330e+00 3.02520325e+04 8.88888889e-01 2.04166667e+00
 1.50000000e+00 6.66666667e-01 8.33333333e-02]

Mutual Informatio