## Categorical Feature Importance - Chi-Square Test


In [None]:
#Chi Square Test for Categorical features - feature Selection
import pandas as pd
import seaborn as sns
from scipy.stats import chi2_contingency

def chi_square_feature_scores(features, target):
    """Run a chi-square test of independence for every column of *features*.

    Each column is cross-tabulated against *target* and the resulting
    contingency table is passed to ``chi2_contingency`` on its own, so every
    feature gets its own statistic and p-value.

    Args:
        features: DataFrame of discrete (here: one-hot encoded) columns.
        target: Class labels. When it is a pandas Series, ``pd.crosstab``
            aligns it with each feature column on their shared index labels.

    Returns:
        DataFrame with the columns ``feature``, ``chi2`` and ``p-value``, one
        row per column of *features*. Columns whose contingency table is
        degenerate (fewer than two rows or two columns) get NaN scores.
    """
    rows = []
    for name, column in features.items():
        table = pd.crosstab(column, target)
        if table.shape[0] < 2 or table.shape[1] < 2:
            # A constant feature (or single-class target) cannot be tested.
            rows.append((name, float('nan'), float('nan')))
            continue
        # chi2_contingency returns (statistic, p-value, dof, expected).
        statistic, p_value, _, _ = chi2_contingency(table)
        rows.append((name, statistic, p_value))
    return pd.DataFrame(rows, columns=['feature', 'chi2', 'p-value'])


cat_data = df.select_dtypes(include=['object'])
# Separate your features (X) and target variable (y)
X = cat_data
# NOTE(review): this cell never defined a target. `y_train` is the target name
# used by the later cells; it is assumed to be a pandas Series whose index
# labels come from `df` -- confirm, or replace with the correct target.
y = y_train

# Convert your categorical features into numeric using one-hot encoding
X_encoded = pd.get_dummies(X)

# Compute the chi-square statistic and p-value of each encoded feature
# against the target (one contingency table per feature).
results = chi_square_feature_scores(X_encoded, y)

# Sort the DataFrame by the p-values in ascending order
results = results.sort_values('p-value')

# Select the k features with the lowest p-values (strongest evidence of
# dependence on the target); NaN scores sort last.
k = 30
selected_features = results[:k]['feature']

# Print the selected features
print(selected_features)

## Categorical & Numerical Feature Importance - ANOVA Test

In [None]:
from sklearn.feature_selection import f_classif, f_regression, SelectKBest

# Number of best features to keep and report; capped at the number of
# available columns because SelectKBest rejects k > n_features.
top_k = min(15, X_train.shape[1])

# ANOVA F-test: f_classif scores each feature against a class-label target.
# NOTE(review): y_train is treated as class labels because the same target is
# fed to XGBClassifier further down; f_regression (used here before) is a
# linear-regression F-test for continuous targets, not ANOVA.
fs = SelectKBest(score_func=f_classif, k=top_k)
# Applying feature selection
fit = fs.fit(X_train, y_train)

# Pair every input column with its F-score
features_score = pd.DataFrame(fit.scores_)
features = pd.DataFrame(X_train.columns)
feature_score = pd.concat([features, features_score], axis=1)
# Assigning column names
feature_score.columns = ["Input_Features", "F_Score"]
print(feature_score.nlargest(top_k, columns="F_Score"))


## Feature Importance using XGBoost

In [None]:
#xgb with all features on validation
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import xgboost as xgb

# Fit a default XGBoost classifier on all training features
xgb_clf = xgb.XGBClassifier()
xgb_clf.fit(X_train, y_train)

importances = xgb_clf.feature_importances_
# feature_importances_ is ordered like X_train.columns, so key it by column
# name instead of relying on list positions.
importance_by_column = pd.Series(importances, index=X_train.columns)

# Split columns by dtype while keeping their original order; numeric and
# non-numeric columns may be interleaved in X_train.
numeric_columns = set(X_train.select_dtypes(include=np.number).columns)
num_features = [col for col in X_train.columns if col in numeric_columns]
cat_features = [col for col in X_train.columns if col not in numeric_columns]

feature_importances = dict()
for col in num_features:
    feature_importances[col] = importance_by_column[col]
for col in cat_features:
    # If other columns are named "<col>_<level>", sum up their importances
    one_hot_mask = X_train.columns.str.startswith(col + '_')
    if one_hot_mask.any():
        feature_importances[col] = sum(importances[one_hot_mask])
    else:
        # No such columns: the column itself is what the model saw, so report
        # its own importance rather than an empty sum of 0.
        feature_importances[col] = importance_by_column[col]

# Sort feature importances in descending order
feature_importances = dict(sorted(feature_importances.items(), key=lambda x: x[1], reverse=True))

# Print feature importances
for col, importance in feature_importances.items():
    print(f"{col}: {importance}")

# Create a DataFrame with the feature importances
df_importance = pd.DataFrame(list(feature_importances.items()), columns=['Feature', 'Importance'])

# Display the DataFrame
display(df_importance)

# Create a horizontal bar chart
plt.barh(range(len(feature_importances)), list(feature_importances.values()), align='center')
plt.yticks(range(len(feature_importances)), list(feature_importances.keys()))
plt.xlabel('Importance')
plt.title('Feature Importance')

# Display the plot
plt.show()

#y_pred = clf.predict(X_val)