In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
 # It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = (os.path.join(dirname, entry) for entry in filenames)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

This article covers two fundamental feature selection techniques,
Filter Methods and Wrapper Methods, and shows how EDA can guide feature selection decisions.
*Please visit [Feature Selection and EDA](https://towardsdatascience.com/feature-selection-and-eda-in-python-c6c4eb1058a3?source=post_stats_page-------------------------------------) for a detailed code walkthrough.*

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from pandas.api.types import is_string_dtype, is_numeric_dtype

In [None]:
# Load the dataset and discard the client id plus the two
# Naive_Bayes_Classifier_* columns shipped in the CSV.
unused_columns = [
    "CLIENTNUM",
    "Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2",
    "Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1",
]
df = pd.read_csv("../input/credit-card-customers/BankChurners.csv").drop(columns = unused_columns)
df.head()

In [None]:
# Summary statistics for every column, numeric and non-numeric alike.
df.describe(include="all")

In [None]:
# Number of missing entries in each column.
df.isna().sum()

# EDA
* univariate analysis
* correlation analysis
* bivariate analysis

In [None]:
# Split the column names by dtype. A column is tested for numeric first and is
# only considered categorical when it is non-numeric and string-typed.
num_list = [name for name in df if is_numeric_dtype(df[name])]
cat_list = [
    name for name in df
    if not is_numeric_dtype(df[name]) and is_string_dtype(df[name])
]

print("numeric:", num_list)
print("categorical:", cat_list)

In [None]:
# Univariate analysis: one figure per column, titled with the column name.
for column, values in df.items():
    plt.figure(column, figsize=(6, 6))
    plt.title(column)
    if is_numeric_dtype(values):
        # numeric column -> histogram
        values.plot(kind='hist')
    elif is_string_dtype(values):
        # string column -> bar chart of its 10 most frequent values
        top_counts = values.value_counts().head(10)
        top_counts.plot(kind='bar')

In [None]:
# correlation analysis
# Restrict to numeric columns explicitly: df still holds string columns at this
# point, and DataFrame.corr() raises on them in pandas >= 2.0 instead of
# silently skipping them.
plt.figure(figsize = (20,20))
correlation = df.select_dtypes(include = "number").corr()
sns.heatmap(correlation, cmap = "GnBu", annot = True)

In [None]:
# Columns removed after the correlation analysis (presumably redundant with
# other features - confirm against the heatmap).
dropped_columns = ["Avg_Open_To_Buy", "Total_Trans_Ct", "Customer_Age"]
# Only drop what is still present so the cell can be re-run without a KeyError.
df = df.drop(columns=[name for name in dropped_columns if name in df.columns])
# Keep num_list in sync with df: the box-plot cell iterates over num_list and
# would otherwise ask seaborn for columns that no longer exist.
num_list = [name for name in num_list if name not in dropped_columns]

In [None]:
# grouped bar chart: counts of each categorical feature, split by Attrition_Flag
hue_column = "Attrition_Flag"
for primary_cat in cat_list:
    # Skip the hue column itself (plotting it against itself carries no
    # information) and any column that is no longer part of df.
    if primary_cat == hue_column or primary_cat not in df.columns:
        continue
    plt.figure(figsize = (8,8))
    sns.countplot(
        data = df,
        x = primary_cat,
        hue = hue_column,
        palette = 'GnBu',
    )

In [None]:
# box plot: distribution of each numeric feature per Attrition_Flag value
cat = "Attrition_Flag"
for num in num_list:
    # num_list was built before some columns were dropped from df; plotting a
    # name that is no longer a column makes seaborn raise, so skip those.
    if num not in df.columns:
        continue
    plt.figure(figsize = (5,5))
    sns.boxplot(x = cat, y = num, data = df, palette = "GnBu")

# Filter Methods
* **chi square, anova and mutual information**
* how does the accuracy change with the chosen score function and the number of variables?
* how does each score function rate each feature?

In [None]:
# encode columns
from sklearn.preprocessing import LabelEncoder

# Replace every categorical column with the integer codes produced by a
# fresh LabelEncoder fitted on that column.
for cat_column in cat_list:
    encoder = LabelEncoder()
    df[cat_column] = encoder.fit_transform(df[cat_column])

In [None]:
# feature scaling
from mlxtend.preprocessing import minmax_scaling
# Min-max scale every column of df (the full column list is passed explicitly).
all_columns = df.columns.values
df_scaled = minmax_scaling(df, columns = all_columns)

In [None]:
from sklearn.model_selection import train_test_split
# Separate the target from the features by name rather than by position, so the
# split stays correct even if the column order of df changes.
target_column = "Attrition_Flag"
y = df_scaled[target_column]
X = df_scaled.drop(columns = [target_column])
# Fixed random_state keeps the 67/33 split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state = 42)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
print(X.columns.values)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

# Baseline: logistic regression trained on every feature and scored on the
# held-out split.
reg = LogisticRegression().fit(X_train, y_train)
y_pred = reg.predict(X_test)

print(X_train.shape)

baseline_accuracy = metrics.accuracy_score(y_test, y_pred)
print(baseline_accuracy)

In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import mutual_info_classif
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

# define feature selection - filter methods function
def feature_selection(variable_counts,score_function):
    """Return the test accuracy of a logistic regression trained on the k best features.

    Parameters
    ----------
    variable_counts : int
        Number of features to keep; passed to SelectKBest as ``k``.
    score_function : callable
        Scoring function handed to SelectKBest (the notebook uses chi2,
        f_classif and mutual_info_classif).

    Returns
    -------
    float
        Accuracy on (X_test, y_test) of a LogisticRegression fitted on the
        selected columns of X_train.

    Notes
    -----
    Reads the notebook-level X_train, X_test, y_train and y_test.
    """
    selection_model = SelectKBest(score_func=score_function, k=variable_counts)

    # Fit the selector exactly once and reuse it for both splits. Fitting it a
    # second time can pick a different column set when the score function is
    # stochastic (mutual_info_classif), which would make the train and test
    # matrices disagree.
    X_train_selected = selection_model.fit_transform(X_train, y_train)
    X_test_selected = selection_model.transform(X_test)

    # calculate the accuracy of prediction based on selected features
    reg = LogisticRegression()
    reg.fit(X_train_selected, y_train)
    y_pred = reg.predict(X_test_selected)

    return metrics.accuracy_score(y_test, y_pred)

In [None]:
# create accuracy chart
function_list = [chi2, f_classif,mutual_info_classif]
function_name = ["chi square", "anova", "mutual information"]

# Collect one record per (score function, feature count) and build the frame
# once: DataFrame.append was removed in pandas 2.0, and growing a frame row by
# row is quadratic anyway.
accuracy_records = []
for func, func_name in zip(function_list, function_name):
    for i in range(1, len(df.columns) - 1):
        accuracy = feature_selection(i, func)
        accuracy_records.append(
            {"features count": i, "accuracy": round(accuracy, 3), "score function": func_name}
        )

accuracy_df = pd.DataFrame(accuracy_records, columns = ["features count", "accuracy", "score function"])

print(accuracy_df)
plt.figure(figsize = (10, 10))
sns.lineplot(data = accuracy_df, x = 'features count', y = 'accuracy', hue = 'score function', palette = "GnBu")

In [None]:
import pandas as pd
# Show how each score function rates every feature: a sorted table plus a bar chart.
for function, name in zip(function_list, function_name):
    selection_model = SelectKBest(score_func = function, k = 8).fit(X_train, y_train)
    feature_score = pd.DataFrame({"features": X_train.columns.values, "scores": selection_model.scores_})
    print(feature_score.sort_values(by = ['scores'], ascending = False))
    # Call .plot() directly; passing its return value to print() would only
    # emit the Axes repr as text.
    feature_score.plot(x = "features", kind = 'bar', title = name)

# Wrapper Method
* **forward selection and backward elimination**
* how does the accuracy change by the number of variables?

In [None]:
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

# forward selection
# Accumulate plain dict records and build accuracy_df once after the loop;
# DataFrame.append was removed in pandas 2.0.
forward_records = []
for i in range(1, len(df.columns) - 1):
    sfs = SFS(LogisticRegression(),
              k_features = i,
              forward = True,
              scoring = 'accuracy')

    X_train_selected = sfs.fit_transform(X_train, y_train)
    reg = LogisticRegression()
    reg.fit(X_train_selected, y_train)
    # Score on the same columns the selector kept for training.
    y_pred = reg.predict(X_test[list(sfs.k_feature_names_)])
    sfs_accuracy = metrics.accuracy_score(y_test, y_pred)

    forward_records.append(
        {"features count": i, "accuracy": round(sfs_accuracy, 3), "score function": "forward selection"}
    )

accuracy_df = pd.DataFrame(forward_records, columns = ["features count", "accuracy", "score function"])

In [None]:
# backward elimination
# Accumulate plain dict records and add them to accuracy_df in one step;
# DataFrame.append was removed in pandas 2.0.
backward_records = []
for i in range(1, len(df.columns) - 1):
    sbs = SFS(LogisticRegression(),
              k_features = i,
              forward = False,
              scoring = 'accuracy')

    X_train_selected = sbs.fit_transform(X_train, y_train)
    reg = LogisticRegression()
    reg.fit(X_train_selected, y_train)
    # Score on the same columns the selector kept for training.
    y_pred = reg.predict(X_test[list(sbs.k_feature_names_)])
    sbs_accuracy = metrics.accuracy_score(y_test, y_pred)

    backward_records.append(
        {"features count": i, "accuracy": round(sbs_accuracy, 3), "score function": "backward elimination"}
    )

# Extend the forward-selection results already held in accuracy_df.
accuracy_df = pd.concat([accuracy_df, pd.DataFrame(backward_records)], ignore_index = True)

print(accuracy_df)
plt.figure(figsize = (10, 10))
sns.lineplot(data = accuracy_df, x = 'features count', y = 'accuracy', hue = 'score function', palette = "GnBu")