
# Santander Customer Satisfaction Dataset Preprocessing



In [None]:
# Data collection: read the Santander training CSV and preview its first rows.
import pandas as pd

df = pd.read_csv(filepath_or_buffer="Santander Customer Satisfaction_train.csv")
print(df.head())  # head() defaults to the first 5 rows


   ID  var3  var15  imp_ent_var16_ult1  imp_op_var39_comer_ult1  \
0   1     2     23                 0.0                      0.0   
1   3     2     34                 0.0                      0.0   
2   4     2     23                 0.0                      0.0   
3   8     2     37                 0.0                    195.0   
4  10     2     39                 0.0                      0.0   

   imp_op_var39_comer_ult3  imp_op_var40_comer_ult1  imp_op_var40_comer_ult3  \
0                      0.0                      0.0                      0.0   
1                      0.0                      0.0                      0.0   
2                      0.0                      0.0                      0.0   
3                    195.0                      0.0                      0.0   
4                      0.0                      0.0                      0.0   

   imp_op_var40_efect_ult1  imp_op_var40_efect_ult3  ...  \
0                      0.0                      0.0  ...

In [None]:
# Handling null values
# Forward-fill (ffill) would be an option, but it may bias the predictions,
# so categorical and numerical columns are handled separately instead.

# Show which columns belong to each dtype.
print("Datatypes group by:")
columns_by_dtype = df.columns.to_series().groupby(df.dtypes).apply(list)
print(columns_by_dtype)

# Split off the target before encoding so get_dummies never sees it.
y = df["TARGET"]
X = df.drop(columns=["TARGET"])

Datatypes group by:
int64      [ID, var3, var15, ind_var1_0, ind_var1, ind_va...
float64    [imp_ent_var16_ult1, imp_op_var39_comer_ult1, ...
dtype: object


In [None]:
# Partition the feature columns by dtype: numeric vs. object/category.
num_fea = list(X.select_dtypes(include="number").columns)
cat_fea = list(X.select_dtypes(include=["object", "category"]).columns)


In [None]:
# Impute missing values: per-column median for the numeric features,
# the literal "Unknown" for the categorical ones.
# NOTE(review): the medians are computed on the full frame, i.e. before the
# train/test split performed later in the notebook.
numeric_medians = X[num_fea].median()
X[num_fea] = X[num_fea].fillna(numeric_medians)
X[cat_fea] = X[cat_fea].fillna(value="Unknown")

In [None]:
# One-hot encode the categorical columns, dropping the first level of each.
X = pd.get_dummies(X, columns=cat_fea, drop_first=True)

# Remove the "ID" column when it exists; errors="ignore" makes this a no-op otherwise.
X = X.drop(columns=["ID"], errors="ignore")

In [None]:
# Data splitting: hold out 30% of the rows as a test set.
from sklearn.model_selection import train_test_split

split_options = dict(
    test_size=0.3,
    random_state=42,  # arbitrary but fixed seed, so the split is repeatable
    stratify=y,       # keep the class distribution equal in train and test
)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Feature Selection

## Filter Methods for Feature Selection

### Basic Filter Methods
1. Constant Features Removal  
2. Quasi-Constant Features Removal  
3. Duplicate Features Removal  

### Statistical Filter Methods
1. Pearson Correlation  
2. Spearman Rank Correlation  
3. ANOVA F-test  
4. Mutual Information (regression)  
5. Chi-square Test  
6. Mutual Information (classification)  
7. Cramér’s V

In [None]:
#Basic filter methods
# Every filter is fitted on X_train only; the columns it flags are then removed
# from both X_train and X_test so the two frames keep identical columns.
# drop() results are re-bound to the same names instead of mutating the
# train/test split outputs in place.

#1.Removing Constant features
# A standard deviation of exactly 0 means the column holds a single value.
const = [feature for feature in X_train.columns if X_train[feature].std() == 0]
print("Number of constant features:", len(const))
X_train = X_train.drop(columns=const)
X_test = X_test.drop(columns=const)

#2.Removing quasi constant features
# A column is quasi-constant when its most frequent value covers more than
# this share of the training rows.
QUASI_CONSTANT_THRESHOLD = 0.999
quasi_constant = []
for feature in X_train.columns:
    # max() of the value counts is the count of the dominant value; unlike
    # indexing the first element it does not raise on an all-NaN column.
    predominant = X_train[feature].value_counts().max() / len(X_train)
    if predominant > QUASI_CONSTANT_THRESHOLD:
        quasi_constant.append(feature)
print("Number of quasi constant features:", len(quasi_constant))
X_train = X_train.drop(columns=quasi_constant)
X_test = X_test.drop(columns=quasi_constant)  # Apply same removal to X_test

#3.Duplicated features
# Keep the first column of every group of identical columns and flag the rest.
# Each column is flagged at most once, so the printed count is the number of
# columns actually dropped (a group of three identical columns used to be
# counted as three duplicates instead of two).
duplicates = []
already_flagged = set()
all_columns = X_train.columns
for i, col1 in enumerate(all_columns):
    if col1 in already_flagged:
        # Anything equal to col1 is also equal to the earlier column that
        # col1 duplicates, so it has been flagged already.
        continue
    for col2 in all_columns[i + 1:]:
        if col2 in already_flagged:
            continue
        # .equals() returns a single bool; == would compare element-wise.
        if X_train[col1].equals(X_train[col2]):
            duplicates.append(col2)
            already_flagged.add(col2)
print("Number of duplicate features:", len(duplicates))
X_train = X_train.drop(columns=duplicates)
X_test = X_test.drop(columns=duplicates)  # Apply same removal to X_test


Number of constant features: 57
Number of quasi constant features: 89
Number of duplicate features: 10


In [None]:
#Statistical Filter Methods
#Even though everything is already one-hot encoded, the appropriate statistical
#test still depends on the original feature type.
#The target (TARGET) is categorical, so this is a classification problem.
from functools import partial

from sklearn.feature_selection import SelectKBest, chi2, f_classif, mutual_info_classif

ALPHA = 0.05    # significance level applied to the ANOVA and chi2 p-values
N_BEST_MI = 10  # number of features kept by the mutual-information ranking

#Columns whose values all lie in {0, 1} are treated as categorical, the rest as
#numerical. The check is computed once per column and reused for both lists.
is_binary = {col: set(X_train[col].unique()).issubset({0, 1}) for col in X_train.columns}
num_features = [col for col in X_train.columns if not is_binary[col]]
cat_features = [col for col in X_train.columns if is_binary[col]]

#Input numerical and output categorical --> Anova or Kendalls
#1. Anova
f_values, p_values = f_classif(X_train[num_features], y_train)
anova_df = pd.DataFrame({
    "Numerical Features": num_features,
    "F_values": f_values,
    "P_values": p_values,
}).sort_values(by="P_values")
significant_numeric_features = anova_df.loc[
    anova_df["P_values"] < ALPHA, "Numerical Features"
].tolist()
print("Selected Numerical Features:", significant_numeric_features)

#Input categorical and output categorical --> Chi2 or Mutual Info
#2.chi2
chi2_values, p_values = chi2(X_train[cat_features], y_train)
chi2_df = pd.DataFrame({
    "Categorical Features": cat_features,
    "Chi2_values": chi2_values,
    "p_values": p_values,
}).sort_values(by="p_values")
significant_chi2_features = chi2_df.loc[
    chi2_df["p_values"] < ALPHA, "Categorical Features"
].tolist()
print("Selected Features from Chi2 test:", significant_chi2_features)

#3. Mutual info
#Instead of thresholding the scores manually, SelectKBest keeps the k best ones.
#The inputs here are the 0/1 columns, so they are declared discrete (on dense
#input the default would treat them as continuous), and random_state is fixed
#so repeated runs give the same scores. k is capped at the number of available
#columns so the selector never asks for more features than exist.
mi_score = partial(mutual_info_classif, discrete_features=True, random_state=42)
selector = SelectKBest(score_func=mi_score, k=min(N_BEST_MI, len(cat_features)))
#Fit on the training data only
selector.fit(X_train[cat_features], y_train)
#get_support() is a boolean mask over the fitted columns; use it to recover names
significant_mi_features = X_train[cat_features].columns[selector.get_support()].tolist()
print("Selected Features from MI test:", significant_mi_features)

#Union of the numerical (ANOVA) and categorical (chi2 + MI) selections.
#The set only removes repeats; the final list follows the X_train column order
#so the result is identical on every run (iterating a set of strings is not).
selected = (
    set(significant_numeric_features)
    | set(significant_chi2_features)
    | set(significant_mi_features)
)
final_selected_features = [col for col in X_train.columns if col in selected]
#Filter
X_train = X_train[final_selected_features]
#Give X_test exactly the columns of X_train, in the same order. reindex also
#creates any column that X_test lacks and fills it with 0, which guards
#against a mismatch between the two frames at prediction time.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

Selected Numerical Features: ['num_meses_var5_ult3', 'num_var30', 'num_var5', 'num_var42', 'var15', 'var36', 'num_var4', 'num_var35', 'num_var8_0', 'num_var13', 'num_var13_0', 'saldo_var30', 'num_meses_var13_corto_ult3', 'num_var13_corto', 'num_var13_corto_0', 'num_var12', 'imp_op_var39_efect_ult1', 'num_var5_0', 'imp_op_var41_efect_ult1', 'num_var8', 'num_var24', 'num_var30_0', 'num_meses_var12_ult3', 'num_var22_ult1', 'saldo_var13', 'num_var24_0', 'num_meses_var8_ult3', 'imp_op_var39_efect_ult3', 'num_var41_0', 'imp_op_var41_ult1', 'imp_op_var39_ult1', 'imp_op_var41_efect_ult3', 'num_aport_var13_hace3', 'num_var39_0', 'saldo_var13_corto', 'saldo_medio_var13_corto_ult1', 'num_op_var39_efect_ult1', 'saldo_medio_var13_corto_ult3', 'num_op_var41_efect_ult1', 'saldo_var42', 'num_var26_0', 'num_var25_0', 'saldo_medio_var13_corto_hace2', 'imp_aport_var13_hace3', 'num_op_var39_efect_ult3', 'saldo_var12', 'saldo_var24', 'num_var12_0', 'num_op_var41_efect_ult3', 'saldo_medio_var12_ult3', 'sald

In [None]:
# Columns that remain in X_train after preprocessing and feature selection
print(X_train.columns)

Index(['saldo_medio_var13_corto_ult1', 'num_var45_ult1',
       'saldo_medio_var5_hace2', 'num_var35', 'num_var24_0', 'saldo_var42',
       'imp_op_var41_efect_ult3', 'ind_var8_0', 'num_op_var39_ult3',
       'imp_aport_var13_ult1',
       ...
       'num_var8', 'num_var43_recib_ult1', 'num_var1', 'ind_var41_0',
       'num_var5_0', 'ind_var24', 'var38', 'ind_var14_0', 'num_op_var39_ult1',
       'num_op_var39_efect_ult3'],
      dtype='object', length=121)
