# For pre-processing functions

# Import Required Libraries

In [1]:
#import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import seaborn as sns
from scipy import stats

# To map categorical variables to numbers

In [2]:
# Load the horse survival data set from the Kaggle input directory.
data_horse_survival = pd.read_csv("/kaggle/input/horse-survival-dataset/horse.csv")

# Show the raw string labels, replace them with integer codes, then show the
# encoded column. Series.map turns any label missing from the mapping into NaN.
print(data_horse_survival["outcome"])
data_horse_survival["outcome"] = data_horse_survival["outcome"].map(
    dict(died=0, euthanized=1, lived=2)
)
print(data_horse_survival["outcome"])

0            died
1      euthanized
2           lived
3            died
4            died
          ...    
294    euthanized
295    euthanized
296          died
297         lived
298    euthanized
Name: outcome, Length: 299, dtype: object
0      0
1      1
2      2
3      0
4      0
      ..
294    1
295    1
296    0
297    2
298    1
Name: outcome, Length: 299, dtype: int64


# Pipeline for encoding and imputation

In [3]:
#Write custom class to detect Cluster Similarity
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import rbf_kernel

class ClusterSimilarity(BaseEstimator, TransformerMixin):
    """Transformer producing RBF-kernel similarities to KMeans cluster centers.

    ``fit`` runs KMeans on the input; ``transform`` returns, for every sample,
    its ``rbf_kernel`` similarity to each fitted cluster center.

    Parameters
    ----------
    n_clusters : int, default=10
        Forwarded to ``KMeans`` as its first positional argument.
    n_init : int, default=10
        Forwarded to ``KMeans`` as ``n_init``.
    gamma : float, default=1.0
        Forwarded to ``rbf_kernel`` as ``gamma``.
    random_state : default=None
        Forwarded to ``KMeans`` as ``random_state``.
    """

    def __init__(self, n_clusters=10, n_init=10,  gamma=1.0, random_state=None):
        # Only store the hyper-parameters; the estimator is built in fit().
        self.n_clusters = n_clusters
        self.n_init = n_init
        self.gamma = gamma
        self.random_state = random_state

    def fit(self, X, y=None, sample_weight=None):
        """Fit KMeans on ``X`` (``y`` is unused) and return ``self``."""
        self.kmeans_ = KMeans(
            self.n_clusters,
            n_init=self.n_init,
            random_state=self.random_state,
        )
        self.kmeans_.fit(X, sample_weight=sample_weight)
        return self

    def transform(self, X):
        """Return the RBF similarity of each row of ``X`` to every cluster center."""
        centers = self.kmeans_.cluster_centers_
        return rbf_kernel(X, centers, gamma=self.gamma)

    def get_feature_names_out(self, names=None):
        """Return one output name per cluster; ``names`` is ignored."""
        labels = []
        for idx in range(self.n_clusters):
            labels.append(f"Cluster {idx} similarity")
        return labels
    
    

    
#custom functions for ratio pipeline
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, MinMaxScaler, StandardScaler, FunctionTransformer
from sklearn.compose import make_column_selector

def column_ratio(X):
    """Divide column 0 of ``X`` by column 1, keeping a 2-D single-column result.

    A constant of 1e-16 is added to the denominator before dividing, so a
    denominator of exactly zero yields a very large value instead of a
    division by zero.
    """
    numerator = X[:, [0]]
    denominator = X[:, [1]] + 1e-16
    return numerator / denominator

def ratio_name(function_transformer, feature_names_in):
    """Return the output feature names for the ratio step.

    Both arguments are ignored; the result is always the single name "ratio".
    """
    feature_names_out = ["ratio"]
    return feature_names_out

def ratio_pipeline():
    """Build a new pipeline: median imputation, column ratio, standard scaling.

    A fresh pipeline object is returned on every call, so each caller gets
    its own instance.
    """
    imputer = SimpleImputer(strategy="median")
    ratio_step = FunctionTransformer(column_ratio, feature_names_out=ratio_name)
    scaler = StandardScaler()
    return make_pipeline(imputer, ratio_step, scaler)

# Log pipeline: median-impute, apply the natural log, then standardize.
log_pipeline = make_pipeline(
    SimpleImputer(strategy="median"),
    FunctionTransformer(func=np.log, feature_names_out="one-to-one"),
    StandardScaler(),
)

# Cluster similarity: RBF similarity of each sample to 10 KMeans centers.
cluster_simil = ClusterSimilarity(n_clusters=10, n_init=10, gamma=1.0, random_state=42)

# Default numeric pipeline: median-impute, then standardize.
default_num_pipeline = make_pipeline(SimpleImputer(strategy="median"), StandardScaler())

# Categorical pipeline: fill with the most frequent value, then one-hot encode
# (categories unseen during fit are ignored at transform time).
cat_pipeline = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(handle_unknown="ignore"),
)

# Full preprocessing: three ratio features, log of Age, cluster similarity on
# Fare, one-hot encoding of every object-dtype column, and the default numeric
# pipeline for every column not listed explicitly.
# NOTE(review): the "geo" label is kept as-is although it is fed the "Fare" column.
# NOTE(review): one-hot encoding every object column can yield a very wide
# matrix when some columns are near-unique -- confirm this is intended.
preprocessing_complex = ColumnTransformer(
    transformers=[
        ("FareByPclass", ratio_pipeline(), ["Fare", "Pclass"]),
        ("AgeBySibSp", ratio_pipeline(), ["Age", "SibSp"]),
        ("AgeByParch", ratio_pipeline(), ["Age", "Parch"]),
        ("log", log_pipeline, ["Age"]),
        ("geo", cluster_simil, ["Fare"]),
        ("cat", cat_pipeline, make_column_selector(dtype_include=object)),
    ],
    remainder=default_num_pipeline,
)

In [4]:
# Load the Titanic data set from the Kaggle input directory into a DataFrame.
titanic_data = pd.read_csv("/kaggle/input/titanic-dataset/Titanic-Dataset.csv")

In [5]:
# Note: adjust the column names used in preprocessing_complex to match your dataset.
# NOTE(review): the whole frame is passed in, and the recorded feature names
# include 'remainder__Survived', so the target column ends up among the
# features -- confirm whether it should be dropped before fitting.

# Fit the preprocessing on the data and transform it in one step.
train_inputs_processed = preprocessing_complex.fit_transform(titanic_data)

# Transform only, reusing the already-fitted preprocessing (use this form for
# new data; here it is applied to the same frame and overwrites the result above).
train_inputs_processed = preprocessing_complex.transform(titanic_data)

# Inspect the result: shape, generated feature names, and the dense values.
print(train_inputs_processed.shape)
print(preprocessing_complex.get_feature_names_out())
# .toarray() assumes the transformed result is a sparse matrix -- TODO confirm
# this holds if the set of columns/transformers changes.
print(train_inputs_processed.toarray())
#train_inputs_array = train_inputs_processed.toarray()

(891, 1740)
['FareByPclass__ratio' 'AgeBySibSp__ratio' 'AgeByParch__ratio' ...
 'cat__Embarked_S' 'remainder__PassengerId' 'remainder__Survived']
[[-0.44184663 -1.200832   -0.10613411 ...  1.         -1.73010796
  -0.78927234]
 [ 0.90276285 -1.200832    0.86439053 ...  0.         -1.72622007
   1.2669898 ]
 [-0.43745355  0.2928375   0.13649705 ...  1.         -1.72233219
   1.2669898 ]
 ...
 [-0.33641254 -1.200832   -1.44060549 ...  1.          1.72233219
  -0.78927234]
 [ 0.09671333  0.2928375   0.13649705 ...  0.          1.72622007
   1.2669898 ]
 [-0.4385925   0.63753046  0.50044379 ...  0.          1.73010796
  -0.78927234]]
