In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt

In [2]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn import set_config

In [3]:
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, precision_score, f1_score, recall_score, confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.svm import SVC
from sklearn.ensemble import BaggingClassifier, StackingClassifier
from xgboost import XGBClassifier

In [4]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

In [5]:
data = pd.read_csv('../input/train.csv', index_col='ID')
data.sort_index(axis=0, inplace=True)

In [6]:
data.shape
data.head()

(8068, 10)

Unnamed: 0_level_0,Gender,Ever_Married,Age,Graduated,Profession,Work_Experience,Spending_Score,Family_Size,Var_1,Segmentation
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
458982,Male,Yes,61,Yes,Executive,1.0,High,3.0,Cat_6,C
458983,Female,Yes,63,Yes,Executive,0.0,High,5.0,Cat_6,C
458984,Male,Yes,39,Yes,Artist,0.0,Average,3.0,Cat_6,C
458985,Male,No,23,No,Healthcare,1.0,Low,4.0,Cat_6,D
458986,Male,No,18,No,Healthcare,7.0,Low,4.0,Cat_6,D


In [7]:
X = data.iloc[ : , :-1]
y = data.iloc[ :, -1]

In [8]:
y

ID
458982    C
458983    C
458984    C
458985    D
458986    D
         ..
467969    C
467971    D
467972    D
467973    A
467974    B
Name: Segmentation, Length: 8068, dtype: object

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0, stratify=y)

In [10]:
X_train.dtypes

Gender              object
Ever_Married        object
Age                  int64
Graduated           object
Profession          object
Work_Experience    float64
Spending_Score      object
Family_Size        float64
Var_1               object
dtype: object

In [11]:
X_train.isna().sum()
X_test.isna().sum()

Gender               0
Ever_Married       102
Age                  0
Graduated           58
Profession          94
Work_Experience    680
Spending_Score       0
Family_Size        264
Var_1               57
dtype: int64

Gender               0
Ever_Married        38
Age                  0
Graduated           20
Profession          30
Work_Experience    149
Spending_Score       0
Family_Size         71
Var_1               19
dtype: int64

In [12]:
mean_imputer = SimpleImputer(strategy='mean')
median_imputer = SimpleImputer(strategy='median')
mode_imputer = SimpleImputer(strategy='most_frequent')
unknown_imputer = SimpleImputer(strategy='constant', fill_value='Unknown')

In [13]:
std_scalar = StandardScaler()
onehot_encoder = OneHotEncoder(drop="first", handle_unknown='error', sparse=False)
label_encoder = LabelEncoder()

In [14]:
XGBClassifier().get_params()

{'objective': 'binary:logistic',
 'base_score': None,
 'booster': None,
 'colsample_bylevel': None,
 'colsample_bynode': None,
 'colsample_bytree': None,
 'gamma': None,
 'gpu_id': None,
 'importance_type': 'gain',
 'interaction_constraints': None,
 'learning_rate': None,
 'max_delta_step': None,
 'max_depth': None,
 'min_child_weight': None,
 'missing': nan,
 'monotone_constraints': None,
 'n_estimators': 100,
 'n_jobs': None,
 'num_parallel_tree': None,
 'random_state': None,
 'reg_alpha': None,
 'reg_lambda': None,
 'scale_pos_weight': None,
 'subsample': None,
 'tree_method': None,
 'validate_parameters': None,
 'verbosity': None}

In [15]:
randomforest = RandomForestClassifier(
    n_jobs=-1, random_state=4, verbose=4, n_estimators=2000, class_weight='balanced', max_depth=7, max_features=0.9
)
extratrees = ExtraTreesClassifier(
    n_jobs=-1, class_weight="balanced", random_state=4, verbose=4, n_estimators=2000, max_depth=7, max_features=0.9
)
svc = SVC(C=1, kernel="linear", random_state=4, verbose=1, class_weight='balanced')

xgboost = XGBClassifier(
    objective="multi:softmax", num_class=4, random_state=4, verbosity=1,  #num_parallel_tree=500, 
    n_estimators=500
)

In [16]:
stack_clf = StackingClassifier(
    estimators=[("rfc", randomforest), ("xgb", xgboost),],
    final_estimator=extratrees,
    n_jobs=-1,
    verbose=1,
)

bag_clf = BaggingClassifier(
    base_estimator=randomforest, n_estimators=5, oob_score=True, n_jobs=-1, verbose=1
)

In [17]:
X_train.dtypes

Gender              object
Ever_Married        object
Age                  int64
Graduated           object
Profession          object
Work_Experience    float64
Spending_Score      object
Family_Size        float64
Var_1               object
dtype: object

In [18]:
# class CategoricalTransformer( BaseEstimator, TransformerMixin ):
#     #Class constructor method that takes in a list of values as its argument
#     def __init__(self, cat_features):
#         self._cat_features = cat_features
        
#     #Return self nothing else to do here
#     def fit( self, X, y = None  ):
#         return self

#     #Transformer method we wrote for this transformer 
#     def transform(self, X , y = None ):
#        #Depending on constructor argument break dates column into specified units
#        #using the helper functions written above 
#        for feature in self._cat_features:
#            if feature == 'Var_1':
#                mode_imputer.fit(X[[feature]])
#                X[feature] = mode_imputer.transform(X[[feature]])
#             else :
#                 unknown_imputer.fo
#        return X.values 
#work in progress

In [19]:
categorical_feature_mode = ['Var_1']
categorical_feature_unknown = ['Gender', 'Ever_Married', 'Graduated', 'Profession', 'Spending_Score']
numerical_features = ['Age', 'Work_Experience', 'Family_Size']

In [20]:
numerical_transformer = Pipeline(
    steps=[("mean_imputer", mean_imputer), ("scaler", std_scalar)], verbose=True
)
categorical_transformer_unknown = Pipeline(steps=[("unknown_imputer", unknown_imputer), ("onehot", onehot_encoder)], verbose=True)

categorical_transformer_mode = Pipeline(steps=[("mode_imputer", mode_imputer), ("onehot", onehot_encoder)], verbose=True)

In [21]:
#%%
preprocessor = ColumnTransformer(
    transformers=[
        (
            "numerical_transformation_pipeline",
            numerical_transformer,
            numerical_features,
        ),
        (
            "categorical_transformation_pipeline_unknown",
            categorical_transformer_unknown,
            categorical_feature_unknown,
        ),
        (
            "categorical_transformation_pipeline_mode",
            categorical_transformer_mode,
            categorical_feature_mode,
        ),
        
    ],
    verbose=True,
)

In [22]:
randomforest_pipeline = Pipeline(
    steps=[("preprocessor", preprocessor), ("classifier", randomforest)], verbose=True
)
randomforest_pipeline = Pipeline(
    steps=[("preprocessor", preprocessor), ("classifier", randomforest)], verbose=True
)

extratrees_pipeline = Pipeline(
    steps=[("preprocessor", preprocessor), ("classifier", extratrees)], verbose=True
)

svc_pipeline = Pipeline(
    steps=[("preprocessor", preprocessor), ("classifier", svc)], verbose=True
)

xgboost_pipeline = Pipeline(
    steps=[("preprocessor", preprocessor), ("classifier", xgboost)], verbose=True
)

In [23]:
stacking_pipeline = Pipeline(
    steps=[("preprocessor", preprocessor), ("classifier", stack_clf)], verbose=True
)
bagging_pipeline = Pipeline(
    steps=[("preprocessor", preprocessor), ("classifier", bag_clf)], verbose=True
)

In [24]:
set_config(display='diagram')

In [25]:
randomforest_pipeline
extratrees_pipeline
svc_pipeline
xgboost_pipeline
stacking_pipeline
bagging_pipeline

In [26]:
set_config(display='text')

In [27]:
label_encoder.fit(y_train)
y_train = label_encoder.transform(y_train)
y_test = label_encoder.transform(y_test)

LabelEncoder()

In [28]:
randomforest_pipeline.fit(X_train, y_train)
y_pred = randomforest_pipeline.predict(X_test)
accuracy_score(y_test, y_pred)

g tree 1270 of 2000
building tree 1271 of 2000
building tree 1272 of 2000
building tree 1273 of 2000
building tree 1274 of 2000
building tree 1275 of 2000
building tree 1276 of 2000building tree 1277 of 2000

building tree 1278 of 2000building tree 1279 of 2000building tree 1280 of 2000

building tree 1281 of 2000
building tree 1282 of 2000
building tree 1283 of 2000
building tree 1284 of 2000

building tree 1285 of 2000
building tree 1286 of 2000
building tree 1287 of 2000
building tree 1288 of 2000
building tree 1289 of 2000building tree 1290 of 2000

building tree 1291 of 2000
building tree 1292 of 2000
building tree 1293 of 2000
building tree 1294 of 2000
building tree 1295 of 2000building tree 1296 of 2000
building tree 1297 of 2000

building tree 1298 of 2000
building tree 1299 of 2000
building tree 1300 of 2000
building tree 1301 of 2000building tree 1302 of 2000

building tree 1303 of 2000
building tree 1304 of 2000
building tree 1305 of 2000building tree 1306 of 2000

building

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('numerical_transformation_pipeline',
                                                  Pipeline(steps=[('mean_imputer',
                                                                   SimpleImputer()),
                                                                  ('scaler',
                                                                   StandardScaler())],
                                                           verbose=True),
                                                  ['Age', 'Work_Experience',
                                                   'Family_Size']),
                                                 ('categorical_transformation_pipeline_unknown',
                                                  Pipeline(steps=[('unknown_imputer',
                                                                   SimpleImputer(fill_value='Unknown'...
                                         

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  17 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done  90 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 213 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 384 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 605 tasks      | elapsed:    0.4s
[Parallel(n_jobs=4)]: Done 874 tasks      | elapsed:    0.5s
[Parallel(n_jobs=4)]: Done 1193 tasks      | elapsed:    0.6s
[Parallel(n_jobs=4)]: Done 1560 tasks      | elapsed:    0.9s
[Parallel(n_jobs=4)]: Done 1977 tasks      | elapsed:    1.1s
[Parallel(n_jobs=4)]: Done 2000 out of 2000 | elapsed:    1.1s finished


0.523543990086741

In [29]:
extratrees_pipeline.fit(X_train, y_train)
y_pred = extratrees_pipeline.predict(X_test)
accuracy_score(y_test, y_pred)

 tree 1270 of 2000

building tree 1271 of 2000building tree 1272 of 2000

building tree 1273 of 2000building tree 1274 of 2000

building tree 1275 of 2000
building tree 1276 of 2000building tree 1277 of 2000

building tree 1278 of 2000
building tree 1279 of 2000building tree 1280 of 2000

building tree 1281 of 2000building tree 1282 of 2000
building tree 1283 of 2000building tree 1284 of 2000


building tree 1285 of 2000
building tree 1286 of 2000
building tree 1287 of 2000
building tree 1288 of 2000building tree 1289 of 2000

building tree 1290 of 2000
building tree 1291 of 2000
building tree 1292 of 2000building tree 1293 of 2000

building tree 1294 of 2000
building tree 1295 of 2000building tree 1296 of 2000

building tree 1297 of 2000
building tree 1298 of 2000
building tree 1299 of 2000building tree 1300 of 2000

building tree 1301 of 2000
building tree 1302 of 2000
building tree 1303 of 2000building tree 1304 of 2000
building tree 1305 of 2000

building tree 1306 of 2000
building

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('numerical_transformation_pipeline',
                                                  Pipeline(steps=[('mean_imputer',
                                                                   SimpleImputer()),
                                                                  ('scaler',
                                                                   StandardScaler())],
                                                           verbose=True),
                                                  ['Age', 'Work_Experience',
                                                   'Family_Size']),
                                                 ('categorical_transformation_pipeline_unknown',
                                                  Pipeline(steps=[('unknown_imputer',
                                                                   SimpleImputer(fill_value='Unknown'...
                                         

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  17 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done  90 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 213 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 384 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 605 tasks      | elapsed:    0.3s
[Parallel(n_jobs=4)]: Done 874 tasks      | elapsed:    0.4s
[Parallel(n_jobs=4)]: Done 1193 tasks      | elapsed:    0.6s
[Parallel(n_jobs=4)]: Done 1560 tasks      | elapsed:    0.8s
[Parallel(n_jobs=4)]: Done 1977 tasks      | elapsed:    1.0s
[Parallel(n_jobs=4)]: Done 2000 out of 2000 | elapsed:    1.0s finished


0.5173482032218092

In [30]:
svc_pipeline.fit(X_train, y_train)
y_pred = svc_pipeline.predict(X_test)
accuracy_score(y_test, y_pred)

[Pipeline] ...... (step 1 of 2) Processing mean_imputer, total=   0.0s
[Pipeline] ............ (step 2 of 2) Processing scaler, total=   0.0s
[ColumnTransformer]  (1 of 3) Processing numerical_transformation_pipeline, total=   0.0s
[Pipeline] ... (step 1 of 2) Processing unknown_imputer, total=   0.0s
[Pipeline] ............ (step 2 of 2) Processing onehot, total=   0.0s
[ColumnTransformer]  (2 of 3) Processing categorical_transformation_pipeline_unknown, total=   0.0s
[Pipeline] ...... (step 1 of 2) Processing mode_imputer, total=   0.0s
[Pipeline] ............ (step 2 of 2) Processing onehot, total=   0.0s
[ColumnTransformer]  (3 of 3) Processing categorical_transformation_pipeline_mode, total=   0.0s
[Pipeline] ...... (step 1 of 2) Processing preprocessor, total=   0.1s
[LibSVM][Pipeline] ........ (step 2 of 2) Processing classifier, total=   4.2s


Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('numerical_transformation_pipeline',
                                                  Pipeline(steps=[('mean_imputer',
                                                                   SimpleImputer()),
                                                                  ('scaler',
                                                                   StandardScaler())],
                                                           verbose=True),
                                                  ['Age', 'Work_Experience',
                                                   'Family_Size']),
                                                 ('categorical_transformation_pipeline_unknown',
                                                  Pipeline(steps=[('unknown_imputer',
                                                                   SimpleImputer(fill_value='Unknown'...
                                         

0.48141263940520446

In [31]:
xgboost_pipeline.fit(X_train, y_train)
y_pred = xgboost_pipeline.predict(X_test)
accuracy_score(y_test, y_pred)

[Pipeline] ...... (step 1 of 2) Processing mean_imputer, total=   0.0s
[Pipeline] ............ (step 2 of 2) Processing scaler, total=   0.0s
[ColumnTransformer]  (1 of 3) Processing numerical_transformation_pipeline, total=   0.0s
[Pipeline] ... (step 1 of 2) Processing unknown_imputer, total=   0.0s
[Pipeline] ............ (step 2 of 2) Processing onehot, total=   0.0s
[ColumnTransformer]  (2 of 3) Processing categorical_transformation_pipeline_unknown, total=   0.0s
[Pipeline] ...... (step 1 of 2) Processing mode_imputer, total=   0.0s
[Pipeline] ............ (step 2 of 2) Processing onehot, total=   0.0s
[ColumnTransformer]  (3 of 3) Processing categorical_transformation_pipeline_mode, total=   0.0s
[Pipeline] ...... (step 1 of 2) Processing preprocessor, total=   0.1s
[Pipeline] ........ (step 2 of 2) Processing classifier, total=  18.5s


Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('numerical_transformation_pipeline',
                                                  Pipeline(steps=[('mean_imputer',
                                                                   SimpleImputer()),
                                                                  ('scaler',
                                                                   StandardScaler())],
                                                           verbose=True),
                                                  ['Age', 'Work_Experience',
                                                   'Family_Size']),
                                                 ('categorical_transformation_pipeline_unknown',
                                                  Pipeline(steps=[('unknown_imputer',
                                                                   SimpleImputer(fill_value='Unknown'...
                               interactio

0.4851301115241636

In [32]:
stacking_pipeline.fit(X_train, y_train)
y_pred = stacking_pipeline.predict(X_test)
accuracy_score(y_test, y_pred)

[Pipeline] ...... (step 1 of 2) Processing mean_imputer, total=   0.0s
[Pipeline] ............ (step 2 of 2) Processing scaler, total=   0.0s
[ColumnTransformer]  (1 of 3) Processing numerical_transformation_pipeline, total=   0.0s
[Pipeline] ... (step 1 of 2) Processing unknown_imputer, total=   0.0s
[Pipeline] ............ (step 2 of 2) Processing onehot, total=   0.0s
[ColumnTransformer]  (2 of 3) Processing categorical_transformation_pipeline_unknown, total=   0.0s
[Pipeline] ...... (step 1 of 2) Processing mode_imputer, total=   0.0s
[Pipeline] ............ (step 2 of 2) Processing onehot, total=   0.0s
[ColumnTransformer]  (3 of 3) Processing categorical_transformation_pipeline_mode, total=   0.0s
[Pipeline] ...... (step 1 of 2) Processing preprocessor, total=   0.1s


KeyboardInterrupt: 

In [None]:
bagging_pipeline.fit(X_train, y_train)
y_pred = bagging_pipeline.predict(X_test)
accuracy_score(y_test, y_pred)