In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt

In [30]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn import set_config

In [26]:
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier

In [3]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

In [4]:
data = pd.read_csv('../input/train.csv', index_col='ID')
data.sort_index(axis=0, inplace=True)

In [5]:
data.shape
data.head()

(8068, 10)

Unnamed: 0_level_0,Gender,Ever_Married,Age,Graduated,Profession,Work_Experience,Spending_Score,Family_Size,Var_1,Segmentation
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
458982,Male,Yes,61,Yes,Executive,1.0,High,3.0,Cat_6,C
458983,Female,Yes,63,Yes,Executive,0.0,High,5.0,Cat_6,C
458984,Male,Yes,39,Yes,Artist,0.0,Average,3.0,Cat_6,C
458985,Male,No,23,No,Healthcare,1.0,Low,4.0,Cat_6,D
458986,Male,No,18,No,Healthcare,7.0,Low,4.0,Cat_6,D


In [6]:
X = data.iloc[ : , :-1]
y = data.iloc[ :, -1]

In [7]:
X_test, X_train, y_test, y_train = train_test_split(X, y, test_size=2, random_state=0)

In [9]:
X_train.dtypes

Gender              object
Ever_Married        object
Age                  int64
Graduated           object
Profession          object
Work_Experience    float64
Spending_Score      object
Family_Size        float64
Var_1               object
dtype: object

In [10]:
X_train.isna().sum()
X_test.isna().sum()

Gender             0
Ever_Married       1
Age                0
Graduated          0
Profession         0
Work_Experience    1
Spending_Score     0
Family_Size        0
Var_1              0
dtype: int64

Gender               0
Ever_Married       139
Age                  0
Graduated           78
Profession         124
Work_Experience    828
Spending_Score       0
Family_Size        335
Var_1               76
dtype: int64

In [12]:
mean_imputer = SimpleImputer(strategy='mean')
median_imputer = SimpleImputer(strategy='median')
mode_imputer = SimpleImputer(strategy='most_frequent')
unknown_imputer = SimpleImputer(strategy='constant', fill_value='Unknown')

In [14]:
std_scalar = StandardScaler()
onehot_encoder = OneHotEncoder(drop="first", handle_unknown='raise')
label_encoder = LabelEncoder()

In [27]:
randomforest = RandomForestClassifier(
    n_jobs=-1, n_estimators=100, random_state=4, verbose=1,
)

In [15]:
X_train.dtypes

Gender              object
Ever_Married        object
Age                  int64
Graduated           object
Profession          object
Work_Experience    float64
Spending_Score      object
Family_Size        float64
Var_1               object
dtype: object

In [17]:
# class CategoricalTransformer( BaseEstimator, TransformerMixin ):
#     #Class constructor method that takes in a list of values as its argument
#     def __init__(self, cat_features):
#         self._cat_features = cat_features
        
#     #Return self nothing else to do here
#     def fit( self, X, y = None  ):
#         return self

#     #Transformer method we wrote for this transformer 
#     def transform(self, X , y = None ):
#        #Depending on constructor argument break dates column into specified units
#        #using the helper functions written above 
#        for feature in self._cat_features:
#            if feature == 'Var_1':
#                mode_imputer.fit(X[[feature]])
#                X[feature] = mode_imputer.transform(X[[feature]])
#             else :
#                 unknown_imputer.fo
#        return X.values 
#work in progress

In [19]:
categorical_feature_mode = ['Var_1']
categorical_feature_unknown = ['Gender', 'Ever_Married', 'Graduated', 'Profession', 'Spending_Score']
numerical_features = ['Age', 'Work_Experience', 'Family_Size']

In [22]:
numerical_transformer = Pipeline(
    steps=[("mean_imputer", mean_imputer), ("scaler", std_scalar)], verbose=True
)
categorical_transformer_unknown = Pipeline(steps=[("unknown_imputer", unknown_imputer), ("onehot", onehot_encoder)], verbose=True)

categorical_transformer_mode = Pipeline(steps=[("mode_imputer", mode_imputer), ("onehot", onehot_encoder)], verbose=True)

In [24]:
#%%
preprocessor = ColumnTransformer(
    transformers=[
        (
            "numerical_transformation_pipeline",
            numerical_transformer,
            numerical_features,
        ),
        (
            "categorical_transformation_pipeline_unknown",
            categorical_transformer_unknown,
            categorical_feature_unknown,
        ),
        (
            "categorical_transformation_pipeline_mode",
            categorical_transformer_mode,
            categorical_feature_mode,
        ),
        
    ],
    verbose=True,
)

In [28]:
randomforest_pipeline = Pipeline(
    steps=[("preprocessor", preprocessor), ("classifier", randomforest)], verbose=True
)

In [31]:
set_config(display='diagram')

In [12]:
data['Graduated'].value_counts(dropna=False)
data['Graduated'].fillna(value='Unknown', inplace=True)
data['Graduated'].value_counts(dropna=False)

Yes    4968
No     3022
NaN      78
Name: Graduated, dtype: int64

Yes        4968
No         3022
Unknown      78
Name: Graduated, dtype: int64

In [13]:
data['Ever_Married'].value_counts(dropna=False)
data['Ever_Married'].fillna(value='Unknown', inplace=True)
data['Ever_Married'].value_counts(dropna=False)

Yes    4643
No     3285
NaN     140
Name: Ever_Married, dtype: int64

Yes        4643
No         3285
Unknown     140
Name: Ever_Married, dtype: int64

In [14]:
data['Profession'].value_counts(normalize=True, dropna=False)
data['Profession'].fillna(value='Unknown', inplace=True)
data['Profession'].value_counts(normalize=True, dropna=False)

Artist           0.311849
Healthcare       0.165097
Entertainment    0.117625
Engineer         0.086639
Doctor           0.085275
Lawyer           0.077219
Executive        0.074244
Marketing        0.036192
Homemaker        0.030491
NaN              0.015369
Name: Profession, dtype: float64

Artist           0.311849
Healthcare       0.165097
Entertainment    0.117625
Engineer         0.086639
Doctor           0.085275
Lawyer           0.077219
Executive        0.074244
Marketing        0.036192
Homemaker        0.030491
Unknown          0.015369
Name: Profession, dtype: float64

In [15]:
data['Var_1'].value_counts(normalize=True, dropna=False)
data['Var_1'].fillna(value=data['Var_1'].mode()[0], inplace=True)
data['Var_1'].value_counts(normalize=True, dropna=False)

Cat_6    0.649232
Cat_4    0.134978
Cat_3    0.101884
Cat_2    0.052305
Cat_7    0.025161
Cat_1    0.016485
Cat_5    0.010535
NaN      0.009420
Name: Var_1, dtype: float64

Cat_6    0.658651
Cat_4    0.134978
Cat_3    0.101884
Cat_2    0.052305
Cat_7    0.025161
Cat_1    0.016485
Cat_5    0.010535
Name: Var_1, dtype: float64

In [16]:
data['Work_Experience'].value_counts(normalize=True, dropna=False)
data['Work_Experience'].fillna(value=data['Work_Experience'].mean().round(), inplace=True)
data['Work_Experience'].value_counts(normalize=True, dropna=False)

1.0     0.291770
0.0     0.287308
NaN     0.102752
9.0     0.058751
8.0     0.057387
2.0     0.035449
3.0     0.031606
4.0     0.031358
6.0     0.025285
7.0     0.024294
5.0     0.024046
10.0    0.006569
11.0    0.006197
12.0    0.005949
13.0    0.005702
14.0    0.005578
Name: Work_Experience, dtype: float64

1.0     0.291770
0.0     0.287308
3.0     0.134358
9.0     0.058751
8.0     0.057387
2.0     0.035449
4.0     0.031358
6.0     0.025285
7.0     0.024294
5.0     0.024046
10.0    0.006569
11.0    0.006197
12.0    0.005949
13.0    0.005702
14.0    0.005578
Name: Work_Experience, dtype: float64

In [17]:
data['Family_Size'].value_counts(normalize=True, dropna=False)
data['Family_Size'].fillna(value=data['Family_Size'].median(), inplace=True)
data['Family_Size'].value_counts(normalize=True, dropna=False)

2.0    0.296232
3.0    0.185548
1.0    0.180094
4.0    0.170922
5.0    0.075855
NaN    0.041522
6.0    0.026277
7.0    0.011899
8.0    0.006197
9.0    0.005454
Name: Family_Size, dtype: float64

2.0    0.296232
3.0    0.227070
1.0    0.180094
4.0    0.170922
5.0    0.075855
6.0    0.026277
7.0    0.011899
8.0    0.006197
9.0    0.005454
Name: Family_Size, dtype: float64

In [19]:
X = data.iloc[ : , :-1]
y = data.iloc[ :, -1]