In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('seaborn-whitegrid')

import seaborn as sns
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.linear_model import LogisticRegression

from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

pd.set_option('display.max_colwidth', -1)

import warnings
warnings.simplefilter('ignore')



In [2]:
ss = pd.read_csv('data/sample_submission_wyi0h0z.csv')
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

In [3]:
train.shape,test.shape

((8068, 11), (2627, 10))

In [4]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8068 entries, 0 to 8067
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   ID               8068 non-null   int64  
 1   Gender           8068 non-null   object 
 2   Ever_Married     7928 non-null   object 
 3   Age              8068 non-null   int64  
 4   Graduated        7990 non-null   object 
 5   Profession       7944 non-null   object 
 6   Work_Experience  7239 non-null   float64
 7   Spending_Score   8068 non-null   object 
 8   Family_Size      7733 non-null   float64
 9   Var_1            7992 non-null   object 
 10  Segmentation     8068 non-null   object 
dtypes: float64(2), int64(2), object(7)
memory usage: 693.5+ KB


In [6]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
train['Segmentation'] = le.fit_transform(train['Segmentation'])

In [8]:
ID_COL, TARGET_COL = 'ID', 'Segmentation'
features = [c for c in train.columns if c not in [ID_COL, TARGET_COL]]

cat_cols = ['Gender',
 'Ever_Married',
 'Graduated',
 'Profession',
 'Spending_Score',
 'Var_1']

num_cols = [c for c in features if c not in cat_cols]

In [9]:
def download_preds(preds_test, file_name = 'cust_seg.csv'):
    

  ## 1. Setting the target column with our obtained predictions
      ss[TARGET_COL] = preds_test
    
      ss[TARGET_COL] = le.inverse_transform(ss[TARGET_COL])
    


  ## 2. Saving our predictions to a csv file

      ss.to_csv(file_name, index = False)

##  Classification Models.

In [10]:
df = pd.concat([train, test], axis=0).reset_index(drop = True)
df.shape

(10695, 11)

In [13]:
df = pd.get_dummies(df, columns = cat_cols)

In [14]:
df.isnull().sum()

ID                          0   
Age                         0   
Work_Experience             1098
Family_Size                 448 
Segmentation                2627
Gender_Female               0   
Gender_Male                 0   
Ever_Married_No             0   
Ever_Married_Yes            0   
Graduated_No                0   
Graduated_Yes               0   
Profession_Artist           0   
Profession_Doctor           0   
Profession_Engineer         0   
Profession_Entertainment    0   
Profession_Executive        0   
Profession_Healthcare       0   
Profession_Homemaker        0   
Profession_Lawyer           0   
Profession_Marketing        0   
Spending_Score_Average      0   
Spending_Score_High         0   
Spending_Score_Low          0   
Var_1_Cat_1                 0   
Var_1_Cat_2                 0   
Var_1_Cat_3                 0   
Var_1_Cat_4                 0   
Var_1_Cat_5                 0   
Var_1_Cat_6                 0   
Var_1_Cat_7                 0   
dtype: int

In [15]:
df['Work_Experience'] = df['Work_Experience'].fillna(df['Work_Experience'].median())
df['Family_Size'] = df['Family_Size'].fillna(df['Family_Size'].median())

In [16]:
df.isnull().sum()

ID                          0   
Age                         0   
Work_Experience             0   
Family_Size                 0   
Segmentation                2627
Gender_Female               0   
Gender_Male                 0   
Ever_Married_No             0   
Ever_Married_Yes            0   
Graduated_No                0   
Graduated_Yes               0   
Profession_Artist           0   
Profession_Doctor           0   
Profession_Engineer         0   
Profession_Entertainment    0   
Profession_Executive        0   
Profession_Healthcare       0   
Profession_Homemaker        0   
Profession_Lawyer           0   
Profession_Marketing        0   
Spending_Score_Average      0   
Spending_Score_High         0   
Spending_Score_Low          0   
Var_1_Cat_1                 0   
Var_1_Cat_2                 0   
Var_1_Cat_3                 0   
Var_1_Cat_4                 0   
Var_1_Cat_5                 0   
Var_1_Cat_6                 0   
Var_1_Cat_7                 0   
dtype: int

### 4. Split the processed dataset back into train and test datasets.

In [17]:
train_proc, test_proc = df[:train.shape[0]], df[train.shape[0]:].reset_index(drop = True)

features = [c for c in train_proc.columns if c not in [ID_COL, TARGET_COL]]

In [18]:
len(features)

28

In [21]:
train_proc[TARGET_COL] = train_proc[TARGET_COL].astype(int)

### 5. Split the train set into train and validation sets.

In [22]:
trn, val = train_test_split(train_proc, test_size=0.2, random_state = 1, stratify = train_proc[TARGET_COL])

###### Input to our model will be the features
X_trn, X_val = trn[features], val[features]

###### Output of our model will be the TARGET_COL
y_trn, y_val = trn[TARGET_COL], val[TARGET_COL]

##### Features for the test data that we will be predicting
X_test = test_proc[features]

In [23]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
_ = scaler.fit(X_trn)

X_trn = scaler.transform(X_trn)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

### A. Logistic

In [29]:
clf = LogisticRegression(random_state = 1,multi_class="auto")
_ = clf.fit(X_trn, y_trn)

preds_val = clf.predict(X_val)

accuracy_score(y_val, preds_val)

0.5061957868649318

In [30]:
preds_test = clf.predict(X_test)
download_preds(preds_test, file_name='hacklive_logistic.csv')

### B. Decision Tree

In [32]:
clf = DecisionTreeClassifier(random_state = 1)
_ = clf.fit(X_trn, y_trn)

preds_val = clf.predict(X_val)

accuracy_score(y_val, preds_val)

0.4318463444857497

In [33]:
preds_test = clf.predict(X_test)
download_preds(preds_test, file_name='hacklive_decision_tree.csv')

### C. Random Forest

In [35]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(random_state=1,n_estimators=500)
_ = clf._ = clf.fit(X_trn, y_trn)

preds_val = clf.predict(X_val)

accuracy_score(y_val, preds_val)

0.48141263940520446

In [36]:
preds_test = clf.predict(X_test)
download_preds(preds_test, file_name='hacklive_random_tree.csv')

## Hyperparameter Tuning

In [37]:
from sklearn.model_selection import RandomizedSearchCV

hyperparam_combs = {
    'max_depth': [4, 6, 8, 10, 12],
    'criterion': ['gini', 'entropy'],
    'min_samples_split': [2, 10, 20, 30, 40],
    'max_features': [0.2, 0.4, 0.6, 0.8, 1],
    'max_leaf_nodes': [8, 16, 32, 64, 128]
}

clf = RandomizedSearchCV(DecisionTreeClassifier(),
                         hyperparam_combs,
                         random_state=1,
                         n_iter=20)

search = clf.fit(train_proc[features], train_proc[TARGET_COL])

search.best_params_

{'min_samples_split': 30,
 'max_leaf_nodes': 32,
 'max_features': 0.8,
 'max_depth': 8,
 'criterion': 'entropy'}

In [38]:
optimal_params = {'min_samples_split': 30,
 'max_leaf_nodes': 32,
 'max_features': 0.8,
 'max_depth': 8,
 'criterion': 'entropy'}

clf = DecisionTreeClassifier(random_state = 1, **optimal_params)
_ = clf.fit(X_trn, y_trn)

preds_val = clf.predict(X_val)

accuracy_score(y_val, preds_val)

0.5216852540272615

In [39]:
preds_test = clf.predict(X_test)

download_preds(preds_test, file_name = 'hacklive_dt_tuned_random.csv')

## K-Fold Validation

In [40]:
from sklearn.model_selection import StratifiedKFold

In [41]:
def run_clf_kfold(clf, train, test, features):

  N_SPLITS = 5

  oofs = np.zeros(len(train))
  preds = np.zeros((len(test)))

  folds = StratifiedKFold(n_splits = N_SPLITS)

  for fold_, (trn_idx, val_idx) in enumerate(folds.split(train, train[TARGET_COL])):
    print(f'\n------------- Fold {fold_ + 1} -------------')

    ############# Get train, validation and test sets along with targets ################
  
    ### Training Set
    X_trn, y_trn = train[features].iloc[trn_idx], target.iloc[trn_idx]

    ### Validation Set
    X_val, y_val = train[features].iloc[val_idx], target.iloc[val_idx]

    ### Test Set
    X_test = test[features]

    ############# Scaling Data ################
    scaler = StandardScaler()
    _ = scaler.fit(X_trn)

    X_trn = scaler.transform(X_trn)
    X_val = scaler.transform(X_val)
    X_test = scaler.transform(X_test)


    ############# Fitting and Predicting ################

    _ = clf.fit(X_trn, y_trn)

    ### Instead of directly predicting the classes we will obtain the probability of positive class.
#     preds_val = clf.predict_proba(X_val)[:, 1]
#     preds_test = clf.predict_proba(X_test)[:, 1]

    preds_val = clf.predict(X_val)
    preds_test = clf.predict(X_test)
    
    fold_score = accuracy_score(y_val, preds_val)
    print(f'\naccuracy score for validation set is {fold_score}')

    oofs[val_idx] = preds_val
    preds += preds_test / N_SPLITS


  oofs_score = accuracy_score(y_val, preds_val)
  print(f'\n\naccuracy score for oofs is {oofs_score}')

  return oofs, preds

In [42]:
target = train[TARGET_COL]
target

0       3
1       0
2       1
3       1
4       0
       ..
8063    3
8064    3
8065    3
8066    1
8067    1
Name: Segmentation, Length: 8068, dtype: int32

In [43]:
dt_params = {'min_samples_split': 30,
 'max_leaf_nodes': 32,
 'max_features': 0.8,
 'max_depth': 8,
 'criterion': 'entropy'}

clf = DecisionTreeClassifier(**dt_params)
        

dt_oofs, dt_preds = run_clf_kfold(clf, train_proc, test_proc, features)


------------- Fold 1 -------------

accuracy score for validation set is 0.5328376703841388

------------- Fold 2 -------------

accuracy score for validation set is 0.4950433705080545

------------- Fold 3 -------------

accuracy score for validation set is 0.5130111524163569

------------- Fold 4 -------------

accuracy score for validation set is 0.536887786732796

------------- Fold 5 -------------

accuracy score for validation set is 0.5244885306881587


accuracy score for oofs is 0.5244885306881587


## Gradient Boosting

### LightGBM

In [51]:
from lightgbm import LGBMClassifier
clf =  LGBMClassifier()
_ = clf._ = clf.fit(X_trn, y_trn)

preds_val = clf.predict(X_val)

accuracy_score(y_val, preds_val)

0.5371747211895911

In [52]:
preds_test = clf.predict(X_test)

download_preds(preds_test, file_name = 'hacklive__lgbm.csv')