In [3]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In this treatise we'll start with a dataset of customer demographics and corresponding transaction data. The objective is to train a model on existing customer data and leverage it to predict potential high-value customers for the client. 
The data has been pre-processed and cleaned, accounting for white-space, duplicates, etc.
We'll start by dealing with nulls, using appropriate imputation methods. Simple imputation should prove satisfactory, so regression imputation methods needn't be employed. 
Target encoding or One Hot Encoding can be leveraged to deal with categorical variables as the cardinality is low.
We'll then begin building the model. We'll explore various methods, starting with supervised methods like random forest regressors and XGBoost, use mutual information to gauge which predictors are useful, and end with unsupervised methods like K-Means clustering, noting which method is most effective in minimising the loss function. 

We begin by splitting the data into training and validation datasets. I'll process the categorical and numerical columns separately, so we'll create lists of the categorical and numerical variables. 

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the cleaned customer-demographic sheet.
data = pd.read_csv('../input/kmpg-cleaned-final/KPMG Cleaned New  - CustomerDemographic.csv')

# Rows without a target (RFM_Score) cannot be used for supervised training.
data1 = data.dropna(subset=['RFM_Score'])
y = data1['RFM_Score']

# Predictor columns used throughout the notebook.
features = [
    'Gender',
    'past_3_years_bike_related_purchases',
    'Age',
    'job_industry_category',
    'wealth_segment',
    'owns_car',
    'tenure',
    'postcode',
    'State',
    'Property_Val',
]
X = data1[features].copy()

# 80/20 train/validation split with a fixed seed.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=0
)

# Partition the columns by dtype: strings are categorical, int64/float64 are numeric.
categ_cols = [name for name, kind in X_train.dtypes.items() if kind == 'object']
num_cols = [name for name, kind in X_train.dtypes.items() if kind in ['int64', 'float64']]

# Missing values per column, then the number of training rows.
print(X_train.isnull().sum())
len(X_train)


In [5]:
import numpy as np

# Small self-contained illustration of how dropna(subset=...) behaves.
# The rows live in `toy_rows` so that the customer DataFrame `data` loaded
# in the previous cell is not overwritten by this demo.
toy_rows = [[10,'Tom'],[15,'Julie'],['NaN','Rob'],[20,'Rov'],[12,'NaN'],[13,'Sanj'],[23,'NaN']]
df = pd.DataFrame(toy_rows, columns=['Age','Name'])

# The placeholder string 'NaN' is not a real missing value; convert it first,
# otherwise dropna() would keep those rows.
df = df.replace('NaN', np.nan)
# Age mixed ints with the placeholder string, so make it properly numeric.
df['Age'] = pd.to_numeric(df['Age'])

# Only rows with a missing Name are dropped; the row with a missing Age stays.
df1 = df.dropna(axis=0, subset=['Name'])
df1

Dealing with nulls in numerical data using Simple Imputer...

In [6]:
#Taking care of numerical nulls with Simple Imputer 
from sklearn.impute import SimpleImputer

# Numeric slices of the training and validation features.
X_num_t = X_train[num_cols]
X_num_v = X_valid[num_cols]

# Median imputation: the medians are learned from the training slice only and
# then re-used for the validation slice.
my_imputer = SimpleImputer(strategy='median')
# Building the frames from the imputer output gives them a fresh positional
# index rather than the index of X_train / X_valid.
imputed_X_train = pd.DataFrame(my_imputer.fit_transform(X_num_t), columns=X_num_t.columns)
imputed_X_valid = pd.DataFrame(my_imputer.transform(X_num_v), columns=X_num_v.columns)

# Confirm no nulls remain, then show the number of training rows.
print(imputed_X_train.isnull().sum())
len(imputed_X_train)

Dealing with nulls in categorical data using the most frequent observation

In [7]:
#Imputing most frequent observation in Nulls 
from sklearn.impute import SimpleImputer
# Categorical slices of the training and validation features.
X_cat_t = X_train[categ_cols]
X_cat_v = X_valid[categ_cols]

# Fill missing categories with the most frequent value seen in training;
# the same learned values are applied to the validation slice.
categ_imputer = SimpleImputer(strategy='most_frequent')
# As with the numeric frames, these are rebuilt with a fresh positional index.
X_imp_cat_train = pd.DataFrame(categ_imputer.fit_transform(X_cat_t), columns=X_cat_t.columns)
X_imp_cat_val = pd.DataFrame(categ_imputer.transform(X_cat_v), columns=X_cat_v.columns)

# Confirm no nulls remain, then show the number of training rows.
print(X_imp_cat_train.isnull().sum())

len(X_imp_cat_train)

In [8]:
#Concatting imputed categorical and numerical data
# Put the imputed numeric and categorical columns side by side again
# (numeric columns first, then categorical).
X_train_new = pd.concat((imputed_X_train, X_imp_cat_train), axis='columns')
X_valid_new = pd.concat((imputed_X_valid, X_imp_cat_val), axis='columns')

# Row count of the recombined training frame.
len(X_train_new)


Models aren't built to accept categorical variables in raw form, so we'll numerically encode such variables. Since the cardinality of the variables in this dataset is low, we'll employ One Hot Encoding.

In [9]:
#OneHotEncoding for categorical variables

from sklearn.preprocessing import OneHotEncoder

# Columns that are still strings after imputation are the ones to encode.
object_cols = [col for col in X_train_new.columns if X_train_new[col].dtype == 'object']

# Dense output is needed so the result can be wrapped in a DataFrame.
# scikit-learn renamed `sparse` to `sparse_output` (1.2) and later removed the
# old keyword, so try the new spelling first and fall back for old versions.
try:
    OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

# Fit the category vocabulary on training data only; categories that appear
# only in validation are encoded as all-zero rows (handle_unknown='ignore').
OH_train_values = OH_encoder.fit_transform(X_train_new[object_cols])
OH_valid_values = OH_encoder.transform(X_valid_new[object_cols])

# get_feature_names() was replaced by get_feature_names_out(); support both.
# Passing the column names yields labels such as "<column>_<category>"
# instead of the anonymous "x0_<category>".
if hasattr(OH_encoder, 'get_feature_names_out'):
    OH_names = list(OH_encoder.get_feature_names_out(object_cols))
else:
    OH_names = list(OH_encoder.get_feature_names(object_cols))

OH_X_train = pd.DataFrame(OH_train_values, columns=OH_names)
OH_X_valid = pd.DataFrame(OH_valid_values, columns=OH_names)

#remove categ columns we'll replace with OHE cols 
num_X_train_new = X_train_new.drop(object_cols, axis=1)
num_X_valid_new = X_valid_new.drop(object_cols, axis=1)

#final data with OHE columns 
fin_X_train = pd.concat([num_X_train_new, OH_X_train], axis=1)
fin_X_valid = pd.concat([num_X_valid_new, OH_X_valid], axis=1)

# concat aligns on the index; a row-count change would mean the two halves
# did not line up and NaN-padded rows were introduced.
assert len(fin_X_train) == len(X_train_new)
assert len(fin_X_valid) == len(X_valid_new)

fin_X_train.columns




We'll start with a basic Random Forest Regressor model and tune its parameters to reduce the chosen loss function, which here is the mean absolute error (MAE).

In [10]:
#Lets build a model!
#method1 using RandomForestRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Baseline model: a seeded 200-tree random forest trained on the encoded features.
model = RandomForestRegressor(random_state=0, n_estimators=200).fit(fin_X_train, y_train)

# Score the baseline on the held-out validation rows with mean absolute error.
preds = model.predict(fin_X_valid)
mae = mean_absolute_error(y_true=y_valid, y_pred=preds)
mae

This value is a little high. We'll explore other models to try to minimise the MAE. Next we explore an XGBRegressor model.

In [11]:
#Using XGBoost model 
from xgboost import XGBRegressor

# Gradient-boosted trees: up to 500 rounds with a learning rate of 0.95.
model_xgb = XGBRegressor(learning_rate=0.95, n_estimators=500)

# Boosting stops once the validation metric has not improved for 5 rounds.
# The validation set that drives early stopping is the same one scored below,
# so the reported error is not from fully unseen data.
# NOTE(review): newer xgboost releases expect early_stopping_rounds on the
# constructor rather than on fit() - confirm against the installed version.
model_xgb.fit(
    fin_X_train,
    y_train,
    eval_set=[(fin_X_valid, y_valid)],
    early_stopping_rounds=5,
    verbose=False,
)

predictions = model_xgb.predict(fin_X_valid)
# MAE is symmetric in its two arguments; (y_true, y_pred) is the conventional order.
mae2 = mean_absolute_error(y_valid, predictions)
mae2


Though on the surface the mean absolute error seems to have reduced, let's take a look at what's going on under the hood:

In [16]:
# Spot-check three arbitrary positions in the XGBoost validation predictions.
for position in (5, 34, 272):
    print(predictions[position])

Each value in the list of predictions seems to be identical, i.e. the model seems to be predicting the same RFM score for every customer. Before drawing conclusions, we'll verify that every value in the list 'predictions' is indeed the same.

In [29]:
# Count how many predictions differ from the first one.
# A result of 0 means every prediction is identical; any other value is the
# number of entries that deviate from predictions[0].
# (Comparing a value with itself plus one, as before, can never be true and
# therefore always reported 0 regardless of the predictions.)
x = sum(1 for n in predictions if n != predictions[0])
print(x)


The above code returns 0, which tells us that every element in the list is indeed the same and the model is predicting the same RFM value for each customer, i.e. the model has collapsed to a constant prediction and is underfitting rather than learning anything from the features. 
Perhaps the data isn't large enough to identify trends, or perhaps no strong relationships exist between the variables. Before proceeding, we'll subject our variables to a mutual information regression to gauge how useful each feature is in predicting the RFM score. 


In [12]:
#mutual information
from sklearn.feature_selection import mutual_info_regression
def make_mi_scores(X, y, discrete_features="auto"):
    """Rank the columns of X by their mutual information with y.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; its column names become the index of the result.
    y : array-like
        Regression target, one value per row of X.
    discrete_features : "auto", bool or array-like, default "auto"
        Forwarded to ``mutual_info_regression``. Use a boolean mask (one entry
        per column) to mark which features are discrete. The default keeps the
        previous behaviour of this helper.

    Returns
    -------
    pandas.Series
        MI scores named "MI Scores", sorted from most to least informative.
    """
    mi_scores = mutual_info_regression(
        X, y, discrete_features=discrete_features, random_state=0
    )
    mi_scores = pd.Series(mi_scores, name="MI Scores", index=X.columns)
    return mi_scores.sort_values(ascending=False)

# The one-hot indicator columns only take the values 0 and 1, so flag them as
# discrete; the remaining (imputed numeric) columns are treated as continuous.
is_discrete = fin_X_train.columns.isin(OH_X_train.columns)

mi_scores = make_mi_scores(fin_X_train, y_train, discrete_features=is_discrete)
mi_scores


None of the mutual information scores seem to be very high, so intrinsically none of these variables seem to be very good predictors. For the purpose of experiment, we'll attempt to use K-Means clustering to segment the customer base. We'll then add the cluster identifier to the original dataset and see if that trains a more accurate model.

In [33]:
#Segmenting the customer base with K-Means clustering
from sklearn.cluster import KMeans
from sklearn.model_selection import cross_val_score
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler



def score_dataset(X, y, model=None):
    """Return the 5-fold cross-validated mean absolute error of a model.

    Parameters
    ----------
    X : array-like or pandas.DataFrame
        Feature matrix.
    y : array-like
        Regression target.
    model : estimator, optional
        Any scikit-learn compatible regressor. When omitted, a fresh
        ``XGBRegressor()`` with default settings is created for this call.

    Returns
    -------
    float
        Mean absolute error averaged over the 5 folds (lower is better).
    """
    if model is None:
        # Build the default here rather than in the signature so that a new,
        # unfitted estimator is used on every call.
        model = XGBRegressor()
    fold_scores = cross_val_score(
        model, X, y, cv=5, scoring="neg_mean_absolute_error",
    )
    # scikit-learn reports the negated error; flip the sign back.
    # No square root is taken: the metric is MAE, not a squared error.
    return -1 * fold_scores.mean()

# Work on copies so the encoded frames from the earlier cells stay untouched.
fin_Xt = fin_X_train.copy()
fin_Xv = fin_X_valid.copy()
fin_y = y_train.copy()

# Standardise the features before clustering. The scaling parameters are
# learned from the training rows only and then applied to the validation
# rows, so both sets live on the same scale.
scale = StandardScaler()
fin_X_t_scaled = pd.DataFrame(scale.fit_transform(fin_Xt), columns=fin_X_train.columns)
fin_X_v_scaled = pd.DataFrame(scale.transform(fin_Xv), columns=fin_X_valid.columns)

#We ask the model to identify clusters then add the clusters parameter as a new factor to the final data.
#We then run the XGBRegressor function again
kmeans = KMeans(n_clusters=5, n_init=7, random_state=0)
# Cluster centres are fitted on the training rows; validation rows are only
# assigned to those existing centres. Re-fitting on the validation rows would
# produce unrelated cluster ids, so "Cluster" would not mean the same thing
# in the two frames.
fin_Xt["Cluster"] = kmeans.fit_predict(fin_X_t_scaled)
fin_Xv["Cluster"] = kmeans.predict(fin_X_v_scaled)

print(fin_Xt.head())

# Same XGBoost settings as before, now with the extra "Cluster" feature.
# NOTE(review): newer xgboost releases expect early_stopping_rounds on the
# constructor rather than on fit() - confirm against the installed version.
model_new = XGBRegressor(n_estimators=500, learning_rate=0.95)
model_new.fit(fin_Xt, fin_y,
             early_stopping_rounds=7,
             eval_set=[(fin_Xv, y_valid)],
             verbose=False)
predict = model_new.predict(fin_Xv)
mae2 = mean_absolute_error(y_valid, predict)
print(mae2)



In conclusion, our model hasn't been very successful in identifying patterns and predicting scores that we could use to assess the potential of a new dataset of consumers. If we were simply looking to segment our existing consumer database, K-Means would be satisfactory; however, it may fall short when it comes to predicting high-potential consumers. The most promising approach was to add the cluster identifiers to the original data and train a supervised model on it, but here too the results were unsatisfactory.