## Customer Segmentation

An automobile company plans to enter new markets with its existing products (P1, P2, P3, P4 and P5). After intensive market research, they have deduced that the behavior of the new market is similar to that of their existing market.

In their existing market, the sales team has classified all customers into 4 segments (A, B, C, D). They then performed segmented outreach and communication for the different customer segments. This strategy has worked exceptionally well for them. They plan to use the same strategy in the new markets and have identified 2627 new potential customers.

In [1]:
## import all libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

In [1]:
# Read the training and test CSV files from the working directory.
train, test = (pd.read_csv(csv_path) for csv_path in ("train.csv", "test.csv"))

In [1]:
# (rows, columns) of the training frame followed by the test frame.
tuple(frame.shape for frame in (train, test))

In [1]:
# Preview the first rows of the training data.
train.head()

In [1]:
# Preview the first rows of the test data.
test.head()

In [1]:
# Number of missing values in each training column.
train.isna().sum()

In [1]:
# Bar chart of the non-null Ever_Married count for every (Gender, Var_1) pair.
gender_var1_counts = train.groupby(['Gender', 'Var_1'])['Ever_Married'].count()
gender_var1_counts.plot.bar()

In [1]:
# Bar chart of the non-null Ever_Married count for every (Spending_Score, Var_1) pair.
spending_var1_counts = train.groupby(['Spending_Score', 'Var_1'])['Ever_Married'].count()
spending_var1_counts.plot.bar()

In [1]:
# Missing values per training column, for comparison with the test data.
train.isna().sum()

In [1]:
# Number of missing values in each test column.
test.isna().sum()

In [1]:
# Stack train and test into one frame so that imputation and encoding are
# applied identically to both (train rows first, test rows after).
# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it
# the test rows reuse the row labels of the training rows, and any label-based
# lookup or alignment on `df` becomes ambiguous.
df = pd.concat([train, test], ignore_index=True)
print(df.shape)
df.head()

In [1]:
# Show the level counts of every categorical (object-dtype) feature,
# excluding the target column.
cat_feat = [feat for feat in df.columns if df[feat].dtypes=='O' and feat != 'Segmentation']
print("Categorical features:",cat_feat)
for f in cat_feat:
    fig, ax = plt.subplots(figsize=(12,4))
    # Pass the column by keyword: newer seaborn releases no longer interpret a
    # positional first argument as `x`. The index is reset so that repeated row
    # labels in the combined frame cannot interfere with plotting; the Series
    # name is kept, so the x-axis is still labelled with the feature name.
    sns.countplot(x=df[f].reset_index(drop=True), ax=ax)
    ax.set_title(f"Distribution of {f}")
    plt.show()

In [1]:
# Show the distribution of every numerical feature, excluding the ID column.
num_feat = [feat for feat in df.columns if df[feat].dtypes !='O' and feat != 'ID']
# The label previously read "Categorical features:", which mislabelled this list.
print("Numerical features:",num_feat)
for f in num_feat:
    fig, ax = plt.subplots(figsize=(10,4))
    # NOTE(review): distplot is deprecated in recent seaborn releases; the
    # documented replacement is histplot(..., kde=True, stat="density").
    # Kept as-is so the cell still runs on the seaborn version this notebook
    # was written against - confirm the installed version before switching.
    sns.distplot(df[f],ax=ax)
    ax.set_title(f"Distribution of {f}")
    plt.show()

In [1]:
#df[num_feat] = np.log(df[num_feat])

In [1]:
# Impute missing values on the combined frame, one rule per column:
#   Work_Experience -> its most frequent value
#   Ever_Married / Graduated / Profession -> an explicit 'Missing' level
#   Family_Size -> 0
#   Var_1 -> 'Cat_0'
fill_values = {
    'Work_Experience': df['Work_Experience'].mode()[0],
    'Ever_Married': 'Missing',
    'Graduated': 'Missing',
    'Profession': 'Missing',
    'Family_Size': 0,
    'Var_1': 'Cat_0',
}
for column, filler in fill_values.items():
    df[column] = df[column].fillna(filler)
df.head()

In [1]:
# Derived features: for each new column, the number of distinct values of
# `counted` observed within each group of `grouped_by`.
derived_features = {
    'Unique_profession_per_agegroup': ('Age', 'Profession'),
    'Unique_agegroup_per_profession': ('Profession', 'Age'),
    'Age_Family_size': ('Age', 'Family_Size'),
}
for new_column, (grouped_by, counted) in derived_features.items():
    df[new_column] = df.groupby([grouped_by])[counted].transform('nunique')
df.head()

In [1]:
# Number of missing values remaining in each column of the combined frame.
df.isna().sum()

In [1]:
#df.Segmentation.unique()

In [1]:
#Encoding Category Variables using frequency encoding
def frequency_encoding(col, frame=None):
    """Replace each value of ``col`` with its relative frequency, in place.

    Parameters
    ----------
    col : str
        Name of the column to encode.
    frame : pandas.DataFrame, optional
        Frame to modify. When omitted, the notebook-level ``df`` is used,
        which matches the original single-argument behaviour.

    Returns
    -------
    None
        The column is overwritten on the target frame.

    Notes
    -----
    A level's frequency is its row count divided by the total number of rows
    in the frame. Missing values stay missing instead of raising ``KeyError``,
    and the lookup is vectorised rather than evaluated row by row.
    """
    target = df if frame is None else frame
    level_share = target[col].value_counts() / len(target)
    target[col] = target[col].map(level_share)
    
# Frequency-encode every object-dtype column except the target.
for col in list(df.select_dtypes(include=['object']).columns):
    if col!='Segmentation':
        frequency_encoding(col)

# Map the segment letters to numeric class codes. The guard keeps this cell
# re-runnable: mapping a column that has already been converted to numbers
# would turn every label into NaN.
if not pd.api.types.is_numeric_dtype(df['Segmentation']):
    df['Segmentation'] = df['Segmentation'].map({'A':1,'B':2,'C':3,'D':4})

In [1]:
# Preview the combined frame after encoding.
df.head()

In [1]:
# Split the combined frame back into its train and test parts by position
# (train rows were stacked first). The boundary comes from the training
# frame's own length rather than a hard-coded row count, so the split stays
# correct if the input files change. Re-running is safe: the rebuilt `train`
# has the same number of rows.
n_train = len(train)
train = df.iloc[:n_train]
test = df.iloc[n_train:]
print(train.shape,test.shape)

In [1]:
## According to the original note, about 90% of the data overlaps between the
## train and test sets, so the IDs common to both are separated out here.
## For those test IDs the known segment from the training data is reused;
## only test IDs that do not match any training ID are predicted by the models.
known_segments = train[['ID', 'Segmentation']]
test_ids = test[['ID']]
submission_df = known_segments.merge(test_ids, on='ID', how='inner')
print(submission_df.shape)
submission_df.head()

In [1]:
### Get the test IDs that are not found in the training data.
### A direct membership test replaces concat + drop_duplicates(keep=False),
### which would also discard any unseen ID that occurs more than once in the
### test data and leave it without a prediction.
unseen_mask = ~test['ID'].isin(submission_df['ID'])
md_df = test.loc[unseen_mask, ['ID']].drop_duplicates()
print(md_df.shape)
md_df.head()

In [1]:
### Attach the full set of columns to the unseen test IDs by joining them
### back onto the test frame.
test_df = md_df.merge(test, on='ID', how='inner')
test_df

In [1]:
### Build the inputs for prediction with the ensemble methods.
non_feature_columns = ['ID', 'Segmentation']

# Keep a copy that still carries the IDs; it is needed to assemble the submission.
test_c = test_df.copy()
# NOTE: `test` is rebound here to the feature matrix of the unseen test rows.
test = test_df.drop(columns=non_feature_columns)

X_train = train.drop(columns=non_feature_columns)
y_train = train['Segmentation']

In [1]:
# Column dtypes and non-null counts of the training feature matrix.
X_train.info()

In [1]:
# from sklearn.linear_model import Lasso
# from sklearn.feature_selection import SelectFromModel
# model = SelectFromModel(Lasso(alpha=0.005,random_state=0))
# model.fit(X_train,y_train)
# model.get_support()

In [1]:
# imp = ['Ever_Married','Age','Graduated','Profession','Family_Size']

In [1]:
# from sklearn.model_selection import train_test_split
# X_train,X_test,y_train,y_test= train_test_split(X,y,test_size=0.3,random_state=0)

In [1]:
# feature1 = ['Gender','Ever_Married','Work_Experience']
# feature2 = ['Age','Graduated','Profession']
# feature3 = ['Spending_Score','Family_Size','Var_1']

In [1]:
# X_train1 = X_train[feature1 + feature2]
# X_train2 = X_train[feature2 + feature3]
# X_train3 = X_train[feature1 + feature3]
# X_test1 = test[feature1 + feature2]
# X_test2 = test[feature2 + feature3]
# X_test3 = test[feature1 + feature3]

### XGBoost

In [1]:
# Gradient-boosted trees (XGBoost) fitted on the full training set.
# `verbosity=0` silences XGBoost's own logging. The previous `verbose=0` is not
# an XGBClassifier constructor parameter, so it did not control logging.
# XGBClassifier is already imported at the top of the notebook, so the extra
# in-cell `import xgboost` is dropped.
# NOTE(review): recent XGBoost releases expect class labels 0..k-1; the 1..4
# codes used here may need re-basing there - confirm against the installed version.
XGB = XGBClassifier(booster='gbtree',verbosity=0,learning_rate=0.07,max_depth=8,objective='multi:softmax',
                  n_estimators=1000,seed=294)

XGB.fit(X_train, y_train)
# Predicting the Test set results
y_pred_XGB = XGB.predict(test)
# Accuracy measured on the training data itself, not on held-out data.
acc_XGB = round(XGB.score(X_train, y_train) * 100, 2)
print ("Train Accuracy: " + str(acc_XGB) + '%')

### LIGHTGBM

In [1]:
# LightGBM classifier; the hyper-parameters are collected in one mapping.
lgb_params = dict(
    boosting_type='gbdt',
    max_depth=10,
    learning_rate=0.09,
    objective='multiclass',
    reg_alpha=0,
    reg_lambda=1,
    n_jobs=-1,
    random_state=100,
    n_estimators=1000,
)
LGB = LGBMClassifier(**lgb_params)
LGB.fit(X_train, y_train)

# Predict segments for the test feature matrix.
y_pred_LGB = LGB.predict(test)

# Accuracy measured on the training data itself, not on held-out data.
lgb_train_score = LGB.score(X_train, y_train)
acc_LGB = round(lgb_train_score * 100, 2)
print ("Train Accuracy: " + str(acc_LGB) + '%')

### CATBoost

In [1]:
from catboost import CatBoostClassifier

# CatBoost classifier; the hyper-parameters are collected in one mapping.
catb_params = dict(
    learning_rate=0.05,
    depth=8,
    boosting_type='Plain',
    eval_metric='Accuracy',
    n_estimators=1000,
    random_state=294,
)
CATB = CatBoostClassifier(**catb_params)
CATB.fit(X_train, y_train)

# Predict segments for the test feature matrix.
y_pred_CATB = CATB.predict(test)

# Accuracy measured on the training data itself, not on held-out data.
catb_train_score = CATB.score(X_train, y_train)
acc_CATB = round(catb_train_score * 100, 2)
print ("Train Accuracy: " + str(acc_CATB) + '%')

In [1]:
# from sklearn.ensemble import RandomForestClassifier, VotingClassifier
# votingC = VotingClassifier(estimators=[('XGB_1',classifier1),('XGB_2',classifier2),('XGB_3',classifier3)], voting='soft', n_jobs=4)

# votingC = votingC.fit(X_train, y)

In [1]:
# vote = votingC.predict(test)
# submission_v = pd.DataFrame({
#         "ID": test_c["ID"],
#         "Segmentation":vote
#     })

# submission['Segmentation'] = submission['Segmentation'].map({1:'A',2:'B',3:'C',4:'D'})
# submission.to_csv('cust_submission.csv', index=False)

### Random forest 

In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import GridSearchCV

In [1]:
# import numpy as np
# from sklearn.model_selection import RandomizedSearchCV
# # Number of trees in random forest
# n_estimators = [int(x) for x in np.linspace(start = 200, stop = 1000, num = 5)]
# # Number of features to consider at every split
# max_features = ['auto', 'sqrt','log2']
# # Maximum number of levels in tree
# max_depth = [int(x) for x in np.linspace(10, 50, 5)]
# # Minimum number of samples required to split a node
# min_samples_split = [2, 5, 10]
# # Minimum number of samples required at each leaf node
# min_samples_leaf = [1, 2,4]
# # Create the random grid
# random_grid = {'n_estimators': n_estimators,
#                'max_features': max_features,
#                'max_depth': max_depth,
#                'min_samples_split': min_samples_split,
#                'min_samples_leaf': min_samples_leaf,
#               'criterion':['entropy','gini']}
# print(random_grid)

# rf=RandomForestClassifier()
# rf_randomcv=RandomizedSearchCV(estimator=rf,param_distributions=random_grid,n_iter=50,cv=3,verbose=2,
#                                random_state=100,n_jobs=-1)
# ### fit the randomized model
# rf_randomcv.fit(X_train,y_train)

In [1]:
#rf_randomcv.best_params_

In [1]:
#best_random_grid=rf_randomcv.best_estimator_
#best_random_grid

In [1]:
#rfc = RandomForestClassifier(n_estimators=1000,max_depth=20,random_state=9,verbose=1)
#rfc.fit(X_train,y_train)
#y_pred_rfc = rfc.predict(test)
#score = rfc.score(X_train, y_train)
#print(score)
# rfc = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
#             max_depth=20, max_features='auto', max_leaf_nodes=None,
#             min_impurity_decrease=0.0, min_impurity_split=None,
#             min_samples_leaf=4, min_samples_split=10,
#             min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=1,
#             oob_score=False, random_state=None, verbose=0,
#             warm_start=False)
# rfc.fit(X_train,y_train)
# y_pred_rfc = best_random_grid.predict(test)

### K-Nearest Neighbor

In [1]:
# #Import knearest neighbors Classifier model
# from sklearn.metrics import classification_report, confusion_matrix
# from sklearn.model_selection import cross_val_score
# from sklearn.neighbors import KNeighborsClassifier

# #Create KNN Classifier
# knn = KNeighborsClassifier(n_neighbors=4)

# #Train the model using the training sets
# knn.fit(X_train, y_train)

# #Predict the response for test dataset
# y_pred_knn = knn.predict(test)

In [1]:
# accuracy_rate = []
# for i in range(1,40):
#     knn = KNeighborsClassifier(n_neighbors=i)
#     score = cross_val_score(knn,X_train, y_train,cv=10)
#     accuracy_rate.append(score.mean())
# accuracy_rate

In [1]:
# error_rate = []
# for i in range(1,40):
#     knn = KNeighborsClassifier(n_neighbors=i)
#     score = cross_val_score(knn,X_train, y_train,cv=10)
#     error_rate.append(1-score.mean())  
# error_rate

In [1]:
# plt.figure(figsize=(10,6))
# plt.plot(range(1,40),error_rate,color='blue', linestyle='dashed', marker='o',
#         markerfacecolor='red', markersize=10)
# #plt.plot(range(1,40),accuracy_rate,color='blue', linestyle='dashed', marker='o',
# #         markerfacecolor='red', markersize=10)
# plt.title('Error Rate vs. K Value')
# plt.xlabel('K')
# plt.ylabel('Error Rate')

In [1]:
# #Create KNN Classifier
# knn = KNeighborsClassifier(n_neighbors=10)

# #Train the model using the training sets
# knn.fit(X_train, y_train)

# #Predict the response for test dataset
# y_pred_knn = knn.predict(test)

### Ensemble of XGBoost, CATBoost, LightGBM

In [1]:
# Majority vote over the three fitted models: one column of predictions per
# model (CatBoost, XGBoost, LightGBM), then the row-wise mode.
member_predictions = [pd.DataFrame(model.predict(test)) for model in (CATB, XGB, LGB)]
d = pd.concat([pd.DataFrame(), *member_predictions], axis=1)
d.columns = ['1', '2', '3']

# Column 0 of the row-wise mode is the winning class code. When all three
# models disagree, the candidates are returned in sorted order, so the lowest
# class code is chosen.
re = d.mode(axis=1)[0]
re.head()

In [1]:
## Create the submission data frame.
# Numeric class codes are translated back to the segment letters.
code_to_segment = {1.0:'A',2.0:'B',3.0:'C',4.0:'D'}

# Ensemble predictions for the test IDs that were not found in the training data.
predicted_rows = pd.DataFrame({"ID": test_c["ID"], "Segmentation": re})

# Rows with segments reused from the training data come first, followed by
# the model predictions.
submission = pd.concat([submission_df, predicted_rows])
submission['Segmentation'] = submission['Segmentation'].map(code_to_segment)
submission.to_csv('Customer_Segmentation_submission.csv', index=False)

In [1]:
# Number of submitted customers in each segment.
submission['Segmentation'].value_counts()