In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import KFold, cross_val_score
from sklearn.ensemble import BaggingClassifier,RandomForestClassifier, AdaBoostClassifier, StackingClassifier

#### About the data:

Let’s consider a Company dataset with around 10 variables and 400 records. 

    The attributes are as follows: 
         Sales -- Unit sales (in thousands) at each location
         Competitor Price -- Price charged by competitor at each location
         Income -- Community income level (in thousands of dollars)
         Advertising -- Local advertising budget for company at each location (in thousands of dollars)
         Population -- Population size in region (in thousands)
         Price -- Price company charges for car seats at each site
         Shelf Location at stores -- A factor with levels Bad, Good and Medium indicating the quality of the shelving location for the car seats at each site
         Age -- Average age of the local population
         Education -- Education level at each location
         Urban -- A factor with levels No and Yes to indicate whether the store is in an urban or rural location
         US -- A factor with levels No and Yes to indicate whether the store is in the US or not
 
 
Problem Statement:
A cloth manufacturing company is interested to know about the segment or attributes causes high sale. 
Approach - A Random Forest can be built with target variable Sales (we will first convert it in categorical variable) & all other variable will be independent in the analysis. 

In [2]:
rawData = pd.read_csv('Company_Data.csv')
rawData

Unnamed: 0,Sales,CompPrice,Income,Advertising,Population,Price,ShelveLoc,Age,Education,Urban,US
0,9.50,138,73,11,276,120,Bad,42,17,Yes,Yes
1,11.22,111,48,16,260,83,Good,65,10,Yes,Yes
2,10.06,113,35,10,269,80,Medium,59,12,Yes,Yes
3,7.40,117,100,4,466,97,Medium,55,14,Yes,Yes
4,4.15,141,64,3,340,128,Bad,38,13,Yes,No
...,...,...,...,...,...,...,...,...,...,...,...
395,12.57,138,108,17,203,128,Good,33,14,Yes,Yes
396,6.14,139,23,3,37,120,Medium,55,11,No,Yes
397,7.41,162,26,12,368,159,Medium,40,18,Yes,Yes
398,5.94,100,79,7,284,95,Bad,50,12,Yes,Yes


In [3]:
rawData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Sales        400 non-null    float64
 1   CompPrice    400 non-null    int64  
 2   Income       400 non-null    int64  
 3   Advertising  400 non-null    int64  
 4   Population   400 non-null    int64  
 5   Price        400 non-null    int64  
 6   ShelveLoc    400 non-null    object 
 7   Age          400 non-null    int64  
 8   Education    400 non-null    int64  
 9   Urban        400 non-null    object 
 10  US           400 non-null    object 
dtypes: float64(1), int64(7), object(3)
memory usage: 34.5+ KB


In [4]:
data = rawData.copy(deep=True)
data

Unnamed: 0,Sales,CompPrice,Income,Advertising,Population,Price,ShelveLoc,Age,Education,Urban,US
0,9.50,138,73,11,276,120,Bad,42,17,Yes,Yes
1,11.22,111,48,16,260,83,Good,65,10,Yes,Yes
2,10.06,113,35,10,269,80,Medium,59,12,Yes,Yes
3,7.40,117,100,4,466,97,Medium,55,14,Yes,Yes
4,4.15,141,64,3,340,128,Bad,38,13,Yes,No
...,...,...,...,...,...,...,...,...,...,...,...
395,12.57,138,108,17,203,128,Good,33,14,Yes,Yes
396,6.14,139,23,3,37,120,Medium,55,11,No,Yes
397,7.41,162,26,12,368,159,Medium,40,18,Yes,Yes
398,5.94,100,79,7,284,95,Bad,50,12,Yes,Yes


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Sales        400 non-null    float64
 1   CompPrice    400 non-null    int64  
 2   Income       400 non-null    int64  
 3   Advertising  400 non-null    int64  
 4   Population   400 non-null    int64  
 5   Price        400 non-null    int64  
 6   ShelveLoc    400 non-null    object 
 7   Age          400 non-null    int64  
 8   Education    400 non-null    int64  
 9   Urban        400 non-null    object 
 10  US           400 non-null    object 
dtypes: float64(1), int64(7), object(3)
memory usage: 34.5+ KB


In [6]:
data.ShelveLoc = data.ShelveLoc.map({'Bad':0,'Medium':1,'Good':2})
data.head()

Unnamed: 0,Sales,CompPrice,Income,Advertising,Population,Price,ShelveLoc,Age,Education,Urban,US
0,9.5,138,73,11,276,120,0,42,17,Yes,Yes
1,11.22,111,48,16,260,83,2,65,10,Yes,Yes
2,10.06,113,35,10,269,80,1,59,12,Yes,Yes
3,7.4,117,100,4,466,97,1,55,14,Yes,Yes
4,4.15,141,64,3,340,128,0,38,13,Yes,No


In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Sales        400 non-null    float64
 1   CompPrice    400 non-null    int64  
 2   Income       400 non-null    int64  
 3   Advertising  400 non-null    int64  
 4   Population   400 non-null    int64  
 5   Price        400 non-null    int64  
 6   ShelveLoc    400 non-null    int64  
 7   Age          400 non-null    int64  
 8   Education    400 non-null    int64  
 9   Urban        400 non-null    object 
 10  US           400 non-null    object 
dtypes: float64(1), int64(8), object(2)
memory usage: 34.5+ KB


In [8]:
data = pd.get_dummies(data)
data.head()

Unnamed: 0,Sales,CompPrice,Income,Advertising,Population,Price,ShelveLoc,Age,Education,Urban_No,Urban_Yes,US_No,US_Yes
0,9.5,138,73,11,276,120,0,42,17,0,1,0,1
1,11.22,111,48,16,260,83,2,65,10,0,1,0,1
2,10.06,113,35,10,269,80,1,59,12,0,1,0,1
3,7.4,117,100,4,466,97,1,55,14,0,1,0,1
4,4.15,141,64,3,340,128,0,38,13,0,1,1,0


In [9]:
X = data.iloc[:,1:].values

In [10]:
X

array([[138,  73,  11, ...,   1,   0,   1],
       [111,  48,  16, ...,   1,   0,   1],
       [113,  35,  10, ...,   1,   0,   1],
       ...,
       [162,  26,  12, ...,   1,   0,   1],
       [100,  79,   7, ...,   1,   0,   1],
       [134,  37,   0, ...,   1,   0,   1]], dtype=int64)

In [11]:
Y = data.Sales.copy(deep=True)
print(Y.dtype)
Y

float64


0       9.50
1      11.22
2      10.06
3       7.40
4       4.15
       ...  
395    12.57
396     6.14
397     7.41
398     5.94
399     9.71
Name: Sales, Length: 400, dtype: float64

In [12]:
Y.mean()

7.496325

In [13]:
for i in range(len(Y)):
    if Y[i] <= 7.496325:
        Y[i] = 'low'
    else:
        Y[i] = 'high'

In [14]:
Y

0      high
1      high
2      high
3       low
4       low
       ... 
395    high
396     low
397     low
398     low
399    high
Name: Sales, Length: 400, dtype: object

In [15]:
Y.value_counts()

low     201
high    199
Name: Sales, dtype: int64

In [16]:
Y = Y.values

## BaggingClassifier

In [17]:
Bcl = BaggingClassifier()
kfold = KFold(n_splits=5)
results = cross_val_score(Bcl,X,Y,cv=kfold,verbose=True)
results

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.1s finished


array([0.85  , 0.7375, 0.8375, 0.7125, 0.825 ])

In [18]:
results.mean(),results.std()

(0.7924999999999999, 0.05623610939600994)

In [19]:
Bcl.fit(X,Y)
Bcl.score(X,Y)

0.9925

In [20]:
confusion_matrix(Y,Bcl.predict(X))

array([[199,   0],
       [  3, 198]], dtype=int64)

In [21]:
Bcl.n_features_

12

In [22]:
Bcl.n_features_in_

12

Accuracy of baggingClassifier is good but we can not extract feature importances

## RandomForest

In [23]:
kfold = KFold(n_splits=5)
RFC = RandomForestClassifier(n_estimators=100,bootstrap=True,class_weight=None,random_state=None,max_features='auto')
results2 = cross_val_score(RFC,X,Y,cv=kfold)

In [24]:
results2

array([0.875 , 0.7375, 0.825 , 0.8   , 0.825 ])

In [25]:
results2.mean(),results2.std()

(0.8125, 0.04472135954999577)

In [26]:
RFC.fit(X,Y)
RFC.feature_importances_

array([0.11384291, 0.0925402 , 0.09626579, 0.08207381, 0.25427389,
       0.14364525, 0.12296124, 0.04992886, 0.01218611, 0.01139178,
       0.01053804, 0.01035214])

In [27]:
RFC.score(X,Y)

1.0

In [28]:
confusion_matrix(Y,RFC.predict(X))

array([[199,   0],
       [  0, 201]], dtype=int64)

In [29]:
for i,b in enumerate(data.columns[1:]):
    print(i,b,f'           -- {RFC.feature_importances_[i]:.2f}%')

0 CompPrice            -- 0.11%
1 Income            -- 0.09%
2 Advertising            -- 0.10%
3 Population            -- 0.08%
4 Price            -- 0.25%
5 ShelveLoc            -- 0.14%
6 Age            -- 0.12%
7 Education            -- 0.05%
8 Urban_No            -- 0.01%
9 Urban_Yes            -- 0.01%
10 US_No            -- 0.01%
11 US_Yes            -- 0.01%


## Conclusion

    Price, ShelveLoc, Age&CompPrice influence the sales.

## Using PPSCORE to calculate Feature Importance

In [30]:
import ppscore

In [31]:
pps = ppscore.matrix(data)
pps

Unnamed: 0,x,y,ppscore,case,is_valid_score,metric,baseline_score,model_score,model
0,Sales,Sales,1.0,predict_itself,True,,0.0000,1.000000,
1,Sales,CompPrice,0.0,regression,True,mean absolute error,12.2150,17.223333,DecisionTreeRegressor()
2,Sales,Income,0.0,regression,True,mean absolute error,23.6325,30.317083,DecisionTreeRegressor()
3,Sales,Advertising,0.0,regression,True,mean absolute error,5.7300,6.332500,DecisionTreeRegressor()
4,Sales,Population,0.0,regression,True,mean absolute error,127.0450,165.681250,DecisionTreeRegressor()
...,...,...,...,...,...,...,...,...,...
164,US_Yes,Education,0.0,regression,True,mean absolute error,2.2850,2.304601,DecisionTreeRegressor()
165,US_Yes,Urban_No,0.0,regression,True,mean absolute error,0.2950,0.416724,DecisionTreeRegressor()
166,US_Yes,Urban_Yes,0.0,regression,True,mean absolute error,0.2950,0.416724,DecisionTreeRegressor()
167,US_Yes,US_No,1.0,regression,True,mean absolute error,0.3550,0.000000,DecisionTreeRegressor()


In [32]:
pps[pps.y == 'Sales']

Unnamed: 0,x,y,ppscore,case,is_valid_score,metric,baseline_score,model_score,model
0,Sales,Sales,1.0,predict_itself,True,,0.0,1.0,
13,CompPrice,Sales,0.0,regression,True,mean absolute error,2.262175,2.725607,DecisionTreeRegressor()
26,Income,Sales,0.0,regression,True,mean absolute error,2.262175,2.680521,DecisionTreeRegressor()
39,Advertising,Sales,0.0,regression,True,mean absolute error,2.262175,2.307301,DecisionTreeRegressor()
52,Population,Sales,0.0,regression,True,mean absolute error,2.262175,3.174025,DecisionTreeRegressor()
65,Price,Sales,0.0,regression,True,mean absolute error,2.262175,2.357588,DecisionTreeRegressor()
78,ShelveLoc,Sales,0.15029,regression,True,mean absolute error,2.262175,1.922194,DecisionTreeRegressor()
91,Age,Sales,0.0,regression,True,mean absolute error,2.262175,2.493357,DecisionTreeRegressor()
104,Education,Sales,0.0,regression,True,mean absolute error,2.262175,2.319058,DecisionTreeRegressor()
117,Urban_No,Sales,0.0,regression,True,mean absolute error,2.262175,2.277412,DecisionTreeRegressor()


In [33]:
data

Unnamed: 0,Sales,CompPrice,Income,Advertising,Population,Price,ShelveLoc,Age,Education,Urban_No,Urban_Yes,US_No,US_Yes
0,9.50,138,73,11,276,120,0,42,17,0,1,0,1
1,11.22,111,48,16,260,83,2,65,10,0,1,0,1
2,10.06,113,35,10,269,80,1,59,12,0,1,0,1
3,7.40,117,100,4,466,97,1,55,14,0,1,0,1
4,4.15,141,64,3,340,128,0,38,13,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,12.57,138,108,17,203,128,2,33,14,0,1,0,1
396,6.14,139,23,3,37,120,1,55,11,1,0,0,1
397,7.41,162,26,12,368,159,1,40,18,0,1,0,1
398,5.94,100,79,7,284,95,0,50,12,0,1,0,1


In [34]:
for i in range(len(data.Sales)):
    if data.Sales.loc[i] <= 7.496325:
        data.Sales.loc[i] = 'low'
    else:
        data.Sales.loc[i] = 'high'
        
data

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


Unnamed: 0,Sales,CompPrice,Income,Advertising,Population,Price,ShelveLoc,Age,Education,Urban_No,Urban_Yes,US_No,US_Yes
0,high,138,73,11,276,120,0,42,17,0,1,0,1
1,high,111,48,16,260,83,2,65,10,0,1,0,1
2,high,113,35,10,269,80,1,59,12,0,1,0,1
3,low,117,100,4,466,97,1,55,14,0,1,0,1
4,low,141,64,3,340,128,0,38,13,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,high,138,108,17,203,128,2,33,14,0,1,0,1
396,low,139,23,3,37,120,1,55,11,1,0,0,1
397,low,162,26,12,368,159,1,40,18,0,1,0,1
398,low,100,79,7,284,95,0,50,12,0,1,0,1


In [35]:
pps1 = ppscore.matrix(data,sorted=True)
pps1

Unnamed: 0,x,y,ppscore,case,is_valid_score,metric,baseline_score,model_score,model
0,Sales,Sales,1.0,predict_itself,True,,0.0000,1.000000,
1,CompPrice,CompPrice,1.0,predict_itself,True,,0.0000,1.000000,
2,Income,Income,1.0,predict_itself,True,,0.0000,1.000000,
3,Advertising,Advertising,1.0,predict_itself,True,,0.0000,1.000000,
4,Population,Population,1.0,predict_itself,True,,0.0000,1.000000,
...,...,...,...,...,...,...,...,...,...
164,US_Yes,ShelveLoc,0.0,regression,True,mean absolute error,0.4525,0.479758,DecisionTreeRegressor()
165,US_Yes,Age,0.0,regression,True,mean absolute error,13.8775,13.956625,DecisionTreeRegressor()
166,US_Yes,Education,0.0,regression,True,mean absolute error,2.2850,2.304601,DecisionTreeRegressor()
167,US_Yes,Urban_No,0.0,regression,True,mean absolute error,0.2950,0.416724,DecisionTreeRegressor()


In [36]:
pps1[pps1.y == 'Sales']

Unnamed: 0,x,y,ppscore,case,is_valid_score,metric,baseline_score,model_score,model
0,Sales,Sales,1.0,predict_itself,True,,0.0,1.0,
21,ShelveLoc,Sales,0.18399,classification,True,weighted F1,0.53,0.616475,DecisionTreeClassifier()
22,Advertising,Sales,0.142484,classification,True,weighted F1,0.53,0.596967,DecisionTreeClassifier()
23,Price,Sales,0.138027,classification,True,weighted F1,0.53,0.594873,DecisionTreeClassifier()
26,US_No,Sales,0.057244,classification,True,weighted F1,0.53,0.556905,DecisionTreeClassifier()
27,US_Yes,Sales,0.057244,classification,True,weighted F1,0.53,0.556905,DecisionTreeClassifier()
30,Age,Sales,0.025014,classification,True,weighted F1,0.53,0.541757,DecisionTreeClassifier()
47,CompPrice,Sales,0.0,classification,True,weighted F1,0.53,0.453366,DecisionTreeClassifier()
58,Income,Sales,0.0,classification,True,weighted F1,0.53,0.50486,DecisionTreeClassifier()
78,Population,Sales,0.0,classification,True,weighted F1,0.53,0.410668,DecisionTreeClassifier()


### Conclusion for PPSCORE

PPS score suggests that 'ShelveLoc', 'Advertising', 'Price' as prominent features to predict the Sales, meaning those are the features which the company has to focus if they are to increase their sales.

## Feature Importance using RFE

In [37]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression,LogisticRegressionCV

In [38]:
model = LogisticRegression(max_iter=1000)
rfe = RFE(model,n_features_to_select=3)
fit = rfe.fit(X,Y)

In [39]:
fit.ranking_

array([ 5,  9,  3, 10,  4,  1,  7,  6,  2,  8,  1,  1])

In [40]:
print('Ranking          Feature name')
for i,b in enumerate(data.columns[1:]):
    print('  ',fit.ranking_[i],'             ', b,)

Ranking          Feature name
   5               CompPrice
   9               Income
   3               Advertising
   10               Population
   4               Price
   1               ShelveLoc
   7               Age
   6               Education
   2               Urban_No
   8               Urban_Yes
   1               US_No
   1               US_Yes


### Conclusion based on RFE

RFE using logistic regression identifies 'ShelveLoc', 'US_NO', 'US_Yes' as most important features. note that US_NO and US_Yes are both same columns. only one need to be considered.

## Feature Importance using SelectKbest

In [41]:
from numpy import set_printoptions
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

In [42]:
test = SelectKBest(score_func=chi2,k=4)
fit = test.fit(X,Y)

In [43]:
set_printoptions(precision=3,floatmode='fixed')
print(fit.scores_)

[9.503e-01 6.938e+01 1.688e+02 1.573e+02 2.972e+02 2.940e+01 6.846e+01
 1.640e-01 6.253e-01 2.617e-01 5.245e+00 2.887e+00]


In [44]:
for i,b in enumerate(data.columns[1:]):
    print(f'{fit.scores_[i]:.3f}','    ',b)

0.950      CompPrice
69.378      Income
168.777      Advertising
157.270      Population
297.238      Price
29.402      ShelveLoc
68.459      Age
0.164      Education
0.625      Urban_No
0.262      Urban_Yes
5.245      US_No
2.887      US_Yes


## Conclusion based on Chi2 SelectKbest method

According to chi2 test of independence we can say that 'Price', 'Advertising', 'Population' are the features that have influence on Sales.

# Final Coments


All in all all the tests give different results. we can always take the mejority selected features as best features of the features that have high influence on sales.

In this case we can say that ShelveLoc, Price, are the two most important factors in influencing sales. Advertising and Income could be next.