__DATA WRANGLING__

In [1]:
#load libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
#load dataset and assing it to a dataframe
dfmallcustomer= pd.read_csv("datasets/Mall_Customers.csv")

In [3]:
#check the first five data points
dfmallcustomer.head()

Unnamed: 0,CustomerID,Gender,Age,Annual Income (k$),Spending Score (1-100)
0,1,Male,19,15,39
1,2,Male,21,15,81
2,3,Female,20,16,6
3,4,Female,23,16,77
4,5,Female,31,17,40


Appearently there are five columns but CustomerID is label variable so that I will not include that in my analysis. 

In [4]:
#I want to explore each variable. First Gender
dfmallcustomer['Gender'].value_counts(dropna=False, normalize=True)

Female    0.56
Male      0.44
Name: Gender, dtype: float64

There is no null variable within Gender. 56% of the customers are Female. Now, I want to check the shape of the dataframe before exploring other variables.

In [5]:
dfmallcustomer.shape

(200, 5)

In [6]:
# it seems there are 200 data points. Now, I want to check the distribution of customer ages. 
dfmallcustomer['Age'].value_counts(dropna=False, normalize=True)

32    0.055
35    0.045
19    0.040
31    0.040
30    0.035
49    0.035
27    0.030
47    0.030
40    0.030
23    0.030
36    0.030
38    0.030
50    0.025
48    0.025
29    0.025
21    0.025
20    0.025
34    0.025
18    0.020
28    0.020
59    0.020
24    0.020
67    0.020
54    0.020
39    0.015
25    0.015
33    0.015
22    0.015
37    0.015
43    0.015
68    0.015
45    0.015
46    0.015
60    0.015
41    0.010
57    0.010
66    0.010
65    0.010
63    0.010
58    0.010
26    0.010
70    0.010
42    0.010
53    0.010
52    0.010
51    0.010
44    0.010
55    0.005
64    0.005
69    0.005
56    0.005
Name: Age, dtype: float64

In [7]:
#I want to check the maximum and minimum values as well.
print('Maximum age: ', dfmallcustomer['Age'].max())
print('Minimum age: ', dfmallcustomer['Age'].min())

Maximum age:  70
Minimum age:  18


In [9]:
# Customer ages range between 18 and 70. I want to categorize customers based on their ages. 
#I will create labels for different ranges
dfmallcustomer['Age'] = pd.cut(dfmallcustomer.Age, bins=[0, 25, 35, 60, 70],labels=['Young Adult','Adult',
                                                                                     'Middle Age','Older Adult'])

In [10]:
dfmallcustomer.head()

Unnamed: 0,CustomerID,Gender,Age,Annual Income (k$),Spending Score (1-100)
0,1,Male,Young Adult,15,39
1,2,Male,Young Adult,15,81
2,3,Female,Young Adult,16,6
3,4,Female,Young Adult,16,77
4,5,Female,Adult,17,40


In [11]:
#re-check the distributions of different ages
dfmallcustomer['Age'].value_counts(dropna=False, normalize=True)

Middle Age     0.425
Adult          0.300
Young Adult    0.190
Older Adult    0.085
Name: Age, dtype: float64

In [12]:
# most of the customers are middle ages who are between 35-60 years old, and adults who are between 
# 25-35 years old. Therefore, I can say that 72% of the customers are older than 25 and younger than 60 years old. 
#now, I will check the annual incomes

dfmallcustomer['Annual Income (k$)'].value_counts(dropna=False, normalize=True)

54     0.06
78     0.06
60     0.03
87     0.03
62     0.03
       ... 
61     0.01
126    0.01
59     0.01
58     0.01
15     0.01
Name: Annual Income (k$), Length: 64, dtype: float64

In [14]:
#Check the max and min annual incomes.

print("Maximum annual income: ", dfmallcustomer['Annual Income (k$)'].max())
print("Minimum annual income: ", dfmallcustomer['Annual Income (k$)'].min())

Maximum annual income:  137
Minimum annual income:  15


In [15]:
#minimum annual income is 15k while the maximum one is 137k.I want to group them as well based on their income
#The categories I will use are: lower, lower middle, middle, upper middle, and upper
dfmallcustomer['Annual Income (k$)'] = pd.cut(dfmallcustomer['Annual Income (k$)'],
                                              bins=[0, 20, 35, 65, 85, 137],labels=['Lower','Lower Middle', 'Middle',
                                                                                     'Upper Middle','Upper'])

In [16]:
dfmallcustomer['Annual Income (k$)'].value_counts(normalize=True, dropna=False)

Middle          0.40
Upper Middle    0.24
Upper           0.17
Lower Middle    0.11
Lower           0.08
Name: Annual Income (k$), dtype: float64

In [17]:
#the results above shows that 64% of the customers are middle and upper middle groups
# Therefore, the annual income of them range between 35k-85k. 
#Finally, I want to explore spending scores
dfmallcustomer['Spending Score (1-100)'].value_counts(dropna=False, normalize=True)

42    0.040
55    0.035
46    0.030
73    0.030
35    0.025
      ...  
31    0.005
82    0.005
24    0.005
23    0.005
99    0.005
Name: Spending Score (1-100), Length: 84, dtype: float64

In [18]:
#I want to create 4 groups for the spending scores: 0-24 Low Score, 25-49 Good Score, 50-74 Very Good Score
# 75-100 Excellent Score
dfmallcustomer['Spending Score (1-100)'] = pd.cut(dfmallcustomer['Spending Score (1-100)'],
                                              bins=[0, 25, 50, 75, 100],labels=['Low Score','Good Score',
                                                                                    'Very Good Score','Excellent Score'])

In [19]:
#recheck the spencing scores
dfmallcustomer['Spending Score (1-100)'].value_counts(dropna=False, normalize=True)

Good Score         0.320
Very Good Score    0.295
Low Score          0.195
Excellent Score    0.190
Name: Spending Score (1-100), dtype: float64

In [20]:
dfmallcustomer.head()

Unnamed: 0,CustomerID,Gender,Age,Annual Income (k$),Spending Score (1-100)
0,1,Male,Young Adult,Lower,Good Score
1,2,Male,Young Adult,Lower,Excellent Score
2,3,Female,Young Adult,Lower,Low Score
3,4,Female,Young Adult,Lower,Excellent Score
4,5,Female,Adult,Lower,Good Score


In [21]:
#check one more time if there is any null values
dfmallcustomer.isnull().any().any()

False

In [23]:
#there is no null values. Most of the customers have good score or very good score.
#Finally, I want to save my dataset for further analysis.

dfmallcustomer.to_csv('./Datasets/dfmallcustomer.csv', index=False)
