<a href="https://colab.research.google.com/github/rena-ds/hypothesis-custpersonality/blob/main/Hypothesis_Testing_Concepts_on_Customer_Personality_Analysis_Dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#Import libraries
import pandas as pd
import os
import zipfile
import numpy as np
import scipy.stats as st
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
#Set directory for Kaggle file
os.environ['KAGGLE_CONFIG_DIR'] = os.path.expanduser('~/.kaggle')

#Command to download dataset
!kaggle datasets download -d "imakash3011/customer-personality-analysis"

#Extract ZIP file with zipfile
with zipfile.ZipFile("customer-personality-analysis.zip", 'r') as zip_ref:
    zip_ref.extractall()

#Read dataset file(CSV) and separate with tab \t
df = pd.read_csv("marketing_campaign.csv", sep = "\t")

Dataset URL: https://www.kaggle.com/datasets/imakash3011/customer-personality-analysis
License(s): CC0-1.0
customer-personality-analysis.zip: Skipping, found more recently modified local copy (use --force to force download)


In [3]:
#Display the first 5 rows and the last 5 rows of a DataFrame
df

Unnamed: 0,ID,Year_Birth,Education,Marital_Status,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,...,NumWebVisitsMonth,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Complain,Z_CostContact,Z_Revenue,Response
0,5524,1957,Graduation,Single,58138.0,0,0,04-09-2012,58,635,...,7,0,0,0,0,0,0,3,11,1
1,2174,1954,Graduation,Single,46344.0,1,1,08-03-2014,38,11,...,5,0,0,0,0,0,0,3,11,0
2,4141,1965,Graduation,Together,71613.0,0,0,21-08-2013,26,426,...,4,0,0,0,0,0,0,3,11,0
3,6182,1984,Graduation,Together,26646.0,1,0,10-02-2014,26,11,...,6,0,0,0,0,0,0,3,11,0
4,5324,1981,PhD,Married,58293.0,1,0,19-01-2014,94,173,...,5,0,0,0,0,0,0,3,11,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2235,10870,1967,Graduation,Married,61223.0,0,1,13-06-2013,46,709,...,5,0,0,0,0,0,0,3,11,0
2236,4001,1946,PhD,Together,64014.0,2,1,10-06-2014,56,406,...,7,0,0,0,1,0,0,3,11,0
2237,7270,1981,Graduation,Divorced,56981.0,0,0,25-01-2014,91,908,...,6,0,1,0,0,0,0,3,11,0
2238,8235,1956,Master,Together,69245.0,0,1,24-01-2014,8,428,...,3,0,0,0,0,0,0,3,11,0


In [4]:
#Displays the number of rows and columns of a DataFrame
df.shape

(2240, 29)

In [5]:
#Displays DataFrame structure information
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2240 entries, 0 to 2239
Data columns (total 29 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   ID                   2240 non-null   int64  
 1   Year_Birth           2240 non-null   int64  
 2   Education            2240 non-null   object 
 3   Marital_Status       2240 non-null   object 
 4   Income               2216 non-null   float64
 5   Kidhome              2240 non-null   int64  
 6   Teenhome             2240 non-null   int64  
 7   Dt_Customer          2240 non-null   object 
 8   Recency              2240 non-null   int64  
 9   MntWines             2240 non-null   int64  
 10  MntFruits            2240 non-null   int64  
 11  MntMeatProducts      2240 non-null   int64  
 12  MntFishProducts      2240 non-null   int64  
 13  MntSweetProducts     2240 non-null   int64  
 14  MntGoldProds         2240 non-null   int64  
 15  NumDealsPurchases    2240 non-null   i

In [6]:
#Displays the number of missing values ​​from each column
df.isna().sum()

Unnamed: 0,0
ID,0
Year_Birth,0
Education,0
Marital_Status,0
Income,24
Kidhome,0
Teenhome,0
Dt_Customer,0
Recency,0
MntWines,0


In [7]:
#Remove missing values ​​from 'Income'
df = df.dropna(subset = ['Income'])
df.isna().sum()

Unnamed: 0,0
ID,0
Year_Birth,0
Education,0
Marital_Status,0
Income,0
Kidhome,0
Teenhome,0
Dt_Customer,0
Recency,0
MntWines,0


# Probability mass function of NumWebVisitsMonth

In [8]:
#Calculate Slovin value
N = 2240
e = 0.05
slovin = N / (1 + N * e**2)
def slovin(N,e):
  return N / (1 + N * e**2)
slovin(N,e)

339.3939393939393

In [9]:
#Randomly took a subset of data consisting of 340 rows
df_sampel = df.sample(n = 340)
df_sampel

Unnamed: 0,ID,Year_Birth,Education,Marital_Status,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,...,NumWebVisitsMonth,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Complain,Z_CostContact,Z_Revenue,Response
1562,9530,1988,Graduation,Married,24645.0,1,0,01-11-2012,16,5,...,8,0,0,0,0,0,0,3,11,0
1449,1927,1973,2n Cycle,Together,69401.0,0,1,23-03-2014,41,399,...,2,0,0,0,0,0,0,3,11,0
418,4216,1981,Graduation,Single,91065.0,0,0,22-02-2013,33,822,...,3,0,0,1,1,0,0,3,11,1
1624,7019,1963,Graduation,Together,54414.0,1,1,23-01-2013,49,109,...,4,0,0,0,0,0,0,3,11,0
64,4137,1948,Graduation,Together,70666.0,0,0,06-12-2013,29,398,...,4,0,0,0,0,0,0,3,11,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
385,948,1971,Graduation,Single,10245.0,1,0,15-05-2013,32,4,...,5,0,0,0,0,0,0,3,11,0
1945,10868,1951,Graduation,Widow,70792.0,0,0,06-02-2013,82,344,...,3,0,0,0,0,0,0,3,11,0
1758,5247,1955,PhD,Married,38725.0,1,1,10-05-2014,52,31,...,4,0,0,0,0,0,0,3,11,0
1776,7433,1985,Graduation,Single,29760.0,1,0,29-08-2012,87,64,...,8,0,0,0,0,0,0,3,11,0


In [10]:
#Calculate probability mass function of NumWebVisitsMonth
pmf = len(df_sampel)/len(df)
pmf

0.15342960288808663

The probability mass function for NumWebVisitsMonth is 0.15178571428571427.

# Comparison of the probability mass function of a customer having 5 monthly web visits with a customer having more than 5 visits

In [11]:
#Calculate the total number of customers who have five monthly web visits
fivewebvisits = (df['NumWebVisitsMonth'] == 5).sum()
fivewebvisits

279

In [12]:
#Calculate the probability mass function of customers who have five monthly web visits
pmf_fivewebvisits = fivewebvisits / len(df)
pmf_fivewebvisits

0.1259025270758123

In [13]:
#Calculate the total number of customers who have more than five monthly web visits
overfivewebvisits = (df['NumWebVisitsMonth'] > 5).sum()
overfivewebvisits

1156

In [14]:
#Calculate the probability mass function of customers who have more than five monthly web visits
pmf_overfivewebvisits = overfivewebvisits / len(df)
pmf_overfivewebvisits

0.5216606498194946

The probability mass function of customers having more than 5 web visits (0.52166064981949467) is higher compared to customers having 5 web visits (0.12544642857142857) so it can be concluded that most customers are active in using the service.

# Hypothesis testing to prove whether education has an effect on income or not

H0: education has no effect on income <br>
H1: education has an effect on income

In [15]:
#Displays the number of unique values in 'Education'
df['Education'].value_counts()

Unnamed: 0_level_0,count
Education,Unnamed: 1_level_1
Graduation,1116
PhD,481
Master,365
2n Cycle,200
Basic,54


In [16]:
#Filter DataFrame by category in "Education"
df_graduation = df[df['Education']=='Graduation']
df_phd = df[df['Education'] == 'PhD']
df_master = df[df['Education'] == 'Master']
df_2ncycle = df[df['Education'] == '2n Cycle']
df_basic = df[df['Education'] == 'Basic']

In [17]:
#One-way ANOVA to test whether there is a significant difference in average income (Income) based on education level (Education)
anova_test = st.f_oneway(df_graduation['Income'],
                         df_phd['Income'],
                         df_master['Income'],
                         df_2ncycle['Income'],
                         df_basic['Income'])
anova_test

F_onewayResult(statistic=27.73918235234821, pvalue=1.6677281316366318e-22)

- Statistics ➡ show the extent to which the variation between groups is greater than the variation within groups. The larger the value, the greater the likelihood that there will be differences between groups.
- Pvalue ➡ shows whether the difference is statistically significant. If the p-value > 0.05, then there is insufficient evidence to conclude that there is a significant difference.

In [18]:
#Displays the pvalue from the One-way ANOVA results
anova_test.pvalue

1.6677281316366318e-22

In [19]:
#Interpreting ANOVA test results based on p-value
if anova_test.pvalue > 0.05:
    print('Education has no effect on income')
else:
    print('Education has an effect on income')

Education has an effect on income
