## Import Libraries

In [1]:
import pandas as pd
from scipy import stats
from scipy.stats import chi2
import plotly.express as px

## Loading Data

In [2]:
# Load the supermarket sales CSV into a DataFrame. The path is relative, so it
# is resolved against the kernel's current working directory.
data = pd.read_csv("supermarket_sales - Sheet1.csv")

In [3]:
# Show the loaded DataFrame as this cell's output.
data

Unnamed: 0,Invoice ID,Branch,City,Customer type,Gender,Product line,Unit price,Quantity,Tax 5%,Total,Date,Time,Payment,cogs,gross margin percentage,gross income,Rating
0,750-67-8428,A,Yangon,Member,Female,Health and beauty,74.69,7,26.1415,548.9715,1/5/2019,13:08,Ewallet,522.83,4.761905,26.1415,9.1
1,226-31-3081,C,Naypyitaw,Normal,Female,Electronic accessories,15.28,5,3.8200,80.2200,3/8/2019,10:29,Cash,76.40,4.761905,3.8200,9.6
2,631-41-3108,A,Yangon,Normal,Male,Home and lifestyle,46.33,7,16.2155,340.5255,3/3/2019,13:23,Credit card,324.31,4.761905,16.2155,7.4
3,123-19-1176,A,Yangon,Member,Male,Health and beauty,58.22,8,23.2880,489.0480,1/27/2019,20:33,Ewallet,465.76,4.761905,23.2880,8.4
4,373-73-7910,A,Yangon,Normal,Male,Sports and travel,86.31,7,30.2085,634.3785,2/8/2019,10:37,Ewallet,604.17,4.761905,30.2085,5.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,233-67-5758,C,Naypyitaw,Normal,Male,Health and beauty,40.35,1,2.0175,42.3675,1/29/2019,13:46,Ewallet,40.35,4.761905,2.0175,6.2
996,303-96-2227,B,Mandalay,Normal,Female,Home and lifestyle,97.38,10,48.6900,1022.4900,3/2/2019,17:16,Ewallet,973.80,4.761905,48.6900,4.4
997,727-02-1313,A,Yangon,Member,Male,Food and beverages,31.84,1,1.5920,33.4320,2/9/2019,13:22,Cash,31.84,4.761905,1.5920,7.7
998,347-56-2442,A,Yangon,Normal,Male,Home and lifestyle,65.82,1,3.2910,69.1110,2/22/2019,15:33,Cash,65.82,4.761905,3.2910,4.1


In [4]:
# Inspect the dtype pandas inferred for each column; 'Date' and 'Time' are
# still plain object (string) columns at this point.
data.dtypes

Invoice ID                  object
Branch                      object
City                        object
Customer type               object
Gender                      object
Product line                object
Unit price                 float64
Quantity                     int64
Tax 5%                     float64
Total                      float64
Date                        object
Time                        object
Payment                     object
cogs                       float64
gross margin percentage    float64
gross income               float64
Rating                     float64
dtype: object

In [5]:
# List the column labels of the DataFrame.
data.columns

Index(['Invoice ID', 'Branch', 'City', 'Customer type', 'Gender',
       'Product line', 'Unit price', 'Quantity', 'Tax 5%', 'Total', 'Date',
       'Time', 'Payment', 'cogs', 'gross margin percentage', 'gross income',
       'Rating'],
      dtype='object')

## Data Preprocessing

In [6]:
# Parse the date strings with an explicit month/day/year format so the result
# does not depend on pandas' format inference.
data['Date'] = pd.to_datetime(data['Date'], format='%m/%d/%Y')

# Parse the time-of-day strings with an explicit hour:minute format. Without a
# format the missing date part is filled in from the day the notebook is run,
# so the column's values would change between runs; with it every value is
# anchored to the fixed default date 1900-01-01.
data['Time'] = pd.to_datetime(data['Time'], format='%H:%M')

In [7]:
# Derive calendar year and month-number columns from the parsed 'Date' column.
data['Year'], data['Month'] = data['Date'].dt.year, data['Date'].dt.month

In [8]:
# Distinct values found in the 'Product line' column.
data['Product line'].unique()

array(['Health and beauty', 'Electronic accessories',
       'Home and lifestyle', 'Sports and travel', 'Food and beverages',
       'Fashion accessories'], dtype=object)

In [9]:
# Distinct month numbers present in the derived 'Month' column.
data.Month.unique()

array([1, 3, 2])

## Hypothesis Testing

In [10]:
# Chi-square test of independence between 'Customer type' and 'gross income'.
# H0: the two variables are independent (no association between them).
#
# NOTE(review): 'gross income' is a float column, so the crosstab gets one
# column per distinct value (2 rows x 990 columns in the recorded output) and
# almost every cell is 0 or 1. The chi-square approximation is unreliable with
# expected counts this small; consider binning 'gross income' or using a
# two-sample test instead. Confirm the intended analysis.

# Build the contingency table (rows: Customer type, columns: gross income)
dataset_table = pd.crosstab(data['Customer type'], data['gross income'])
print("Dataset Table: \n", dataset_table)
print("\n")

# Observed frequencies as a plain 2-D array
observed_values = dataset_table.values
print("Observed Values: \n", observed_values)
print("\n")

# chi2_contingency returns (statistic, p-value, degrees of freedom, expected frequencies)
val = stats.chi2_contingency(dataset_table)

# Expected frequencies under the independence hypothesis
expected_values = val[3]
print("Expected Values: \n", expected_values)
print("\n")

# Number of rows and columns of the contingency table
no_of_rows, no_of_columns = dataset_table.shape

# Degrees of freedom: (rows - 1) * (columns - 1)
ddof = (no_of_rows - 1) * (no_of_columns - 1)
print("Degree of Freedom: \n", ddof)
print("\n")

# Per-column contribution to Pearson's chi-square: sum over rows of (O - E)^2 / E
chi_square = ((observed_values - expected_values) ** 2. / expected_values).sum(axis=0)

# The test statistic is the total over ALL columns. Adding only
# chi_square[0] + chi_square[1] would ignore every column after the second.
chi_square_statistic = chi_square.sum()

# p-value from the upper tail; sf() keeps precision that 1 - cdf() loses when
# the p-value is very small
p_value = chi2.sf(chi_square_statistic, df=ddof)
print('P Value: \n', p_value)
print("\n")

# Confidence level (1 - alpha)
confidence_interval = 0.95
print('Confidence Interval: \n', confidence_interval)
print("\n")

# Significance level
alpha = 0.05
print('Nilai kritis (alpha): \n', alpha)
print("\n")

# Report the decision: a p-value below alpha rejects H0 (independence), which
# means the data suggest a significant association; otherwise H0 is retained.
print("Hasil Hypothesis Testing: \n")
if p_value < alpha:
    print("Hypothesis (H0) ditolak, adanya kemungkinan hubungan yang signifikan antara variabel Customer type dan gross income")
else:
    print("Hypothesis (H0) diterima, tidak adanya kemungkinan hubungan yang signifikan antara variabel Customer type dan gross income")

Dataset Table: 
 gross income   0.5085   0.6045   0.6270   0.6390   0.6990   0.7670   0.7715   \
Customer type                                                                  
Member               1        0        1        0        1        0        1   
Normal               0        1        0        1        0        1        0   

gross income   0.7750   0.8140   0.8875   ...  45.3250  47.7200  47.7900  \
Customer type                             ...                              
Member               0        0        0  ...        1        0        0   
Normal               1        1        1  ...        0        1        1   

gross income   48.6050  48.6850  48.6900  48.7500  49.2600  49.4900  49.6500  
Customer type                                                                 
Member               1        0        0        0        1        0        1  
Normal               0        1        1        1        0        1        0  

[2 rows x 990 columns]


Observed Values