In [1]:
import boto3
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from scipy.stats import pearsonr, spearmanr
from shared_utilities import helpers

In [2]:
! aws sso login --profile Stellaralgo-DataScienceAdmin

Attempting to automatically open the SSO authorization page in your default browser.
If the browser does not open or you wish to use a different device to authorize this request, open the following URL:

https://device.sso.us-east-1.amazonaws.com/

Then enter the code:

GMCG-HWLK
Successfully logged into Start URL: https://stellaralgo.awsapps.com/start#/


In [3]:
session = boto3.setup_default_session(profile_name='Stellaralgo-DataScienceAdmin')

In [4]:
conn = helpers.get_redshift_connection("qa-app", "stlrnhlpanthers")

Authorized as AROASQ4JELIXYLYV6P4UV:pmorrison@stellaralgo.com


In [5]:
cursor = conn.cursor()
sql = "CALL ds.getretentionmodeldata(93, 2021, 2022, 'temp_cursor')"
cursor.execute(sql)

In [6]:
temp_cursor = conn.cursor("temp_cursor")
data = temp_cursor.fetchall()

In [7]:
df_original = pd.DataFrame(data=data, columns=[desc[0] for desc in temp_cursor.description])

In [8]:
cursor.close()
temp_cursor.close()
conn.close()

In [9]:
df = df_original.copy()

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 487 entries, 0 to 486
Data columns (total 24 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   lkupclientid         487 non-null    int64         
 1   clientcode           487 non-null    object        
 2   dimcustomermasterid  487 non-null    int64         
 3   year                 487 non-null    int64         
 4   productgrouping      487 non-null    object        
 5   totalspent           487 non-null    float64       
 6   recentdate           487 non-null    datetime64[ns]
 7   attendancepercent    487 non-null    float64       
 8   renewedbeforedays    487 non-null    int64         
 9   source_tenure        487 non-null    object        
 10  tenure               487 non-null    int64         
 11  disttovenue          487 non-null    float64       
 12  recency              487 non-null    int64         
 13  missed_games_1       487 non-null  

## Two Ways to check correlation

1. Numpy
2. Pandas

In [11]:
print(np.corrcoef(df["tenure"], df["isnextyear_buyer"]))
print(df["tenure"].corr(df["isnextyear_buyer"]))

[[ 1. nan]
 [nan nan]]
nan


  c /= stddev[:, None]
  c /= stddev[None, :]


## Pearson & Spearman's correlations

In [12]:
print(pearsonr(df["tenure"], df["isnextyear_buyer"]))
print(spearmanr(df["tenure"], df["isnextyear_buyer"]))

(nan, nan)
SpearmanrResult(correlation=nan, pvalue=nan)




# Cleaning and Encoding

In [13]:
def encode_and_bind(original_dataframe, feature_to_encode):
    dummies = pd.get_dummies(original_dataframe[[feature_to_encode]])
    res = pd.concat([original_dataframe, dummies], axis=1)
    res = res.drop([feature_to_encode], axis=1)
    return(res) 

In [14]:
df_encoded = encode_and_bind(df, "productgrouping")
df_encoded = encode_and_bind(df_encoded, "gender")

In [15]:
# # df["recentdate"] = pd.to_datetime(df["recentdate"])
# df_encoded = df_encoded.drop(["recentdate"], axis=1)
# df_encoded["totalspent"] = df_encoded["totalspent"].astype("float")
# df_encoded["attendancepercent"] = df_encoded["attendancepercent"].astype("float")
# df_encoded["disttovenue"] = df_encoded["disttovenue"].astype("float")
# df_encoded["opentosendratio"] = df_encoded["opentosendratio"].astype("float")
# df_encoded["clicktosendratio"] = df_encoded["clicktosendratio"].astype("float")
# df_encoded["clicktoopenratio"] = df_encoded["clicktoopenratio"].astype("float")
# df_encoded["productgrouping_Flex Plan"] = df_encoded["productgrouping_Flex Plan"].astype("int")
# df_encoded["productgrouping_Full Season"] = df_encoded["productgrouping_Full Season"].astype("int")
# df_encoded["productgrouping_Half Season"] = df_encoded["productgrouping_Half Season"].astype("int")
# df_encoded["productgrouping_Premier/Core Plan"] = df_encoded["productgrouping_Premier/Core Plan"].astype("int")
# df_encoded["gender_Female"] = df_encoded["gender_Female"].astype("int")
# df_encoded["gender_Male"] = df_encoded["gender_Male"].astype("int")
# df_encoded["gender_Unknown"] = df_encoded["gender_Unknown"].astype("int")
# df_encoded["missed_games_1"] = df_encoded["missed_games_1"].astype("int")
# df_encoded["missed_games_2"] = df_encoded["missed_games_2"].astype("int")
# df_encoded["missed_games_over_2"] = df_encoded["missed_games_over_2"].astype("int")
# df_encoded["source_tenure"] = df_encoded["source_tenure"].astype("int")

KeyError: 'productgrouping_Full Season'

In [None]:
df_encoded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5240 entries, 0 to 5239
Data columns (total 27 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   lkupclientid                       5240 non-null   int64  
 1   dimcustomermasterid                5240 non-null   int64  
 2   year                               5240 non-null   int64  
 3   totalspent                         5240 non-null   float64
 4   attendancepercent                  5240 non-null   float64
 5   renewedbeforedays                  5240 non-null   int64  
 6   source_tenure                      5240 non-null   int64  
 7   tenure                             5240 non-null   int64  
 8   disttovenue                        5240 non-null   float64
 9   recency                            5240 non-null   int64  
 10  missed_games_1                     5240 non-null   int64  
 11  missed_games_2                     5240 non-null   int64

In [None]:
df_encoded.head()

# Tests

This section runs tests for things like T-Test, PairedDifference T-Test, One Way ANOVA, Two Way ANOVA and Chi2.

In [19]:
import statsmodels.api as sm
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.preprocessing import scale

import researchpy as rp
from scipy import stats

In [23]:
# df["recentdate"] = pd.to_datetime(df["recentdate"])
df = df.drop(["recentdate"], axis=1)
df["totalspent"] = df["totalspent"].astype("float")
df["attendancepercent"] = df["attendancepercent"].astype("float")
df["disttovenue"] = df["disttovenue"].astype("float")
df["opentosendratio"] = df["opentosendratio"].astype("float")
df["clicktosendratio"] = df["clicktosendratio"].astype("float")
df["clicktoopenratio"] = df["clicktoopenratio"].astype("float")
# df["productgrouping_Flex Plan"] = df["productgrouping_Flex Plan"].astype("int")
# df["productgrouping_Full Season"] = df["productgrouping_Full Season"].astype("int")
# df["productgrouping_Half Season"] = df["productgrouping_Half Season"].astype("int")
# df["productgrouping_Premier/Core Plan"] = df["productgrouping_Premier/Core Plan"].astype("int")
# df["gender_Female"] = df["gender_Female"].astype("int")
# df["gender_Male"] = df["gender_Male"].astype("int")
# df["gender_Unknown"] = df["gender_Unknown"].astype("int")
df["missed_games_1"] = df["missed_games_1"].astype("int")
df["missed_games_2"] = df["missed_games_2"].astype("int")
df["missed_games_over_2"] = df["missed_games_over_2"].astype("int")
df["source_tenure"] = df["source_tenure"].astype("int")

In [24]:
df.head()

Unnamed: 0,lkupclientid,clientcode,dimcustomermasterid,year,productgrouping,totalspent,attendancepercent,renewedbeforedays,source_tenure,tenure,...,missed_games_2,missed_games_over_2,forward_records,opentosendratio,clicktosendratio,clicktoopenratio,gender,phonecall,inperson_contact,isnextyear_buyer
0,93,NHLPANTHERS,1866508,2021,Flex Plan,384.0,1.0,59,2190,160,...,0,0,0,0.54,0.19,0.36,Male,11,0,0
1,93,NHLPANTHERS,1937569,2021,Flex Plan,1556.0,1.0,5,1825,124,...,0,0,0,0.0,0.0,0.0,Female,18,0,0
2,93,NHLPANTHERS,2180018,2021,Flex Plan,904.0,0.18,15,365,174,...,0,1,20,0.0,0.0,0.0,Unknown,8,0,0
3,93,NHLPANTHERS,2320039,2021,Flex Plan,748.0,1.11,47,3650,198,...,0,0,0,0.03,0.0,0.0,Male,14,0,0
4,93,NHLPANTHERS,2172693,2021,Flex Plan,945.0,0.6,16,3650,88,...,0,0,4,1.0,0.0,0.0,Unknown,22,0,0


In [25]:
sample_01 = df[(df['gender'] == "Male")]

sample_02 = df[(df['gender'] == "Female")]

In [26]:
stats.levene(sample_01["opentosendratio"], sample_02["opentosendratio"])

LeveneResult(statistic=1.2551792891609728, pvalue=0.2637141018084891)

In [27]:
stats.levene(sample_01["attendancepercent"], sample_02["attendancepercent"])

LeveneResult(statistic=0.3431764719076128, pvalue=0.5585659926116238)

In [28]:
stats.levene(sample_01["totalspent"], sample_02["totalspent"])

LeveneResult(statistic=0.5092564714611473, pvalue=0.47617203949306464)