In [41]:
from pathlib import Path
import pandas as pd
import plotly.express as px


# Load the data into pandas DataFrame
dir_path = Path("CPA_Data_IntroductionToMarketing_2024")
clv_data = pd.read_csv(dir_path / "clv.csv")
filter_user_data = pd.read_csv(dir_path / "filter_user.csv")

print(clv_data.head())
filter_user_data.head()

   cohort  user  time_year  time_month  subscription  content  genres  \
0       0     0       2017           6             1      4.0     2.0   
1       0     0       2017           7             1      7.0     2.0   
2       0     1       2017           6             1      5.0     4.0   
3       0     1       2017           7             1      9.0     4.0   
4       0     2       2017           6             1      7.0     3.0   

   recency_new  bounce  
0       0.1429  0.2857  
1       0.1667  0.2500  
2       0.6000  1.0000  
3       0.8000  0.5000  
4       0.6250  0.6250  


Unnamed: 0,title,recency_new,duration_long,genre_family_comedy,genre_drama,genre_action_adventure,genre_thriller_crime,genre_documentary
0,6 Days,1,1,0,0,0,0,1
1,8 Mile,0,1,0,1,0,0,0
2,RuPaul's Drag Race Holi-Slay Spectacular,1,0,1,0,0,0,0
3,Russia's Toughest Prisons,0,0,0,0,0,0,1
4,SPF-18,1,0,1,0,0,0,0


In [42]:
# 2a
start = clv_data[clv_data.time_year == 2017].time_month.min()
end = clv_data[clv_data.time_year == 2018].time_month.max()
nunique = clv_data["user"].nunique()
print((start, 2017), (end, 2018), nunique)

(np.int64(6), 2017) (np.int64(5), 2018) 20850


In [106]:
# 3a
clv_data_cohort_0 = clv_data[clv_data.cohort == 0]
initial_cohort_size = clv_data_cohort_0.user.nunique()
active_users_per_month = clv_data_cohort_0.groupby(
    ["time_year", "time_month"], as_index=False
)["user"].nunique()

retention_rate_time = pd.DataFrame()
retention_rate_time["time"] = (
    active_users_per_month[["time_month", "time_year"]]
    .astype(str)
    .agg("/".join, axis=1)
)
retention_rate_time["retention_rate"] = (
    active_users_per_month.user / initial_cohort_size
) * 100

# 3b
fig = px.line(
    x=retention_rate_time["time"],
    y=retention_rate_time["retention_rate"],
    labels={"x": "Time (Month/Year)", "y": "Retention Rate (%)"},
    title="Retention Rate over Time",
)
fig.show()
retention_rate_time

Unnamed: 0,time,retention_rate
0,6/2017,100.0
1,7/2017,71.64533
2,8/2017,9.435752
3,9/2017,1.422475
4,10/2017,0.711238
5,11/2017,0.331911
6,12/2017,0.331911
7,1/2018,0.237079
8,2/2018,0.189663
9,3/2018,0.047416


In [47]:
# 8
filter_user_data.head()

Unnamed: 0,title,recency_new,duration_long,genre_family_comedy,genre_drama,genre_action_adventure,genre_thriller_crime,genre_documentary
0,6 Days,1,1,0,0,0,0,1
1,8 Mile,0,1,0,1,0,0,0
2,RuPaul's Drag Race Holi-Slay Spectacular,1,0,1,0,0,0,0
3,Russia's Toughest Prisons,0,0,0,0,0,0,1
4,SPF-18,1,0,1,0,0,0,0


In [130]:
# 8c
genre_columns = [
    "genre_family_comedy",
    "genre_drama",
    "genre_action_adventure",
    "genre_thriller_crime",
    "genre_documentary",
]

attributes = ["recency_new", "duration_long"] + genre_columns
conjoint_attribute_level_counts = (
    filter_user_data.groupby(attributes, as_index=False)
    .size()
    .sort_values(by=["size"], ascending=False)
    .rename(columns={"size": "counts"})
)
total_count = conjoint_attribute_level_counts.counts.sum()
conjoint_attribute_level_counts["streaming frequency"] = (
    conjoint_attribute_level_counts.counts / total_count
)

conjoint_attribute_level_counts["genre"] = (
    filter_user_data[genre_columns].idxmax(axis=1).str.replace("genre_", "")
)
conjoint_attribute_level_counts
# Note all other attribute levels have frequency 0

Unnamed: 0,recency_new,duration_long,genre_family_comedy,genre_drama,genre_action_adventure,genre_thriller_crime,genre_documentary,counts,streaming frequency,genre
4,0,1,1,0,0,0,0,5,0.25,family_comedy
2,0,1,0,0,1,0,0,2,0.1,family_comedy
6,1,0,1,0,0,0,0,2,0.1,family_comedy
10,1,1,0,1,0,0,0,2,0.1,family_comedy
11,1,1,1,0,0,0,0,2,0.1,drama
0,0,0,0,0,0,0,1,1,0.05,documentary
1,0,0,1,0,0,0,0,1,0.05,drama
3,0,1,0,1,0,0,0,1,0.05,documentary
5,1,0,0,0,0,1,0,1,0.05,family_comedy
7,1,1,0,0,0,0,1,1,0.05,family_comedy


In [131]:
import statsmodels.api as sm

# Independent variables
X = conjoint_attribute_level_counts[attributes]
# Dependend variable
y = conjoint_attribute_level_counts["streaming frequency"]

# Add a constant to the model (intercept)
# X = sm.add_constant(X)

# Perform linear regression
model = sm.OLS(y, X).fit()

# Display the regression results
print(model.summary())

# Extract the part-worths (coefficients)
part_worths = model.params
ranges = {
    "recency_new": abs(part_worths["recency_new"]),
    "duration_long": abs(part_worths["duration_long"]),
    "genre": abs(part_worths[genre_columns].max() - part_worths[genre_columns].min()),
}

total_range = sum(ranges.values())

relative_importance = {
    key: (value / total_range) * 100 for key, value in ranges.items()
}

print("Part-Worths:\n", part_worths)
print("\nRelative Importance [%]:\n", relative_importance)

                             OLS Regression Results                            
Dep. Variable:     streaming frequency   R-squared:                       0.519
Model:                             OLS   Adj. R-squared:                 -0.058
Method:                  Least Squares   F-statistic:                    0.8997
Date:                 Mon, 14 Oct 2024   Prob (F-statistic):              0.558
Time:                         17:16:45   Log-Likelihood:                 22.111
No. Observations:                   12   AIC:                            -30.22
Df Residuals:                        5   BIC:                            -26.83
Df Model:                            6                                         
Covariance Type:             nonrobust                                         
                             coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------
recency_new       


`kurtosistest` p-value may be inaccurate with fewer than 20 observations; only n=12 observations were given.

