In [41]:
from pathlib import Path
import pandas as pd
import plotly.express as px


# Load the data into pandas DataFrame
dir_path = Path("CPA_Data_IntroductionToMarketing_2024")
clv_data = pd.read_csv(dir_path / "clv.csv")
filter_user_data = pd.read_csv(dir_path / "filter_user.csv")

print(clv_data.head())
filter_user_data.head()

   cohort  user  time_year  time_month  subscription  content  genres  \
0       0     0       2017           6             1      4.0     2.0   
1       0     0       2017           7             1      7.0     2.0   
2       0     1       2017           6             1      5.0     4.0   
3       0     1       2017           7             1      9.0     4.0   
4       0     2       2017           6             1      7.0     3.0   

   recency_new  bounce  
0       0.1429  0.2857  
1       0.1667  0.2500  
2       0.6000  1.0000  
3       0.8000  0.5000  
4       0.6250  0.6250  


Unnamed: 0,title,recency_new,duration_long,genre_family_comedy,genre_drama,genre_action_adventure,genre_thriller_crime,genre_documentary
0,6 Days,1,1,0,0,0,0,1
1,8 Mile,0,1,0,1,0,0,0
2,RuPaul's Drag Race Holi-Slay Spectacular,1,0,1,0,0,0,0
3,Russia's Toughest Prisons,0,0,0,0,0,0,1
4,SPF-18,1,0,1,0,0,0,0


1. The assumption that Netflix has a contractual relationship with its customers is reasonable because Netflix operates on a subscription-based model, where users commit to recurring payments for continued access to the platform. In the SVOD industry, services like Netflix typically require users to create an account and agree to monthly or annual billing, which aligns with the definition of a contractual relationship. Although users can cancel their subscription at any time, the ongoing nature of the payment agreement and access to content makes this a contractual relationship. This setup facilitates a more predictable estimation of Customer Lifetime Value (CLV) compared to non-contractual models.


In [42]:
# 2a
start = clv_data[clv_data.time_year == 2017].time_month.min()
end = clv_data[clv_data.time_year == 2018].time_month.max()
nunique = clv_data["user"].nunique()
print((start, 2017), (end, 2018), nunique)

(np.int64(6), 2017) (np.int64(5), 2018) 20850


In [46]:
# 3a
clv_data_cohort_0 = clv_data[clv_data.cohort == 0]
initial_cohort_size = clv_data_cohort_0.user.nunique()
active_users_per_month = clv_data_cohort_0.groupby(
    ["time_year", "time_month"], as_index=False
)["user"].nunique()

retention_rate_time = pd.DataFrame()
retention_rate_time["time"] = (
    active_users_per_month[["time_month", "time_year"]]
    .astype(str)
    .agg("/".join, axis=1)
)
retention_rate_time["retention_rate"] = (
    active_users_per_month.user / initial_cohort_size
) * 100
print(retention_rate_time)

# 3b
fig = px.line(
    x=retention_rate_time["time"],
    y=retention_rate_time["retention_rate"],
    labels={"x": "Time (Month/Year)", "y": "Retention Rate (%)"},
    title="Retention Rate over Time",
)
fig.show()

       time  retention_rate
0    6/2017      100.000000
1    7/2017       71.645330
2    8/2017        9.435752
3    9/2017        1.422475
4   10/2017        0.711238
5   11/2017        0.331911
6   12/2017        0.331911
7    1/2018        0.237079
8    2/2018        0.189663
9    3/2018        0.047416
10   4/2018        0.047416
11   5/2018        0.047416


In [47]:
# 8
filter_user_data.head()

Unnamed: 0,title,recency_new,duration_long,genre_family_comedy,genre_drama,genre_action_adventure,genre_thriller_crime,genre_documentary
0,6 Days,1,1,0,0,0,0,1
1,8 Mile,0,1,0,1,0,0,0
2,RuPaul's Drag Race Holi-Slay Spectacular,1,0,1,0,0,0,0
3,Russia's Toughest Prisons,0,0,0,0,0,0,1
4,SPF-18,1,0,1,0,0,0,0


8a. Attributes and levels (Genre mutually exclusive):

Recency_New:

- 0
- 1

Duration_Long:

- 0
- 1

Genre:

- Comedy
- Drame
- Action_Adventure
- Triller_Crime
- Documentary

Total Number of product versions (fully factorial) = $2*2*5 = 20$


8b. Attributes and levels (genre not mutually exclusive):

Recency_New:

- 0
- 1

Duration_Long:

- 0
- 1

Genre:

- Comedy
- Comedy, Drama
- Comedy, Drama, Action_Adventure
- Comedy, ...
- Drame, ...
- Action_Adventure, ...
- Triller_Crime, ...
- Documentary, ...

Number of genres = $5*5 = 25$
Number of product versions (fully factorial) = $2*2*5*5 = 100$

Discussion:
In the mutually non-exclusive case the number of genres grows quadratically compared to the linear growth in the exclusive case. This implies that the attribute levels in a fully factorial design grows very quickly in the non-exclusive case.


In [94]:
# 8c
genre_columns = [
    "genre_family_comedy",
    "genre_drama",
    "genre_action_adventure",
    "genre_thriller_crime",
    "genre_documentary",
]

filter_user_data["genre"] = (
    filter_user_data[genre_columns].idxmax(axis=1).str.replace("genre_", "")
)
# filter_user_data_combined_genre = filter_user_data.drop(columns=genre_columns)

attributes = ["recency_new", "duration_long", "genre"]
conjoint_attribute_level_counts = (
    filter_user_data.groupby(attributes, as_index=False)
    .size()
    .sort_values(by=["size"], ascending=False)
    .rename(columns={"size": "counts"})
)
conjoint_attribute_level_counts["streaming frequency"] = (
    conjoint_attribute_level_counts.counts
    / conjoint_attribute_level_counts.counts.max()
)
conjoint_attribute_level_counts
# Note all other attribute levels have frequency 0

Unnamed: 0,recency_new,duration_long,genre,counts,streaming frequency
4,0,1,family_comedy,5,1.0
2,0,1,action_adventure,2,0.4
5,1,0,family_comedy,2,0.4
9,1,1,drama,2,0.4
10,1,1,family_comedy,2,0.4
0,0,0,documentary,1,0.2
1,0,0,family_comedy,1,0.2
3,0,1,drama,1,0.2
6,1,0,thriller_crime,1,0.2
7,1,1,action_adventure,1,0.2


In [105]:
import statsmodels.api as sm

df_dummies = pd.get_dummies(
    conjoint_attribute_level_counts, columns=["genre"]
)

X = df_dummies[
    [
        "recency_new",
        "duration_long",
    ]
    + genre_columns
]
y = conjoint_attribute_level_counts["streaming frequency"]

# Add a constant to the model (intercept)
# X = sm.add_constant(X)
print(X, y)
# Perform linear regression
model = sm.OLS(y, X).fit()

# Display the regression results
print(model.summary())

# Extract the part-worths (coefficients)
part_worths = model.params
ranges = {
    "recency_new": abs(part_worths["recency_new"]),
    "duration_long": abs(part_worths["duration_long"]),
    "genre": abs(
        part_worths[
            [
                "genre_action_adventure",
                "genre_drama",
                "genre_family_comedy",
                "genre_thriller_crime",
            ]
        ].max()
        - part_worths[
            [
                "genre_action_adventure",
                "genre_drama",
                "genre_family_comedy",
                "genre_thriller_crime",
            ]
        ].min()
    ),
}

total_range = sum(ranges.values())

relative_importance = {
    key: (value / total_range) * 100 for key, value in ranges.items()
}
print("Part-Worths:\n", part_worths)
print("\nRelative Importance:\n", relative_importance)

    recency_new  duration_long  genre_family_comedy  genre_drama  \
4             0              1                 True        False   
2             0              1                False        False   
5             1              0                 True        False   
9             1              1                False         True   
10            1              1                 True        False   
0             0              0                False        False   
1             0              0                 True        False   
3             0              1                False         True   
6             1              0                False        False   
7             1              1                False        False   
8             1              1                False        False   
11            1              1                False        False   

    genre_action_adventure  genre_thriller_crime  genre_documentary  
4                    False                 Fa

ValueError: Pandas data cast to numpy dtype of object. Check input data with np.asarray(data).