# Check

In [1]:
def check_df(dataframe, head=5):
    """Print a quick structural overview of a DataFrame.

    The sections are printed in a fixed order: shape, column dtypes, first
    rows, last rows, missing-value count per column and selected quantiles.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to inspect.
    head : int, default 5
        Number of rows shown in both the head and the tail section.
    """
    # Each section is (banner, lazily evaluated content) so that the content
    # is computed only right before it is printed.
    sections = (
        ("##################### Shape #####################",
         lambda: dataframe.shape),
        ("##################### Types #####################",
         lambda: dataframe.dtypes),
        ("##################### Head #####################",
         lambda: dataframe.head(head)),
        ("##################### Tail #####################",
         lambda: dataframe.tail(head)),
        ("##################### NA #####################",
         lambda: dataframe.isnull().sum()),
        ("##################### Quantiles #####################",
         lambda: dataframe.describe([0, 0.05, 0.50, 0.95, 0.99, 1]).T),
    )
    for banner, compute in sections:
        print(banner)
        print(compute())

# Outliers
0.01 ve 0.99 eşik değerlerinin veri setine göre değişkenlik göstermesi beklenir.

In [2]:
def outlier_thresholds(dataframe, variable):
    """Return the ``(low_limit, up_limit)`` outlier bounds for one column.

    The bounds use the "quantile -/+ 1.5 x range" rule, where the range is
    the distance between the 1st and the 99th percentile of the column.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame holding the column.
    variable : str
        Name of the column to analyse.

    Returns
    -------
    tuple
        ``(low_limit, up_limit)``.
    """
    column = dataframe[variable]
    lower_q = column.quantile(0.01)
    upper_q = column.quantile(0.99)
    spread = upper_q - lower_q
    return lower_q - 1.5 * spread, upper_q + 1.5 * spread


def replace_with_thresholds(dataframe, variable):
    """Cap values above the upper outlier limit, modifying the frame in place.

    Only the upper side is capped here; values below the lower limit are left
    untouched. The limits come from ``outlier_thresholds``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to modify.
    variable : str
        Name of the column to cap.
    """
    _, upper = outlier_thresholds(dataframe, variable)
    too_high = dataframe[variable] > upper
    dataframe.loc[too_high, variable] = upper

# EDA

In [3]:
def grab_col_names(dataframe, cat_th=10,  car_th=20):
    """
    Split the columns of a DataFrame into categorical, numerical and
    categorical-but-cardinal groups, print a short summary and return the groups.

    Parameters
    ----------
    dataframe: dataframe
        Frame whose column names are classified.
    cat_th: int, float
        Numeric columns with fewer unique values than this are treated as
        categorical.
    car_th: int, float
        Categorical (text-like) columns with more unique values than this are
        treated as cardinal.

    Returns
    -------
    cat_cols: list
        Categorical columns (including numeric-looking categorical ones).
    num_cols: list
        Numerical columns.
    cat_but_car: list
        Columns that look categorical but have high cardinality.

    Notes
    ------
    cat_cols + num_cols + cat_but_car = total number of variables
    (for frames that only hold numeric, boolean, text and category columns).
    num_but_cat is contained in cat_cols.
    A column is numeric when its dtype kind is signed/unsigned integer or
    float, so int32/float32 columns are classified too.

    """
    categorical_dtypes = ("category", "object", "bool", "string", "str")
    text_like_dtypes = ("category", "object", "string", "str")

    # Gather per-column facts once instead of recomputing them in every filter.
    dtype_name = {col: str(dataframe[col].dtypes) for col in dataframe.columns}
    is_numeric = {col: dataframe[col].dtype.kind in "iuf" for col in dataframe.columns}
    n_unique = {col: dataframe[col].nunique() for col in dataframe.columns}

    # cat_cols, cat_but_car
    cat_cols = [col for col in dataframe.columns if dtype_name[col] in categorical_dtypes]

    num_but_cat = [col for col in dataframe.columns
                   if n_unique[col] < cat_th and is_numeric[col]]

    cat_but_car = [col for col in dataframe.columns
                   if n_unique[col] > car_th and dtype_name[col] in text_like_dtypes]

    cat_cols = cat_cols + num_but_cat
    cat_cols = [col for col in cat_cols if col not in cat_but_car]

    # num_cols: numeric columns that were not re-classified as categorical
    num_cols = [col for col in dataframe.columns if is_numeric[col]]
    num_cols = [col for col in num_cols if col not in cat_cols]

    print(f"Observations: {dataframe.shape[0]}")
    print(f"Variables: {dataframe.shape[1]}")
    print(f'cat_cols: {len(cat_cols)}')
    print(f'num_cols: {len(num_cols)}')
    print(f'cat_but_car: {len(cat_but_car)}')
    print(f'num_but_cat: {len(num_but_cat)}')

    return cat_cols, num_cols, cat_but_car

Alttaki fonksiyon kategorik değişkenlerin ağırlıklarını ve yüzdesel oranlarını gösteriyor

In [4]:
def cat_summary(dataframe, col_name, plot=False):
    """Print class counts and percentage ratios of a categorical column.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame holding the column.
    col_name : str
        Name of the categorical column.
    plot : bool, default False
        When true, also draw a seaborn count plot and show it (blocking).
    """
    counts = dataframe[col_name].value_counts()
    ratios = 100 * dataframe[col_name].value_counts() / len(dataframe)
    summary = pd.DataFrame({col_name: counts, "Ratio": ratios})
    print(summary)
    print("##########################################")

    if not plot:
        return
    sns.countplot(x=dataframe[col_name], data=dataframe)
    plt.show(block=True)

Alttaki fonksiyon numerik değerlerin describe değerlerini gösteriyor

In [5]:
def num_summary(dataframe, numerical_col, plot=False):
    """Print descriptive statistics of a numeric column at fixed percentiles.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame holding the column.
    numerical_col : str
        Name of the numeric column.
    plot : bool, default False
        When true, also draw a histogram of the column and show it (blocking).
    """
    percentiles = [0.05, 0.10, 0.20, 0.30, 0.40, 0.50, 0.60, 0.70, 0.80, 0.90, 0.95, 0.99]
    column = dataframe[numerical_col]
    print(column.describe(percentiles).T)

    if not plot:
        return
    dataframe[numerical_col].hist()
    plt.xlabel(numerical_col)
    plt.title(numerical_col)
    plt.show(block=True)

In [6]:
def target_summary_with_num(dataframe, target, numerical_col):
    """Print the mean of a numeric column for every value of the target.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame holding both columns.
    target : str
        Column to group by.
    numerical_col : str
        Numeric column whose per-group mean is printed.
    """
    aggregation = {numerical_col: "mean"}
    per_target = dataframe.groupby(target).agg(aggregation)
    print(per_target, end="\n\n\n")

In [7]:
def target_summary_with_cat(dataframe, target, categorical_col):
    """Print the mean of the target for every class of a categorical column.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame holding both columns.
    target : str
        Column whose mean is reported.
    categorical_col : str
        Column to group by.
    """
    target_means = dataframe.groupby(categorical_col)[target].mean()
    summary = pd.DataFrame({"TARGET_MEAN": target_means})
    print(summary, end="\n\n\n")

Aşağıdaki fonksiyon, yüksek korelasyona sahip değişken çiftlerinden birini çıkarmak için kullanılacak bir liste oluşturur

In [8]:
def high_correlated_cols(dataframe, plot=False, corr_th=0.90):
    """List columns that are highly correlated with an earlier column.

    The absolute correlation matrix is reduced to its strictly upper
    triangle, so each pair is inspected once; a column is reported when any
    of its remaining entries exceeds ``corr_th``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame whose pairwise correlations are computed with ``.corr()``.
    plot : bool, default False
        When true, also draw a heatmap of the (signed) correlation matrix.
    corr_th : float, default 0.90
        Absolute-correlation threshold.

    Returns
    -------
    list
        Names of the columns to drop.
    """
    corr = dataframe.corr()
    abs_corr = corr.abs()
    # Boolean mask that keeps only entries above the diagonal.
    above_diagonal = np.triu(np.ones(abs_corr.shape), k=1).astype(bool)
    upper = abs_corr.where(above_diagonal)
    drop_list = [name for name in upper.columns if any(upper[name] > corr_th)]

    if not plot:
        return drop_list

    import seaborn as sns
    import matplotlib.pyplot as plt
    sns.set(rc={'figure.figsize': (15, 15)})
    sns.heatmap(corr, cmap="RdBu")
    plt.show()
    return drop_list

# high_correlated_cols(df)
# drop_list = high_correlated_cols(df, plot=True)
# df.drop(drop_list, axis=1)
# high_correlated_cols(df.drop(drop_list, axis=1), plot=True)
# sonrasında bu adımlar yapılarak dataframeden yuksek korelasyonlu değişken çıkartılabilir.

# RFM

In [9]:
def create_rfm(dataframe, csv=False, today_date=None):
    """Build an RFM (recency, frequency, monetary) segmentation table.

    The input frame is not modified; all preparation happens on a copy.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Transaction rows with the columns "Invoice", "Quantity", "Price",
        "InvoiceDate" and "Customer ID". The preparation steps are specific
        to this layout and may need adapting for other data sets.
    csv : bool, default False
        When true, the result is also written to "rfm.csv".
    today_date : datetime.datetime, optional
        Reference date used to compute recency. Defaults to 2011-12-11,
        which is specific to the example data set.

    Returns
    -------
    pandas.DataFrame
        One row per customer with the columns "recency", "frequency",
        "monetary" and "segment"; the index is the customer id cast to int.
    """
    if today_date is None:
        today_date = dt.datetime(2011, 12, 11)

    # DATA PREPARATION
    # Copy first so that adding TotalPrice and dropping rows never touches
    # the caller's frame.
    dataframe = dataframe.copy()
    dataframe["TotalPrice"] = dataframe["Quantity"] * dataframe["Price"]
    dataframe = dataframe.dropna()
    # Invoices containing "C" are excluded.
    dataframe = dataframe[~dataframe["Invoice"].str.contains("C", na=False)]

    # RFM METRICS
    rfm = dataframe.groupby('Customer ID').agg({'InvoiceDate': lambda date: (today_date - date.max()).days,
                                                'Invoice': "nunique",
                                                "TotalPrice": "sum"})
    rfm.columns = ['recency', 'frequency', "monetary"]
    # Independent copy: score columns are added below.
    rfm = rfm[(rfm['monetary'] > 0)].copy()

    # RFM SCORES
    # Recency labels are reversed: a smaller recency gets a higher score.
    rfm["recency_score"] = pd.qcut(rfm['recency'], 5, labels=[5, 4, 3, 2, 1])
    # rank(method="first") breaks ties so qcut always gets unique bin edges.
    rfm["frequency_score"] = pd.qcut(rfm["frequency"].rank(method="first"), 5, labels=[1, 2, 3, 4, 5])
    rfm["monetary_score"] = pd.qcut(rfm['monetary'], 5, labels=[1, 2, 3, 4, 5])

    # The segment key is built from the recency and frequency scores only.
    rfm["RFM_SCORE"] = (rfm['recency_score'].astype(str) +
                        rfm['frequency_score'].astype(str))

    # SEGMENT NAMING
    seg_map = {
        r'[1-2][1-2]': 'hibernating',
        r'[1-2][3-4]': 'at_risk',
        r'[1-2]5': 'cant_loose',
        r'3[1-2]': 'about_to_sleep',
        r'33': 'need_attention',
        r'[3-4][4-5]': 'loyal_customers',
        r'41': 'promising',
        r'51': 'new_customers',
        r'[4-5][2-3]': 'potential_loyalists',
        r'5[4-5]': 'champions'
    }

    rfm['segment'] = rfm['RFM_SCORE'].replace(seg_map, regex=True)
    rfm = rfm[["recency", "frequency", "monetary", "segment"]]
    rfm.index = rfm.index.astype(int)

    if csv:
        rfm.to_csv("rfm.csv")

    return rfm

# CLTV
*  1. Veri Hazırlama
*  2. Average Order Value (average_order_value = total_price / total_transaction)
*  3. Purchase Frequency (total_transaction / total_number_of_customers)
*  4. Repeat Rate & Churn Rate (birden fazla alışveriş yapan müşteri sayısı / tüm müşteriler)
*  5. Profit Margin (profit_margin =  total_price * 0.10)
*  6. Customer Value (customer_value = average_order_value * purchase_frequency)
*  7. Customer Lifetime Value (CLTV = (customer_value / churn_rate) x profit_margin)
*  8. Segmentlerin Oluşturulması

In [10]:
def create_cltv_c(dataframe, profit=0.10):
    """Compute a simple customer lifetime value (CLTV) table with segments.

    The input frame is not modified.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Transaction rows with the columns "Invoice", "Quantity", "Price"
        and "Customer ID".
    profit : float, default 0.10
        Profit ratio applied to each customer's total spending.

    Returns
    -------
    pandas.DataFrame
        One row per customer with the transaction totals, the intermediate
        metrics, "cltv" and a quartile "segment" label ("D" lowest, "A" highest).

    Notes
    -----
    The churn rate is ``1 - repeat_rate``; when every customer has more than
    one transaction it is 0 and the CLTV division yields infinite values.
    """
    # Data preparation: drop invoices containing "C", non-positive quantities
    # and incomplete rows. The explicit copy makes the later column
    # assignment write to an independent frame instead of a slice.
    dataframe = dataframe[~dataframe["Invoice"].str.contains("C", na=False)]
    dataframe = dataframe[(dataframe['Quantity'] > 0)]
    dataframe = dataframe.dropna().copy()
    dataframe["TotalPrice"] = dataframe["Quantity"] * dataframe["Price"]
    cltv_c = dataframe.groupby('Customer ID').agg({'Invoice': "nunique",
                                                   'Quantity': "sum",
                                                   'TotalPrice': "sum"})
    cltv_c.columns = ['total_transaction', 'total_unit', 'total_price']
    # avg_order_value
    cltv_c['avg_order_value'] = cltv_c['total_price'] / cltv_c['total_transaction']
    # purchase_frequency
    cltv_c["purchase_frequency"] = cltv_c['total_transaction'] / cltv_c.shape[0]
    # repeat rate & churn rate
    repeat_rate = cltv_c[cltv_c.total_transaction > 1].shape[0] / cltv_c.shape[0]
    churn_rate = 1 - repeat_rate
    # profit_margin
    cltv_c['profit_margin'] = cltv_c['total_price'] * profit
    # Customer Value
    cltv_c['customer_value'] = (cltv_c['avg_order_value'] * cltv_c["purchase_frequency"])
    # Customer Lifetime Value
    cltv_c['cltv'] = (cltv_c['customer_value'] / churn_rate) * cltv_c['profit_margin']
    # Segment
    cltv_c["segment"] = pd.qcut(cltv_c["cltv"], 4, labels=["D", "C", "B", "A"])

    return cltv_c

# CLTV-Prediction
*  1. Verinin Hazırlanması (Data Preparation)
*  2. BG-NBD Modeli ile Expected Number of Transaction
*  3. Gamma-Gamma Modeli ile Expected Average Profit
*  4. BG-NBD ve Gamma-Gamma Modeli ile CLTV'nin Hesaplanması
*  5. CLTV'ye Göre Segmentlerin Oluşturulması


In [11]:
def create_cltv_p(dataframe, month=3):
    """Predict customer lifetime value with BG-NBD and Gamma-Gamma models.

    The input frame is not modified; cleaning and outlier capping happen on
    a copy.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Transaction rows with the columns "Invoice", "Quantity", "Price",
        "InvoiceDate" and "Customer ID".
    month : int, default 3
        Horizon passed as ``time`` to ``customer_lifetime_value``.

    Returns
    -------
    pandas.DataFrame
        Per-customer recency, T, frequency, monetary, the expected purchase
        counts for 1 week / 1 month / 3 months, the expected average profit,
        "clv" and a quartile "segment" label ("D" lowest, "A" highest).

    Notes
    -----
    Requires the third-party ``lifetimes`` package and the helper
    ``replace_with_thresholds`` defined elsewhere in this file. The analysis
    date is fixed to 2011-12-11, which is specific to the example data set.
    """
    # !pip install lifetimes
    import datetime as dt
    import pandas as pd
    from lifetimes import BetaGeoFitter
    from lifetimes import GammaGammaFitter

    # 1. Data preprocessing
    # dropna() (not in place) and the final copy keep the caller's frame
    # intact and give replace_with_thresholds an independent frame to edit.
    dataframe = dataframe.dropna()
    dataframe = dataframe[~dataframe["Invoice"].str.contains("C", na=False)]
    dataframe = dataframe[dataframe["Quantity"] > 0]
    dataframe = dataframe[dataframe["Price"] > 0].copy()
    replace_with_thresholds(dataframe, "Quantity")
    replace_with_thresholds(dataframe, "Price")
    dataframe["TotalPrice"] = dataframe["Quantity"] * dataframe["Price"]
    today_date = dt.datetime(2011, 12, 11)

    cltv_df = dataframe.groupby('Customer ID').agg(
        {'InvoiceDate': [lambda InvoiceDate: (InvoiceDate.max() - InvoiceDate.min()).days,
                         lambda InvoiceDate: (today_date - InvoiceDate.min()).days],
         'Invoice': lambda Invoice: Invoice.nunique(),
         'TotalPrice': lambda TotalPrice: TotalPrice.sum()})

    cltv_df.columns = cltv_df.columns.droplevel(0)
    cltv_df.columns = ['recency', 'T', 'frequency', 'monetary']
    # monetary becomes the average value per transaction
    cltv_df["monetary"] = cltv_df["monetary"] / cltv_df["frequency"]
    # keep only customers with more than one transaction
    cltv_df = cltv_df[(cltv_df['frequency'] > 1)].copy()
    # convert day counts to weeks
    cltv_df["recency"] = cltv_df["recency"] / 7
    cltv_df["T"] = cltv_df["T"] / 7

    # 2. Fitting the BG-NBD model
    bgf = BetaGeoFitter(penalizer_coef=0.001)
    bgf.fit(cltv_df['frequency'],
            cltv_df['recency'],
            cltv_df['T'])

    cltv_df["expected_purc_1_week"] = bgf.predict(1,
                                                  cltv_df['frequency'],
                                                  cltv_df['recency'],
                                                  cltv_df['T'])

    cltv_df["expected_purc_1_month"] = bgf.predict(4,
                                                   cltv_df['frequency'],
                                                   cltv_df['recency'],
                                                   cltv_df['T'])

    cltv_df["expected_purc_3_month"] = bgf.predict(12,
                                                   cltv_df['frequency'],
                                                   cltv_df['recency'],
                                                   cltv_df['T'])

    # 3. Fitting the Gamma-Gamma model
    ggf = GammaGammaFitter(penalizer_coef=0.01)
    ggf.fit(cltv_df['frequency'], cltv_df['monetary'])
    cltv_df["expected_average_profit"] = ggf.conditional_expected_average_profit(cltv_df['frequency'],
                                                                                 cltv_df['monetary'])

    # 4. CLTV from the BG-NBD and Gamma-Gamma models
    cltv = ggf.customer_lifetime_value(bgf,
                                       cltv_df['frequency'],
                                       cltv_df['recency'],
                                       cltv_df['T'],
                                       cltv_df['monetary'],
                                       time=month,  # horizon in months
                                       freq="W",  # time unit of T (weeks)
                                       discount_rate=0.01)

    cltv = cltv.reset_index()
    cltv_final = cltv_df.merge(cltv, on="Customer ID", how="left")
    cltv_final["segment"] = pd.qcut(cltv_final["clv"], 4, labels=["D", "C", "B", "A"])

    return cltv_final



# Rating
* Average
* Time-Based Weighted Average
* User-Based Weighted Average
* Weighted Rating
* Bayesian Average Rating Score

In [12]:
def time_based_weighted_average(dataframe, w1=28, w2=26, w3=24, w4=22):
    """Return the rating average weighted by how recent each rating is.

    Ratings are split into four bands by the "days" column (<= 30, 31-90,
    91-180, > 180); the mean "Rating" of each band is weighted by the
    matching percentage.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame with the columns "days" and "Rating".
    w1, w2, w3, w4 : int or float
        Percentage weights of the bands, from most recent to oldest.

    Returns
    -------
    float
        Weighted average rating. A band without rows has a NaN mean, which
        makes the whole result NaN.
    """
    days = dataframe["days"]
    return (dataframe.loc[days <= 30, "Rating"].mean() * w1 / 100 +
            dataframe.loc[(days > 30) & (days <= 90), "Rating"].mean() * w2 / 100 +
            dataframe.loc[(days > 90) & (days <= 180), "Rating"].mean() * w3 / 100 +
            dataframe.loc[days > 180, "Rating"].mean() * w4 / 100)


In [13]:
def user_based_weighted_average(dataframe, w1=22, w2=24, w3=26, w4=28):
    """Return the rating average weighted by the rater's course progress.

    Ratings are split into four bands by the "Progress" column (<= 10,
    10-45, 45-75, > 75); the mean "Rating" of each band is weighted by the
    matching percentage.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame with the columns "Progress" and "Rating".
    w1, w2, w3, w4 : int or float
        Percentage weights of the bands, from lowest to highest progress.

    Returns
    -------
    float
        Weighted average rating.
    """
    progress = dataframe["Progress"]
    band_1 = dataframe.loc[progress <= 10, "Rating"].mean()
    band_2 = dataframe.loc[(progress > 10) & (progress <= 45), "Rating"].mean()
    band_3 = dataframe.loc[(progress > 45) & (progress <= 75), "Rating"].mean()
    band_4 = dataframe.loc[progress > 75, "Rating"].mean()
    return band_1 * w1 / 100 + band_2 * w2 / 100 + band_3 * w3 / 100 + band_4 * w4 / 100

In [14]:
def course_weighted_rating(dataframe, time_w=50, user_w=50):
    """Blend the time-based and the user-based weighted averages.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame accepted by both ``time_based_weighted_average`` and
        ``user_based_weighted_average``.
    time_w, user_w : int or float
        Percentage weights of the two component scores.

    Returns
    -------
    float
        Combined rating.
    """
    time_component = time_based_weighted_average(dataframe) * time_w/100
    user_component = user_based_weighted_average(dataframe)*user_w/100
    return time_component + user_component

In [15]:
def bayesian_average_rating(n, confidence=0.95):
    """Return the Bayesian average rating score for a star distribution.

    Parameters
    ----------
    n : sequence of numbers
        Vote counts ordered from the lowest to the highest star. A list, a
        tuple or a pandas Series with any kind of index is accepted; the
        values are read by position.
    confidence : float, default 0.95
        Confidence level that determines the z-score.

    Returns
    -------
    float
        Lower bound of the expected rating; 0 when there are no votes.
    """
    # Materialise the values so a label-indexed Series is read by position
    # rather than by label.
    counts = list(n)
    N = sum(counts)
    if N == 0:
        return 0
    K = len(counts)
    z = st.norm.ppf(1 - (1 - confidence) / 2)
    first_part = 0.0
    second_part = 0.0
    # `star` runs from 1 to K; every count gets one pseudo-vote (+1).
    for star, n_k in enumerate(counts, start=1):
        first_part += star * (n_k + 1) / (N + K)
        second_part += star * star * (n_k + 1) / (N + K)
    score = first_part - z * math.sqrt((second_part - first_part * first_part) / (N + K + 1))
    return score

# df["bar_score"] = df.apply(lambda x: bayesian_average_rating(x[["1_point",
#                                                                 "2_point",
#                                                                 "3_point",
#                                                                 "4_point",
#                                                                 "5_point"]]), axis=1)

# Sorting
* Sorting by Rating
* Sorting by Comment Count or Purchase Count
* Sorting by Rating, Comment and Purchase
* Sorting by Bayesian Average Rating Score (Sorting Products with 5 Star Rated) (yeni ürünlerin pazarda öne çıkabilmesine yarıyor) cunku çok sayıdakı puan alan urunlerın 5 yıldızlı dagılımı ınceler ve daha az sayıdakı yorumluları potansıyel gelebılecegı yerı tahmınler
* Hybrid Sorting: BAR Score + Diğer Faktorler

In [16]:
def weighted_sorting_score(dataframe, w1=32, w2=26, w3=42):
    """Combine comment count, purchase count and rating into one score.

    Both counts are rescaled to the 1-5 range with ``MinMaxScaler`` and
    stored on ``dataframe`` as "purchase_count_scaled" and
    "comment_count_scaled" before the weighted sum is computed.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame with the columns "purchase_count", "commment_count" and
        "rating". The frame gains the two scaled columns as a side effect.
    w1, w2, w3 : int or float
        Percentage weights of the scaled comment count, the scaled purchase
        count and the rating.

    Returns
    -------
    pandas.Series
        Weighted score per row.
    """
    dataframe["purchase_count_scaled"] = MinMaxScaler(feature_range=(1, 5)). \
        fit(dataframe[["purchase_count"]]). \
        transform(dataframe[["purchase_count"]])

    # NOTE(review): the triple-m spelling "commment_count" is kept as-is;
    # presumably it matches the column name in the data set - confirm.
    dataframe["comment_count_scaled"] = MinMaxScaler(feature_range=(1, 5)). \
        fit(dataframe[["commment_count"]]). \
        transform(dataframe[["commment_count"]])

    return (dataframe["comment_count_scaled"] * w1 / 100 +
            dataframe["purchase_count_scaled"] * w2 / 100 +
            dataframe["rating"] * w3 / 100)

# df["weighted_sorting_score"] = weighted_sorting_score(df)

In [17]:
def hybrid_sorting_score(dataframe, bar_w=60, wss_w=40):
    """Blend the Bayesian average rating with the weighted sorting score.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame with the columns "1_point" ... "5_point" plus whatever
        ``weighted_sorting_score`` needs.
    bar_w, wss_w : int or float
        Percentage weights of the two component scores.

    Returns
    -------
    pandas.Series
        Hybrid score per row.
    """
    point_columns = ["1_point",
                     "2_point",
                     "3_point",
                     "4_point",
                     "5_point"]

    def row_bar_score(row):
        # Bayesian score from the star distribution of one row.
        return bayesian_average_rating(row[point_columns])

    bar_score = dataframe.apply(row_bar_score, axis=1)
    wss_score = weighted_sorting_score(dataframe)

    return bar_score*bar_w/100 + wss_score*wss_w/100


# df["hybrid_sorting_score"] = hybrid_sorting_score(df)

# df.sort_values("hybrid_sorting_score", ascending=False).head(20)

In [18]:
# vote_average * vote_count: hızlı yaklaşım için ayrı, basit bir yöntem
########################

# df["average_count_score"] = df["vote_average"] * df["vote_count_score"]

# df.sort_values("average_count_score", ascending=False).head(20)

# IMDB Eski sıralama yöntemi

* r = vote average
* v = vote count
* M = minimum votes required to be listed in the Top 250
* C = the mean vote across the whole report (currently 7.0)

In [19]:
# M = 2500
# C = df['vote_average'].mean()

def weighted_rating(r, v, M, C):
    """Return the weighted rating that shrinks an average towards a prior.

    Parameters
    ----------
    r : number or array-like
        Average rating of the item.
    v : number or array-like
        Number of votes for the item.
    M : number
        Minimum number of votes required to be listed.
    C : number
        Mean rating across all items.

    Returns
    -------
    number or array-like
        ``v / (v + M) * r + M / (v + M) * C``.
    """
    total_votes = v + M
    own_share = v / total_votes * r
    prior_share = M / total_votes * C
    return own_share + prior_share

# df["weighted_rating"] = weighted_rating(df["vote_average"],
#                                         df["vote_count"], M, C)
# df.sort_values("weighted_rating", ascending=False).head(10)

Aşağıdaki fonksiyonda zor olan kısım çağırılma şekilleri

In [20]:
def bayesian_average_rating(n, confidence=0.95):
    """Return the Bayesian average rating score for a rating distribution.

    Parameters
    ----------
    n : sequence of numbers
        Vote counts ordered from the lowest to the highest rating value. A
        list, a tuple or a pandas Series with any kind of index is accepted;
        the values are read by position.
    confidence : float, default 0.95
        Confidence level that determines the z-score.

    Returns
    -------
    float
        Lower bound of the expected rating; 0 when there are no votes.
    """
    # Materialise the values so a label-indexed Series is read by position
    # rather than by label.
    counts = list(n)
    N = sum(counts)
    if N == 0:
        return 0
    K = len(counts)
    z = st.norm.ppf(1 - (1 - confidence) / 2)
    first_part = 0.0
    second_part = 0.0
    # `level` runs from 1 to K; every count gets one pseudo-vote (+1).
    for level, n_k in enumerate(counts, start=1):
        first_part += level * (n_k + 1) / (N + K)
        second_part += level * level * (n_k + 1) / (N + K)
    score = first_part - z * math.sqrt((second_part - first_part * first_part) / (N + K + 1))
    return score

# df["bar_score"] = df.apply(lambda x: bayesian_average_rating(x[["one", "two", "three", "four", "five",
#                                                                 "six", "seven", "eight", "nine", "ten"]]), axis=1)
# df.sort_values("bar_score", ascending=False).head(20)

# Sorting Reviews (Wilson Lower Bound)

In [21]:
def wilson_lower_bound(up, down, confidence=0.95):
    """
    Compute the Wilson Lower Bound score.

    - The lower bound of the confidence interval for the Bernoulli parameter p
      is taken as the WLB score.
    - The score is used for ranking products.
    - Note:
    If the scores are on a 1-5 scale, 1-3 can be marked as negative and 4-5 as
    positive to fit the Bernoulli setting. This brings some problems of its
    own, which is why a Bayesian average rating is needed in that case.

    Parameters
    ----------
    up: int
        up count
    down: int
        down count
    confidence: float
        confidence

    Returns
    -------
    wilson score: float
        0 when there are no votes at all.

    """
    import math
    import scipy.stats as st

    n = up + down
    if n == 0:
        return 0
    z = st.norm.ppf(1 - (1 - confidence) / 2)
    phat = 1.0 * up / n
    return (phat + z * z / (2 * n) - z * math.sqrt((phat * (1 - phat) + z * z / (4 * n)) / n)) / (1 + z * z / n)


# wilson_lower_bound
# comments["wilson_lower_bound"] = comments.apply(lambda x: wilson_lower_bound(x["up"], x["down"]), axis=1) çağırılma şekli

# A/B Testing
* 1. Hipotezleri Kur
* 2. Varsayım Kontrolü
*  - 1. Normallik Varsayımı
*  - 2. Varyans Homojenliği
* 3. Hipotezin Uygulanması
* - 1. Varsayımlar sağlanıyorsa bağımsız iki örneklem t testi (parametrik test)
* - 2. Varsayımlar sağlanmıyorsa mannwhitneyu testi (non-parametrik test)
* 4. p-value değerine göre sonuçları yorumla
Not:
* Normallik sağlanmıyorsa direk 2 numara. Varyans homojenliği sağlanmıyorsa 1 numaraya arguman girilir.
* Normallik incelemesi öncesi aykırı değer incelemesi ve düzeltmesi yapmak faydalı olabilir.


*  p-value < 0.05 ise H0 RED.
*  p-value < 0.05 değilse H0 REDDEDİLEMEZ.


In [22]:
# A/B Testing Function - Quick Solution
# ab["version"] = np.where(ab.version == "gate_30", "A", "B")

def AB_Test(dataframe, group, target):
    """Run a two-group A/B test and return a one-row summary frame.

    Workflow:
    1. Shapiro-Wilk normality test on each group (alpha = 0.05).
    2. If neither test rejects normality, Levene's test decides whether the
       t-test assumes equal variances.
    3. Otherwise the Mann-Whitney U test is used.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame holding the group labels and the measured values.
    group : str
        Column whose values are "A" and "B".
    target : str
        Column with the metric to compare.

    Returns
    -------
    pandas.DataFrame
        Single row with "Test Type", "AB Hypothesis", "p-value", "Comment"
        and, for the parametric path only, "Homogeneity".
    """
    # Packages
    import scipy.stats as stats
    from scipy.stats import shapiro

    # Split A/B
    sample_a = dataframe.loc[dataframe[group] == "A", target]
    sample_b = dataframe.loc[dataframe[group] == "B", target]

    # Assumption: normality (H0: the distribution is normal).
    # True means H0 is rejected, i.e. the sample does not look normal.
    rejects_normality_a = shapiro(sample_a)[1] < 0.05
    rejects_normality_b = shapiro(sample_b)[1] < 0.05
    parametric = not rejects_normality_a and not rejects_normality_b

    if parametric:
        # Assumption: homogeneity of variances (H0: variances are equal).
        variances_differ = stats.levene(sample_a, sample_b)[1] < 0.05
        # The t-test assumes equal variances only when Levene does not reject.
        p_value = stats.ttest_ind(sample_a, sample_b, equal_var=not variances_differ)[1]
    else:
        # Non-parametric alternative
        p_value = stats.mannwhitneyu(sample_a, sample_b)[1]

    # Result row; the insertion order fixes the column order of the output.
    rejected = p_value < 0.05
    row = {"Test Type": "Parametric" if parametric else "Non-Parametric"}
    if parametric:
        row["Homogeneity"] = "No" if variances_differ else "Yes"
    row["AB Hypothesis"] = "Reject H0" if rejected else "Fail to Reject H0"
    row["p-value"] = p_value
    row["Comment"] = "A/B groups are not similar!" if rejected else "A/B groups are similar!"
    temp = pd.DataFrame([row])

    # Print Hypothesis
    print("# A/B Testing Hypothesis")
    print("H0: A == B")
    print("H1: A != B", "\n")

    return temp
    
    
    
# Apply A/B Testing
# AB_Test(dataframe=ab, group = "version", target = "sum_gamerounds")

# A/B Testing iki örneklem oran testi

In [23]:
from statsmodels.stats.proportion import proportions_ztest
# df = sns.load_dataset("titanic")
# df.head()

# df.loc[df["sex"] == "female", "survived"].mean()

# df.loc[df["sex"] == "male", "survived"].mean()

# female_succ_count = df.loc[df["sex"] == "female", "survived"].sum()
# male_succ_count = df.loc[df["sex"] == "male", "survived"].sum()

# test_stat, pvalue = proportions_ztest(count=[female_succ_count, male_succ_count],
#                                       nobs=[df.loc[df["sex"] == "female", "survived"].shape[0],
#                                             df.loc[df["sex"] == "male", "survived"].shape[0]])
# print('Test Stat = %.4f, p-value = %.4f' % (test_stat, pvalue))

# ANOVA
Aşağıdaki cellin hepsi seçilerek comment out yaptıktan sonra kodlar incelenebilir.

In [24]:


# # 1. Hipotezleri kur

# # HO: m1 = m2 = m3 = m4
# # Grup ortalamaları arasında fark yoktur.

# # H1: .. fark vardır

# # 2. Varsayım kontrolü

# # Normallik varsayımı
# # Varyans homojenliği varsayımı

# # Varsayım sağlanıyorsa one way anova
# # Varsayım sağlanmıyorsa kruskal

# # H0: Normal dağılım varsayımı sağlanmaktadır.

# for group in list(df["day"].unique()):
#     pvalue = shapiro(df.loc[df["day"] == group, "total_bill"])[1]
#     print(group, 'p-value: %.4f' % pvalue)
# ##### burada herhangi ikiliden biri sağlansaydı ne olacaktı?
# # test_stat, pvalue = shapiro(df.loc[df["Outcome"] == 0, "Age"].dropna())
# # H0: Varyans homojenliği varsayımı sağlanmaktadır.

# test_stat, pvalue = levene(df.loc[df["day"] == "Sun", "total_bill"],
#                            df.loc[df["day"] == "Sat", "total_bill"],
#                            df.loc[df["day"] == "Thur", "total_bill"],
#                            df.loc[df["day"] == "Fri", "total_bill"])
# print('Test Stat = %.4f, p-value = %.4f' % (test_stat, pvalue))


# # 3. Hipotez testi ve p-value yorumu

# # Hiç biri sağlamıyor.
# df.groupby("day").agg({"total_bill": ["mean", "median"]})


# # HO: Grup ortalamaları arasında ist ol anl fark yoktur

# # parametrik anova testi: #bunların alfa değeri nedir
# f_oneway(df.loc[df["day"] == "Thur", "total_bill"],
#          df.loc[df["day"] == "Fri", "total_bill"],
#          df.loc[df["day"] == "Sat", "total_bill"],
#          df.loc[df["day"] == "Sun", "total_bill"])

# # Nonparametrik anova testi:
# kruskal(df.loc[df["day"] == "Thur", "total_bill"],
#         df.loc[df["day"] == "Fri", "total_bill"],
#         df.loc[df["day"] == "Sat", "total_bill"],
#         df.loc[df["day"] == "Sun", "total_bill"])

# from statsmodels.stats.multicomp import MultiComparison
# comparison = MultiComparison(df['total_bill'], df['day'])
# tukey = comparison.tukeyhsd(0.05)
# print(tukey.summary())

# Association Rule Learning Apriori

* 1. Veri Ön İşleme
* 2. ARL Veri Yapısını Hazırlama (Invoice-Product Matrix)
* 3. Birliktelik Kurallarının Çıkarılması
* 4. Çalışmanın Scriptini Hazırlama
* 5. Sepet Aşamasındaki Kullanıcılara Ürün Önerisinde Bulunmak

In [25]:



def outlier_thresholds(dataframe, variable):
    """Return the ``(low_limit, up_limit)`` outlier bounds for one column.

    The bounds are the 1st / 99th percentile moved outwards by 1.5 times the
    distance between those two percentiles.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame holding the column.
    variable : str
        Name of the column to analyse.

    Returns
    -------
    tuple
        ``(low_limit, up_limit)``.
    """
    values = dataframe[variable]
    p01 = values.quantile(0.01)
    p99 = values.quantile(0.99)
    width = p99 - p01
    up_limit = p99 + 1.5 * width
    low_limit = p01 - 1.5 * width
    return low_limit, up_limit

def replace_with_thresholds(dataframe, variable):
    """Cap a column at both outlier limits, modifying the frame in place.

    Values below the lower limit are raised to it and values above the upper
    limit are lowered to it. The limits come from ``outlier_thresholds``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to modify.
    variable : str
        Name of the column to cap.
    """
    lower, upper = outlier_thresholds(dataframe, variable)
    # Lower side first, then upper side.
    too_low = dataframe[variable] < lower
    dataframe.loc[too_low, variable] = lower
    too_high = dataframe[variable] > upper
    dataframe.loc[too_high, variable] = upper

def retail_data_prep(dataframe):
    """Clean retail transaction data and cap outliers.

    Rows with missing values, invoices containing "C", and non-positive
    quantities or prices are removed; "Quantity" and "Price" are then capped
    with ``replace_with_thresholds``. The input frame is not modified.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame with the columns "Invoice", "Quantity" and "Price".

    Returns
    -------
    pandas.DataFrame
        Cleaned copy of the data.
    """
    cleaned = dataframe.dropna()
    cleaned = cleaned[~cleaned["Invoice"].str.contains("C", na=False)]
    cleaned = cleaned[cleaned["Quantity"] > 0]
    # Explicit copy: the in-place capping below must write to an independent
    # frame, not to a slice of the caller's data.
    cleaned = cleaned[cleaned["Price"] > 0].copy()
    replace_with_thresholds(cleaned, "Quantity")
    replace_with_thresholds(cleaned, "Price")
    return cleaned


def create_invoice_product_df(dataframe, id=False):
    """Build the invoice x product 0/1 matrix used for association rules.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame with the columns "Invoice", "Quantity" and either "StockCode"
        or "Description".
    id : bool, default False
        When true, products are identified by "StockCode"; otherwise by
        "Description".

    Returns
    -------
    pandas.DataFrame
        One row per invoice and one column per product; a cell is 1 when the
        summed quantity is positive and 0 otherwise.
    """
    product_col = "StockCode" if id else "Description"
    basket = dataframe.groupby(['Invoice', product_col])['Quantity'].sum().unstack().fillna(0)
    # Vectorised comparison instead of the deprecated element-wise applymap.
    return basket.gt(0).astype(int)


def check_id(dataframe, stock_code):
    """Print the description of the first row matching a stock code.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame with the columns "StockCode" and "Description".
    stock_code : object
        Stock code to look up; an IndexError is raised when no row matches.
    """
    matching_rows = dataframe[dataframe["StockCode"] == stock_code]
    descriptions = matching_rows[["Description"]].values
    product_name = descriptions[0].tolist()
    print(product_name)


def create_rules(dataframe, id=True, country="France"):
    """Mine association rules for the invoices of one country.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Transaction rows with a "Country" column plus the columns needed by
        ``create_invoice_product_df``.
    id : bool, default True
        Forwarded to ``create_invoice_product_df``.
    country : str, default "France"
        Only rows of this country are used.

    Returns
    -------
    pandas.DataFrame
        Result of ``association_rules`` on the frequent itemsets found by
        ``apriori`` (minimum support 0.01 for both steps).
    """
    country_rows = dataframe[dataframe['Country'] == country]
    basket = create_invoice_product_df(country_rows, id)
    frequent_itemsets = apriori(basket, min_support=0.01, use_colnames=True)
    return association_rules(frequent_itemsets, metric="support", min_threshold=0.01)
def arl_recommender(rules_df, product_id, rec_count=1):
    """Recommend products from association rules for a given product.

    Rules are scanned in descending "lift" order; for every rule whose
    antecedents contain ``product_id`` the first consequent is collected.

    Parameters
    ----------
    rules_df : pandas.DataFrame
        Frame with the columns "antecedents", "consequents" and "lift".
    product_id : object
        Product to find recommendations for.
    rec_count : int, default 1
        Maximum number of recommendations returned.

    Returns
    -------
    list
        Up to ``rec_count`` recommended products (duplicates are possible).
    """
    ranked = rules_df.sort_values("lift", ascending=False)
    recommendation_list = []
    for antecedents, consequents in zip(ranked["antecedents"], ranked["consequents"]):
        for item in list(antecedents):
            if item != product_id:
                continue
            recommendation_list.append(list(consequents)[0])

    return recommendation_list[:rec_count]


# arl_recommender(rules, 22492, 3)

# Content Based Recommendation
*  1. TF-IDF Matrisinin Oluşturulması
* 2. Cosine Similarity Matrisinin Oluşturulması
* 3. Benzerliklere Göre Önerilerin Yapılması

In [26]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def calculate_cosine_sim(dataframe):
    """Return the pairwise cosine similarity of the TF-IDF "overview" texts.

    Missing overviews are replaced by empty strings directly on
    ``dataframe`` before vectorising.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame with an "overview" text column.

    Returns
    -------
    array-like
        Square similarity matrix with one row and one column per input row.
    """
    dataframe['overview'] = dataframe['overview'].fillna('')
    vectorizer = TfidfVectorizer(stop_words='english')
    tfidf_matrix = vectorizer.fit_transform(dataframe['overview'])
    return cosine_similarity(tfidf_matrix, tfidf_matrix)
def content_based_recommender(title, cosine_sim, dataframe):
    """Return the titles most similar to the given title.

    Parameters
    ----------
    title : str
        Title to find neighbours for; for duplicated titles the last
        occurrence is used.
    cosine_sim : array-like
        Similarity matrix indexed by the values of ``dataframe.index``.
    dataframe : pandas.DataFrame
        Frame with a "title" column.

    Returns
    -------
    pandas.Series
        Titles at ranks 2-11 by similarity (rank 1 is skipped because it is
        expected to be the title itself).
    """
    # Map each title to its index value, keeping the last duplicate.
    title_to_index = pd.Series(dataframe.index, index=dataframe['title'])
    title_to_index = title_to_index[~title_to_index.index.duplicated(keep='last')]
    movie_index = title_to_index[title]

    # Similarity of every item to the requested title, best first.
    scores = pd.DataFrame(cosine_sim[movie_index], columns=["score"])
    ranked = scores.sort_values("score", ascending=False)

    # Skip the first row (the title itself) and take the next ten.
    neighbour_positions = ranked.iloc[1:11].index
    return dataframe['title'].iloc[neighbour_positions]

# cosine_sim = calculate_cosine_sim(df)
# content_based_recommender('The Dark Knight Rises', cosine_sim, df)

# Item-Based Collaborative Filtering

* Adım 1: Veri Setinin Hazırlanması
* Adım 2: User Movie Df'inin Oluşturulması
* Adım 3: Item-Based Film Önerilerinin Yapılması

In [27]:
def create_user_movie_df(movie_path='datasets/movie_lens_dataset/movie.csv',
                         rating_path='datasets/movie_lens_dataset/rating.csv',
                         rare_threshold=1000):
    """Build the user x movie rating matrix for frequently rated movies.

    Parameters
    ----------
    movie_path : str, default 'datasets/movie_lens_dataset/movie.csv'
        CSV with at least the columns "movieId" and "title".
    rating_path : str, default 'datasets/movie_lens_dataset/rating.csv'
        CSV with at least the columns "userId", "movieId" and "rating".
    rare_threshold : int, default 1000
        Titles that occur this many times or fewer in the merged data are
        dropped.

    Returns
    -------
    pandas.DataFrame
        Pivot table indexed by "userId" with one column per title, holding
        the ratings.
    """
    import pandas as pd
    movie = pd.read_csv(movie_path)
    rating = pd.read_csv(rating_path)
    df = movie.merge(rating, how="left", on="movieId")
    # Filter on the value_counts Series itself: the name of its column after
    # wrapping it in a DataFrame differs between pandas versions.
    title_counts = df["title"].value_counts()
    rare_movies = title_counts[title_counts <= rare_threshold].index
    common_movies = df[~df["title"].isin(rare_movies)]
    user_movie_df = common_movies.pivot_table(index=["userId"], columns=["title"], values="rating")
    return user_movie_df
def item_based_recommender(movie_name, user_movie_df):
    """Return the ten movies whose ratings correlate most with a given movie.

    Parameters
    ----------
    movie_name : str
        Column of ``user_movie_df`` to compare against.
    user_movie_df : pandas.DataFrame
        User x movie rating matrix.

    Returns
    -------
    pandas.Series
        Correlations sorted in descending order, limited to ten entries.
    """
    target_ratings = user_movie_df[movie_name]
    correlations = user_movie_df.corrwith(target_ratings)
    return correlations.sort_values(ascending=False).head(10)
def check_film(keyword, user_movie_df):
    """Return the column names of ``user_movie_df`` that contain ``keyword``.

    Parameters
    ----------
    keyword : str
        Substring to search for (case-sensitive).
    user_movie_df : pandas.DataFrame
        Frame whose column names are searched.

    Returns
    -------
    list
        Matching column names in their original order.
    """
    titles = user_movie_df.columns
    return [name for name in titles if keyword in name]


# user_movie_df = create_user_movie_df()
# item_based_recommender("Matrix, The (1999)", user_movie_df)
# movie_name = pd.Series(user_movie_df.columns).sample(1).values[0]
# item_based_recommender(movie_name, user_movie_df)

# User-Based Collaborative Filtering
* Adım 1: Veri Setinin Hazırlanması
* Adım 2: Öneri Yapılacak Kullanıcının İzlediği Filmlerin Belirlenmesi
* Adım 3: Aynı Filmleri İzleyen Diğer Kullanıcıların Verisine ve Id'lerine Erişmek
* Adım 4: Öneri Yapılacak Kullanıcı ile En Benzer Davranışlı Kullanıcıların Belirlenmesi
* Adım 5: Weighted Average Recommendation Score'un Hesaplanması

In [28]:
def user_based_recommender(random_user, user_movie_df, ratio=60, cor_th=0.65, score=3.5):
    """Recommend movies to a user from the ratings of similar users.

    Parameters
    ----------
    random_user : object
        Index value in ``user_movie_df`` of the user to recommend for.
    user_movie_df : pandas.DataFrame
        User x movie rating matrix (users in the index, movies in the columns).
    ratio : int or float, default 60
        A user is kept when they rated more than this percentage of the
        movies rated by ``random_user``.
    cor_th : float, default 0.65
        Minimum correlation with ``random_user`` for a user to count as similar.
    score : float, default 3.5
        Minimum mean weighted rating for a movie to be recommended.

    Returns
    -------
    pandas.DataFrame
        Recommended movies with "movieId", "weighted_rating" and "title",
        ordered by weighted rating (descending) before the title merge.

    Notes
    -----
    Reads 'datasets/movie_lens_dataset/rating.csv' and
    'datasets/movie_lens_dataset/movie.csv' from disk on every call.
    """
    import pandas as pd
    # Row(s) of the target user and the movies they have rated.
    random_user_df = user_movie_df[user_movie_df.index == random_user]
    movies_watched = random_user_df.columns[random_user_df.notna().any()].tolist()
    movies_watched_df = user_movie_df[movies_watched]
    # Per user: how many of those movies they rated.
    user_movie_count = movies_watched_df.T.notnull().sum()
    user_movie_count = user_movie_count.reset_index()
    user_movie_count.columns = ["userId", "movie_count"]
    # Keep users who rated more than `ratio` percent of the watched movies.
    perc = len(movies_watched) * ratio / 100
    users_same_movies = user_movie_count[user_movie_count["movie_count"] > perc]["userId"]

    # NOTE(review): the target user's row is appended explicitly; if that user
    # also passes the filter above, the row appears twice - confirm intent.
    final_df = pd.concat([movies_watched_df[movies_watched_df.index.isin(users_same_movies)],
                          random_user_df[movies_watched]])

    # User-to-user correlations in long form.
    # NOTE(review): drop_duplicates() on a Series removes repeated *values*,
    # so distinct user pairs with an identical correlation collapse into one.
    corr_df = final_df.T.corr().unstack().sort_values().drop_duplicates()
    corr_df = pd.DataFrame(corr_df, columns=["corr"])
    corr_df.index.names = ['user_id_1', 'user_id_2']
    corr_df = corr_df.reset_index()

    # Users whose correlation with the target user reaches the threshold.
    top_users = corr_df[(corr_df["user_id_1"] == random_user) & (corr_df["corr"] >= cor_th)][
        ["user_id_2", "corr"]].reset_index(drop=True)

    top_users = top_users.sort_values(by='corr', ascending=False)
    top_users.rename(columns={"user_id_2": "userId"}, inplace=True)
    rating = pd.read_csv('datasets/movie_lens_dataset/rating.csv')
    # No `on=` is given, so the merge joins on the shared column ("userId").
    top_users_ratings = top_users.merge(rating[["userId", "movieId", "rating"]], how='inner')
    # Weight every rating by the rater's correlation with the target user.
    top_users_ratings['weighted_rating'] = top_users_ratings['corr'] * top_users_ratings['rating']

    # Mean weighted rating per movie.
    recommendation_df = top_users_ratings.groupby('movieId').agg({"weighted_rating": "mean"})
    recommendation_df = recommendation_df.reset_index()

    # Keep movies above the score threshold, best first, and attach titles.
    movies_to_be_recommend = recommendation_df[recommendation_df["weighted_rating"] > score].sort_values("weighted_rating", ascending=False)
    movie = pd.read_csv('datasets/movie_lens_dataset/movie.csv')
    return movies_to_be_recommend.merge(movie[["movieId", "title"]])



# random_user = int(pd.Series(user_movie_df.index).sample(1).values)
# user_based_recommender(random_user, user_movie_df, cor_th=0.70, score=4)

# Model-Based Collaborative Filtering: Matrix Factorization

* Adım 1: Veri Setinin Hazırlanması
* Adım 2: Modelleme
* Adım 3: Model Tuning
* Adım 4: Final Model ve Tahmin

Aşağıdaki kodu okumak için önce tüm hücrenin "comment out"unu kaldırın

In [29]:
# from surprise.model_selection import GridSearchCV, train_test_split, cross_validate
# from surprise import Reader, SVD, Dataset, accuracy


# movie = pd.read_csv('datasets/movie_lens_dataset/movie.csv')
# rating = pd.read_csv('datasets/movie_lens_dataset/rating.csv')
# df = movie.merge(rating, how="left", on="movieId")
# df.head()
# movie.head()
# rating.head()
# movie_ids = [130219, 356, 4422, 541]
# movies = ["The Dark Knight (2011)",
#           "Cries and Whispers (Viskningar och rop) (1972)",
#           "Forrest Gump (1994)",
#           "Blade Runner (1982)"]

# sample_df = df[df.movieId.isin(movie_ids)]
# sample_df.head()

# sample_df.shape

# user_movie_df = sample_df.pivot_table(index=["userId"],
#                                       columns=["title"],
#                                       values="rating")

# user_movie_df.shape

# reader = Reader(rating_scale=(1, 5))

# data = Dataset.load_from_df(sample_df[['userId',
#                                        'movieId',
#                                        'rating']], reader)

# ##############################
# # Adım 2: Modelleme
# ##############################

# trainset, testset = train_test_split(data, test_size=.25)
# svd_model = SVD()
# svd_model.fit(trainset)
# predictions = svd_model.test(testset)

# accuracy.rmse(predictions)


# svd_model.predict(uid=1.0, iid=541, verbose=True)

# svd_model.predict(uid=1.0, iid=356, verbose=True)


# sample_df[sample_df["userId"] == 1]

# ##############################
# # Adım 3: Model Tuning
# ##############################

# param_grid = {'n_epochs': [5, 10, 20],
#               'lr_all': [0.002, 0.005, 0.007]}


# gs = GridSearchCV(SVD,
#                   param_grid,
#                   measures=['rmse', 'mae'],
#                   cv=3,
#                   n_jobs=-1,
#                   joblib_verbose=True)

# gs.fit(data)

# gs.best_score['rmse']
# gs.best_params['rmse']


# ##############################
# # Adım 4: Final Model ve Tahmin
# ##############################

# dir(svd_model)
# svd_model.n_epochs

# svd_model = SVD(**gs.best_params['rmse'])

# data = data.build_full_trainset()
# svd_model.fit(data)

# svd_model.predict(uid=1.0, iid=541, verbose=True)