In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from mlxtend.frequent_patterns import apriori, association_rules
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the "Year 2010-2011" sheet of the Online Retail II workbook.
data = pd.read_excel(
    "../input/uci-online-retail-ii-data-set/online_retail_II.xlsx",
    sheet_name="Year 2010-2011",
)

In [None]:
data.head()

In [None]:
# Remove returns: rows whose invoice number starts with "C".
is_return = data["Invoice"].astype(str).str.startswith("C")
data = data.loc[~is_return]

In [None]:
data.isna().sum()

In [None]:
ba = data.copy()

In [None]:
# Discard rows that have no product description.
ba = ba.dropna(subset=['Description'])

In [None]:
ba.isna().sum()

In [None]:
ba.head()

In [None]:
ba.shape

In [None]:
# Normalise descriptions to strings with no leading/trailing whitespace.
ba["Description"] = ba["Description"].astype(str).str.strip()

In [None]:
# Exclude rows whose description begins with the (case-sensitive) text "wrong".
starts_with_wrong = ba["Description"].astype(str).str.startswith("wrong")
ba = ba.loc[~starts_with_wrong]

In [None]:
ba.shape

In [None]:
# Drop the columns that the basket analysis does not use.
unused_columns = ['StockCode', "InvoiceDate", "Price", "Country", "Customer ID"]
ba = ba.drop(columns=unused_columns)

In [None]:
ba['Description'].tail()

In [None]:
ba.head()

In [None]:
# Number of rows recorded for each (Invoice, Description) pair.
ba.groupby(['Invoice','Description'])['Description'].count()

In [None]:
# Build the invoice x product matrix: sum Quantity per (Invoice, Description),
# pivot the descriptions into columns, and put 0 where an invoice does not
# contain the product. The result is indexed by Invoice.
quantity_per_item = ba.groupby(['Invoice', 'Description'])['Quantity'].sum()
branch_order = quantity_per_item.unstack().fillna(0)

In [None]:
branch_order.head()

In [None]:
# One-hot encode the basket matrix: True where the invoice's summed quantity
# for a product is non-zero, False otherwise.
# This is a vectorised comparison instead of `applymap` with a Python lambda:
# `applymap` is deprecated in recent pandas and is called once per cell.
# The boolean result is also the input dtype that mlxtend's `apriori` prefers.
encoded = branch_order.ne(0)

In [None]:
encoded.head()

In [None]:
# Mine itemsets whose support is at least 0.04, reporting them by column
# (product) name rather than by column index.
freq_items = apriori(
    encoded,
    min_support=0.04,
    use_colnames=True,
    verbose=True,
)

In [None]:
freq_items

In [None]:
freq_items.sort_values('support', ascending=False)

In [None]:
# Derive rules with confidence >= 0.4 and show them ordered by support, then
# confidence, both descending.
rules = association_rules(freq_items, metric='confidence', min_threshold=0.4)
rules.sort_values(['support', 'confidence'], ascending=[False, False])


* JUMBO BAG PINK POLKADOT ve JUMBO BAG RED RETROSPOT tüm alışverişlerin %4'ünde birlikte bulunuyor. (support)
* JUMBO BAG PINK POLKADOT alanların %67'si JUMBO BAG RED RETROSPOT de alıyor. (confidence)
* JUMBO BAG PINK POLKADOT alımı, JUMBO BAG RED RETROSPOT alınma olasılığını 6.6 kat artırıyor. (lift)

# RFM

In [None]:
rfm = data.copy()

In [None]:
rfm.head()

In [None]:
# Drop every row that has a missing value in any column.
rfm = rfm.dropna()

In [None]:
# Drop the columns that the RFM analysis does not use.
rfm = rfm.drop(columns=['StockCode', 'Country'])

In [None]:
# Cast Customer ID to int so it displays as a whole number. Rows with missing
# values were already removed by the earlier dropna, so the cast cannot hit NaN.
rfm = rfm.astype({"Customer ID": int})

In [None]:
rfm.head()

In [None]:
# Value of each invoice line: Quantity multiplied by Price.
rfm['Total'] = rfm['Quantity'].mul(rfm['Price'])

In [None]:
rfm.head()

In [None]:
# Total revenue per invoice.
rfm.groupby('Invoice')[['Total']].sum()

In [None]:
def count_outliers(values, low_q=0.01, high_q=0.99, whisker=1.5):
    """Count the entries of ``values`` that fall outside the quantile fences.

    The fences are ``low - whisker * (high - low)`` and
    ``high + whisker * (high - low)``, where ``low`` and ``high`` are the
    ``low_q`` and ``high_q`` quantiles of ``values``.

    Parameters
    ----------
    values : pandas.Series
        Numeric series to inspect.
    low_q, high_q : float
        Lower and upper quantiles used as the base of the fences.
    whisker : float
        Multiple of the inter-quantile spread added beyond each quantile.

    Returns
    -------
    int
        Number of entries strictly above the upper fence or strictly below
        the lower fence.
    """
    low = values.quantile(low_q)
    high = values.quantile(high_q)
    spread = high - low
    # Build the boolean mask once and count it directly. Testing the
    # truthiness of every cell of the filtered frame would also evaluate the
    # unrelated datetime and string columns.
    outside = (values > high + whisker * spread) | (values < low - whisker * spread)
    return int(outside.sum())


# Are there outliers? If so, how many?
for feature in ["Quantity", "Price", "Total"]:
    n_outliers = count_outliers(rfm[feature])
    if n_outliers:
        print(feature, "yes")
        print(n_outliers)
    else:
        print(feature, "no")

### Recency

Recency (yenilik): Müşterinin son satın almasından bugüne kadar geçen süre

Recency = Bugünün tarihi − Son satın alma tarihi


In [None]:
rfm['InvoiceDate'].min() # earliest invoice date

In [None]:
rfm['InvoiceDate'].max() # latest invoice date

In [None]:
# Snapshot ("today") date used for the recency calculation.
# It is derived from the data as midnight of the day after the latest invoice.
# A hard-coded midnight of 2011-12-09 produces a negative recency (-1 days) for
# any invoice timestamped later on that same day.
today_date = rfm['InvoiceDate'].max().normalize() + dt.timedelta(days=1)

In [None]:
# When did each customer last make a purchase?
rfm.groupby("Customer ID").agg({"InvoiceDate":"max"}).head()

In [None]:
# Time elapsed between today_date and each customer's most recent purchase.
(today_date - rfm.groupby("Customer ID").agg({"InvoiceDate":"max"})).head() 

In [None]:
# Time since each customer's latest invoice, stored in a column named "Recency".
last_purchase = rfm.groupby("Customer ID").agg({"InvoiceDate": "max"})
temp_df = (today_date - last_purchase).rename(columns={"InvoiceDate": "Recency"})

In [None]:
temp_df.head()

In [None]:
# Keep only the whole-day component of each timedelta.
recency_df = temp_df["Recency"].dt.days

In [None]:
recency_df.head()

### Frequency

Frequency (Sıklık): Toplam satın alma sayısı.

In [None]:
# Count the rows (invoice lines) in each (Customer ID, Invoice) group.
temp_df = rfm.groupby(["Customer ID","Invoice"]).agg({"Invoice":"count"})

In [None]:
temp_df.head()

In [None]:
# How many invoices does each customer have?
temp_df.groupby("Customer ID").agg({"Invoice":"count"}).head()

In [None]:
# Frequency per customer = sum of the per-invoice row counts held in temp_df.
# NOTE(review): this is the customer's total number of invoice lines, not the
# number of distinct invoices. Confirm which definition is intended. Switching
# to a distinct-invoice count would create many ties and may require ranking
# before the later pd.qcut call.
freq_df = (
    temp_df
    .groupby("Customer ID")
    .agg({"Invoice": "sum"})
    .rename(columns={"Invoice": "Frequency"})
)
freq_df.head()

### Monetary

Monetary (Parasal Değer): Müşterinin yaptığı toplam harcama.

In [None]:
# Total spend per customer (sum of the line totals).
monetary_df = rfm.groupby("Customer ID")[["Total"]].sum()

In [None]:
monetary_df.head()

In [None]:
# Name the spend column "Monetary".
monetary_df = monetary_df.rename(columns={"Total": "Monetary"})

In [None]:
print(recency_df.shape,freq_df.shape,monetary_df.shape)

In [None]:
# Combine recency, frequency and monetary side by side, aligned on Customer ID.
# Note: this rebinds `rfm` from the transaction-level frame to the
# customer-level RFM table.
rfm = pd.concat([recency_df, freq_df, monetary_df], axis="columns")

In [None]:
rfm.head()

In [None]:
# Score each metric into quintiles with pd.qcut.
# Recency   : labels run 5 -> 1, so the smallest recency values score 5.
# Frequency : labels run 1 -> 5, so the largest frequency values score 5.
# Monetary  : labels run 1 -> 5, so the largest spend values score 5.
score_labels = (
    ("Recency", [5, 4, 3, 2, 1]),
    ("Frequency", [1, 2, 3, 4, 5]),
    ("Monetary", [1, 2, 3, 4, 5]),
)
for metric, labels in score_labels:
    rfm[metric + "Score"] = pd.qcut(rfm[metric], 5, labels=labels)

In [None]:
rfm.head()

In [None]:
# Preview the RFM code: the R, F and M scores concatenated as strings.
(rfm['RecencyScore'].astype(str) + 
 rfm['FrequencyScore'].astype(str) + 
 rfm['MonetaryScore'].astype(str)).head()

In [None]:
# Customer segment definitions.
# Each key is a regular expression and each value is the segment name that
# replaces a matching string. The map is applied with regex replacement to the
# string formed by RecencyScore followed by FrequencyScore, so the first
# character class constrains the recency score and the second the frequency
# score.
seg_map = {
    r'[1-2][1-2]': 'Hibernating',
    r'[1-2][3-4]': 'At Risk',
    r'[1-2]5': 'Can\'t Loose',
    r'3[1-2]': 'About to Sleep',
    r'33': 'Need Attention',
    r'[3-4][4-5]': 'Loyal Customers',
    r'41': 'Promising',
    r'51': 'New Customers',
    r'[4-5][2-3]': 'Potential Loyalists',
    r'5[4-5]': 'Champions'
}

In [None]:
# Assign every customer to a segment: build the two-character
# recency+frequency code, then translate it through the regex patterns in seg_map.
rf_code = rfm['RecencyScore'].astype(str) + rfm['FrequencyScore'].astype(str)
rfm['Segment'] = rf_code.replace(seg_map, regex=True)
rfm.head()

In [None]:
# Mean and customer count of each RFM metric, per segment.
rfm.groupby("Segment")[["Recency", "Frequency", "Monetary"]].agg(["mean", "count"])

In [None]:
rfm[rfm["Segment"] == "Need Attention"].head()


In [None]:
# One-column table holding the index values (customer IDs) of the
# "Need Attention" segment.
need_attention_ids = rfm[rfm['Segment'] == 'Need Attention'].index
need_att = pd.DataFrame({'Need Attention Customer ID': need_attention_ids})

In [None]:
need_att.to_csv('need_att.csv') # write the table to a CSV file in the working directory