# Business Problem 

An e-commerce company wants to segment its customers and develop marketing strategies for each segment.

Data Set Information:

This Online Retail II data set contains all the transactions of a UK-based and registered, non-store online retailer between 01/12/2009 and 09/12/2011.
The company mainly sells unique all-occasion gift-ware.
Many customers of the company are wholesalers.

Attribute Information:

InvoiceNo: Invoice number. Nominal. A 6-digit integral number uniquely assigned to each transaction. If this code starts with the letter 'C', it indicates a cancellation.
StockCode: Product (item) code. Nominal. A 5-digit integral number uniquely assigned to each distinct product.
Description: Product (item) name. Nominal.
Quantity: The quantities of each product (item) per transaction. Numeric.
InvoiceDate: Invoice date and time. Numeric. The day and time when a transaction was generated.
UnitPrice: Unit price. Numeric. Product price per unit in sterling (£).
CustomerID: Customer number. Nominal. A 5-digit integral number uniquely assigned to each customer.
Country: Country name. Nominal. The name of the country where a customer resides.

# Data Understanding 

In [None]:
import numpy as np
import pandas as pd
import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for file_name in files:
        print(os.path.join(folder, file_name))

In [None]:
# Load the Online Retail II transactions from the Kaggle dataset CSV.
df=pd.read_csv("../input/online-retail-ii-uci/online_retail_II.csv")
# Preview the first rows.
df.head()

In [None]:
# (rows, columns) of the data as loaded.
df.shape

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Rows that carry no customer identifier.
df.loc[df["Customer ID"].isna()]

# Data Preparation

In [None]:
# Discard, in place, every row whose "Customer ID" is missing.
df.dropna(subset=["Customer ID"],inplace=True)
# Shape after the rows were removed.
df.shape

In [None]:
# Cast "Customer ID" to integers (done after the missing values were dropped).
df["Customer ID"]=df["Customer ID"].astype(int)

In [None]:
# Rows with a negative unit price.
df.loc[df["Price"].lt(0)]

In [None]:
# View the data ordered by stock code (returns a sorted copy; df is unchanged).
df.sort_values(by=["StockCode"])

In [None]:
# Rows whose stock code contains the literal text "TEST".
df.loc[df["StockCode"].str.contains("TEST", regex=False)]

In [None]:
# Customer IDs attached to rows whose stock code contains "TEST".
# The column is selected by name rather than by position (the original used
# .iloc[:, 6]) so the lookup does not depend on the column order of the CSV.
test_account_id = df.loc[
    df["StockCode"].str.contains("TEST", regex=False), "Customer ID"
].astype("int")
test_account_id

In [None]:
# Unique customer IDs behind the test rows, as plain Python ints.
# tolist() converts the whole Series at once instead of slicing it one
# element at a time; the set removes duplicates.
test_account_id_set = set(test_account_id.tolist())
test_account_id_list = list(test_account_id_set)
test_account_id_list

In [None]:
# Remove every row belonging to a test account.
# A single isin() mask replaces one full scan + drop per customer id.
delete_invoice = df[df["Customer ID"].isin(test_account_id_list)].index
df.drop(delete_invoice, inplace=True)

In [None]:
# Check whether any rows with a "TEST" stock code are still present.
df.loc[df["StockCode"].str.contains("TEST", regex=False)]

In [None]:
# Index labels of cancelled invoices. Per the data dictionary a cancellation
# is an invoice number that *starts with* "C", so match the prefix instead of
# any occurrence of the letter. na=False keeps a missing invoice number from
# putting NaN into the boolean mask.
order_cancel = df[df["Invoice"].str.startswith("C", na=False)].index
order_cancel

In [None]:
# Remove, in place, the rows whose index labels were collected in order_cancel.
df.drop(order_cancel,inplace=True)

In [None]:
# Sum of the "Price" column for each invoice.
df.groupby("Invoice")[["Price"]].sum()

In [None]:
# (rows, columns) at this point of the clean-up.
df.shape

In [None]:
# View the remaining data ordered by stock code (df itself is not modified).
df.sort_values(by=["StockCode"])

In [None]:
# Rows whose description is exactly "POSTAGE".
df.loc[df["Description"].eq("POSTAGE")]

In [None]:
# Index labels of the rows whose stock code contains the literal text "POST".
post = df.loc[df["StockCode"].str.contains("POST", regex=False)].index
post

In [None]:
# Remove, in place, the rows whose index labels were collected in post.
df.drop(post,inplace=True)

In [None]:
# Line total: quantity multiplied by unit price.
df["TotalPrice"] = df["Quantity"].mul(df["Price"])

# Customer Segmentation with RFM Scores

# Recency

In [None]:
# Largest "InvoiceDate" value in the data.
# NOTE(review): pd.to_datetime is only applied to this column in a later
# cell, so if the column is still text here this is a string comparison —
# confirm the text format sorts chronologically.
df["InvoiceDate"].max()

In [None]:
import datetime as dt
# Fixed reference date used as "today": 9 December 2011, 21:00.
today_date = dt.datetime(year=2011, month=12, day=9, hour=21)
today_date

In [None]:
# Python type of the first remaining "InvoiceDate" value.
# .iloc[0] is positional; the label-based df["InvoiceDate"][0] raises KeyError
# once the row labelled 0 has been dropped by the clean-up steps.
type(df["InvoiceDate"].iloc[0])

In [None]:
# Parse "InvoiceDate" into pandas datetime values so date arithmetic works.
df["InvoiceDate"]=pd.to_datetime(df["InvoiceDate"])

In [None]:
# Most recent invoice date of each customer (first rows only).
df.groupby("Customer ID")[["InvoiceDate"]].max().head()

In [None]:
# Time elapsed between the reference date and each customer's latest invoice.
latest_invoice = df.groupby("Customer ID").agg({"InvoiceDate": "max"})
(today_date - latest_invoice).head()

In [None]:
# Recency: whole days between the reference date and the customer's latest
# invoice. Named aggregation produces the "Recency" column directly.
recency_df = df.groupby("Customer ID").agg(
    Recency=("InvoiceDate", lambda dates: (today_date - dates.max()).days)
)
recency_df.head()

# Frequency

In [None]:
# Count the rows of each (customer, invoice) pair.
# NOTE(review): these are invoice-line counts, so the Frequency built from
# them is a per-customer line count rather than a number of distinct
# invoices — confirm that this is the intended definition.
temp_df = df.groupby(["Customer ID","Invoice"]).agg({"Invoice":"count"})
temp_df.head()

In [None]:
# Frequency: per-customer total of the counts held in temp_df.
freq_df = (
    temp_df.groupby("Customer ID")
    .agg({"Invoice": "sum"})
    .rename(columns={"Invoice": "Frequency"})
)
freq_df.head()

# Monetary

In [None]:
# Monetary: total of "TotalPrice" per customer, produced directly under the
# column name "Monetary" via named aggregation.
monetary_df = df.groupby("Customer ID").agg(Monetary=("TotalPrice", "sum"))
monetary_df.head()

In [None]:
# Print the shapes of the three metric frames on one line.
print(*[frame.shape for frame in (recency_df, freq_df, monetary_df)])

In [None]:
# Place Recency, Frequency and Monetary side by side, aligned on the index.
rfm = pd.concat([recency_df, freq_df, monetary_df], axis="columns")
rfm.head()

In [None]:
# Quintile scores. Recency labels are reversed so that the smallest Recency
# values receive 5; for Frequency and Monetary the largest values receive 5.
ascending_labels = [1, 2, 3, 4, 5]
rfm["RecencyScore"] = pd.qcut(rfm["Recency"], q=5, labels=ascending_labels[::-1])
rfm["FrequencyScore"] = pd.qcut(rfm["Frequency"], q=5, labels=ascending_labels)
rfm["MonetaryScore"] = pd.qcut(rfm["Monetary"], q=5, labels=ascending_labels)
# Join the three scores into one string per customer.
rfm["RFM_Score"] = rfm["RecencyScore"].astype(str).str.cat(
    [rfm["FrequencyScore"].astype(str), rfm["MonetaryScore"].astype(str)]
)
rfm.head()

In [None]:
# Summary statistics of the numeric columns, one row per column.
rfm.describe().transpose()

In [None]:
# Regex patterns over the two-character "<RecencyScore><FrequencyScore>"
# string, each mapped to the name of the segment it denotes.
seg_map = {
    # Recency score 1-2
    r"[1-2][1-2]": "Hibernating",
    r"[1-2][3-4]": "At Risk",
    r"[1-2]5": "Can't Loose",
    # Recency score 3
    r"3[1-2]": "About to Sleep",
    r"33": "Need Attention",
    # Recency score 3-5
    r"[3-4][4-5]": "Loyal Customers",
    r"41": "Promising",
    r"51": "New Customers",
    r"[4-5][2-3]": "Potential Loyalists",
    r"5[4-5]": "Champions",
}

In [None]:
# Build the "<RecencyScore><FrequencyScore>" code, then translate it into a
# segment name through the regex patterns in seg_map.
rf_code = rfm["RecencyScore"].astype(str) + rfm["FrequencyScore"].astype(str)
rfm["Segment"] = rf_code.replace(seg_map, regex=True)
rfm.head()

In [None]:
# Mean and row count of Recency, Frequency and Monetary within each segment.
rfm.groupby("Segment")[["Recency", "Frequency", "Monetary"]].agg(["mean", "count"])

In [None]:
# One-column frame holding the index values (customer ids) of the rows whose
# segment is "Champions".
is_champion = rfm["Segment"] == "Champions"
champions = pd.DataFrame({"Customer_ID": rfm.loc[is_champion].index})
champions.head()

In [None]:
# Write the champions frame to "champions.csv" in the working directory
# (to_csv defaults apply, so the row index is written as the first column).
champions.to_csv("champions.csv")

In [None]:
import squarify
import matplotlib.pyplot as plt
import seaborn as sns
# Move "Customer ID" out of the index so it can be counted as a column.
rfm = rfm.reset_index()
# Distinct customers per segment, largest segment first.
sq1 = (
    rfm.groupby("Segment")["Customer ID"]
    .nunique()
    .sort_values(ascending=False)
    .reset_index()
)
plt.figure(figsize=(14, 8))
# Row 0 is the segment with the most customers; it is left out of the plot.
sq1.drop(index=[0], inplace=True)
sns.barplot(data=sq1, x="Segment", y="Customer ID", palette="Greens_d");