# Customer Analysis For Retail

## BUSINESS PROBLEM:
### A retail store needs to analyze its day-to-day transactions and keep track of its customers, spread across various locations, along with their purchases/returns across various categories.

### Create a report and display the below calculated metrics, reports and inferences.


## Import necessary libraries

In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

# set the graphs to show in the jupyter notebook
%matplotlib inline

# set seaborn graphs to a better style
sns.set(style="ticks")

In [None]:

import os

# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subfolders, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        full_path = os.path.join(folder, file_name)
        print(full_path)


In [None]:
# Loading the three datasets, i.e. "Customers", "Transaction" and "Product Hierarchy", from CSV files
Customers = pd.read_csv("/kaggle/input/retail-case-study-data/Customer.csv")
Transaction = pd.read_csv("/kaggle/input/retail-case-study-data/Transactions.csv")
Product_heirarchy = pd.read_csv("/kaggle/input/retail-case-study-data/prod_cat_info.csv")

In [None]:
# (rows, columns) of the Customers table
Customers.shape

In [None]:
# First 10 rows of the Customers table
Customers.head(10)

In [None]:
# (rows, columns) of the Transaction table
Transaction.shape

In [None]:
# First rows of the Transaction table
Transaction.head()

In [None]:
# (rows, columns) of the product hierarchy table
Product_heirarchy.shape

In [None]:
# First rows of the product hierarchy table
Product_heirarchy.head()

In [None]:
# Rename "prod_sub_cat_code" to "prod_subcat_code" so that Product_heirarchy and
# Transaction share the same key column names when they are merged below.
Product_heirarchy.rename(columns = {"prod_sub_cat_code":"prod_subcat_code"},inplace = True)

### 1. Merge the datasets Customers, Product Hierarchy and Transactions as Customer_Final. Ensure to keep all customers who have done transactions with us and select the join type accordingly.

In [None]:
# Attach the product hierarchy to every transaction row. A left join keeps all
# transactions, including any without a matching category/sub-category pair.
product_keys = ["prod_cat_code","prod_subcat_code"]
Data = Transaction.merge(Product_heirarchy, how = "left", on = product_keys)
Data

In [None]:
# Number of missing values per column after the first merge
Data.isnull().sum()

In [None]:
# Add the customer attributes to every transaction row. The left join keeps all
# rows of Data; the key is "cust_id" on the left and "customer_Id" on the right.
Customer_Final = Data.merge(Customers,
                            how = "left",
                            left_on = "cust_id",
                            right_on = "customer_Id")
Customer_Final

In [None]:
# First rows of the merged dataset
Customer_Final.head()

In [None]:
# (rows, columns) of the merged dataset
Customer_Final.shape

In [None]:
# (rows, columns) of Transaction, shown for comparison with the merged dataset
Transaction.shape

In [None]:
# NOTE(review): this conclusion is fixed text; it is not computed from the two
# shapes displayed in the cells above, so re-check it whenever the data changes.
print('''Rows of both "Customer_Final" & "Transaction" are same. That means all transactions done are present
in Customers_Final
'''
)

In [None]:
# Data type of every column in the merged dataset
Customer_Final.dtypes

In [None]:
# Number of missing values per column in the merged dataset
Customer_Final.isnull().sum()

In [None]:
#Converting "DOB" and "tran_date" dtype from object to dates
Customer_Final["DOB"] = pd.to_datetime(Customer_Final["DOB"],format = "%d-%m-%Y")
# DOB is day-first, so parse tran_date day-first as well. Without dayfirst=True an
# ambiguous value such as 02-03-2014 is read month-first (as 3 February).
# NOTE(review): assumes tran_date follows the same day-first convention as DOB;
# confirm against Transactions.csv.
Customer_Final["tran_date"] = pd.to_datetime(Customer_Final["tran_date"], dayfirst = True)

In [None]:
# Check the converted DOB column
Customer_Final["DOB"].head()

In [None]:
# Check the converted tran_date column
Customer_Final["tran_date"].head()

In [None]:
# Checking for duplicates: number of rows flagged as a duplicate of another row
Customer_Final.duplicated().sum()

In [None]:
# Dropping duplicate rows in place (Customer_Final itself is modified)
Customer_Final.drop_duplicates(inplace=True)

In [None]:
# Re-count the duplicated rows after the drop above
Customer_Final.duplicated().sum()

### 2. Prepare a summary report for the merged data set.

### a. Get the column names and their corresponding data types 

In [None]:
# Column names of the merged dataset (this expression shows the names only)
Customer_Final.columns

In [None]:
# Column names with their corresponding data types
Customer_Final.dtypes

### b. Top/Bottom 10 observations

In [None]:
# Top 10 observations
Customer_Final.head(10)

In [None]:
# Bottom 10 observations
Customer_Final.tail(10)

### c. “Five-number summary” for continuous variables (min, Q1, median, Q3 and max)

In [None]:
# Summary statistics of the merged dataset via describe(); for continuous
# variables this lists count, mean, std, min, 25%, 50%, 75% and max.
Customer_Final.describe()

In [None]:
# Five-number summary via quantile(): 0 = min, 0.25 = Q1, 0.5 = median (Q2),
# 0.75 = Q3, 1 = max.
# numeric_only=True restricts the computation to numeric columns; without it,
# current pandas raises on the text columns of the merged dataset.
quant = Customer_Final.quantile([0, 0.25, 0.5, 0.75, 1], numeric_only=True)
Q1 = quant.loc[0.25]
Q3 = quant.loc[0.75]
Min = quant.loc[0]
Max = quant.loc[1]
Median = quant.loc[0.5]

### d. Frequency tables for all the categorical variables

In [None]:
# Summarise every object-dtype column with describe().
# NOTE(review): describe() gives a per-column summary rather than a full
# frequency table per category; confirm this meets the requirement.
is_object_column = Customer_Final.dtypes == "object"
Frequency_tables = Customer_Final.loc[:, is_object_column].describe()
Frequency_tables

### 3. Generate histograms for all continuous variables and frequency bars for categorical variables.

#### Histograms for continuous variables

In [None]:
# Columns treated as continuous for the histograms below.
# NOTE(review): the two "*_code" columns look like category identifiers rather
# than measurements; confirm they belong here.
histogram_columns = ["prod_subcat_code","prod_cat_code","Qty","Rate","Tax","total_amt"]
Continuos_variable = Customer_Final[histogram_columns]

In [None]:
# Names of the columns selected for the histograms
Continuos_variable.columns

In [None]:
# Draw one histogram per selected column, titled with the column name.
for column_name in Continuos_variable.columns:
    column_values = Continuos_variable[column_name]
    column_values.plot.hist()
    plt.title(column_name)
    plt.show()

#### Bar chart for Categorical Variables

In [None]:
# Keep only the object-dtype columns; these are used for the bar charts below.
object_column_mask = Customer_Final.dtypes == "object"
Categorical_variables = Customer_Final.loc[:, object_column_mask]
Categorical_variables

In [None]:
# Frequency bar chart of Gender.
plt.figure(figsize=(8,8))
# Pass the column as x= explicitly: in current seaborn the first positional
# argument is `data`, so a bare Series is not read as the x variable.
sns.countplot(x=Categorical_variables["Gender"])
plt.show()

In [None]:
# Frequency bar chart of Store_type.
plt.figure(figsize=(8,8))
# Pass the column as x= explicitly: in current seaborn the first positional
# argument is `data`, so a bare Series is not read as the x variable.
sns.countplot(x=Categorical_variables["Store_type"])
plt.xlabel("Store Type")
plt.show()

In [None]:
# Frequency bar chart of prod_cat.
plt.figure(figsize=(8,8))
# Pass the column as x= explicitly: in current seaborn the first positional
# argument is `data`, so a bare Series is not read as the x variable.
sns.countplot(x=Categorical_variables["prod_cat"])
plt.xlabel("Product Category")
plt.show()

In [None]:
# Horizontal bar chart of the number of rows per product sub-category.
plt.figure(figsize=(8,8))
rows_per_subcat = Categorical_variables.groupby("prod_subcat").size()
rows_per_subcat.plot.barh()
plt.xlabel("Count")
plt.ylabel("Product sub-category")
plt.show()

### 4. Calculate the following information using the merged dataset :

#### a. Time period of the available transaction data

In [None]:
# Display the rows ordered by transaction date. The sorted result is not
# assigned, so Customer_Final itself keeps its original order.
Customer_Final.sort_values(by="tran_date")

In [None]:
# Earliest transaction date in the data
Start_date = Customer_Final["tran_date"].min()

In [None]:
# Latest transaction date in the data
End_date = Customer_Final["tran_date"].max()

In [None]:
# Report the first and last transaction dates, each formatted as dd-mm-yyyy.
date_pattern = "%d-%m-%Y"
first_day = Start_date.strftime(date_pattern)
last_day = End_date.strftime(date_pattern)
print("Time period of the available transaction data is from " + first_day + " to " + last_day)

#### b. Count of transactions where the total amount of transaction was negative

In [None]:
# Count the non-null transaction ids on rows whose total amount is below zero.
is_negative_amount = Customer_Final["total_amt"] < 0
negative_ids = Customer_Final.loc[is_negative_amount, "transaction_id"]
negative_transaction = negative_ids.count()

In [None]:
# Report the count computed in the previous cell
print("Count of transactions where the total amount of transaction was negative is" , negative_transaction)

### 5. Analyze which product categories are more popular among females vs male customers

In [None]:
# Total quantity per (Gender, prod_cat) pair, kept as a flat table ...
Popular_products = Customer_Final.groupby(["Gender","prod_cat"], as_index = False)[["Qty"]].sum()
# ... then shown as a Gender x prod_cat grid for easy comparison.
qty_by_gender_and_category = Popular_products.set_index(["Gender","prod_cat"])["Qty"]
qty_by_gender_and_category.unstack("prod_cat")

#### Product categories more popular among male customers:
- Books
- Clothing
- Electronics
- Home and Kitchen
 
#### Product categories more popular among female customers:
- Bags
- Footwear


### 6. Which City code has the maximum customers and what was the percentage of customers from that city?

In [None]:
Cust_Grp = Customer_Final.groupby(["city_code"])["customer_Id"].count().sort_values(ascending = False)
Cust_Grp

In [None]:
plt.figure(figsize = (8,5))
Cust_Grp.plot(kind = "bar")
plt.xlabel("City Code")
plt.ylabel("No. of Customers")
plt.yticks(np.arange(0 , 3500, step = 500))
plt.show()

In [None]:
Percentage = round((Cust_Grp[4.0]/Cust_Grp.sum()) * 100,2)
Percentage

In [None]:
print("City code 4.0 has the maximum customers and the percentage of customers from the city is ",Percentage)

### 7. Which store type sells the maximum products by value and by quantity?

In [None]:
# Quantity and value sold per store type, largest quantity first.
# The columns are selected with a list ([[...]]): the tuple-style ["Qty","Rate"]
# selection after groupby is not supported by current pandas.
# "Value" is measured with total_amt (the transaction amount); summing the
# per-unit Rate does not give the value sold.
Customer_Final.groupby(["Store_type"])[["Qty","total_amt"]].sum().sort_values(by="Qty",ascending = False)

In [None]:
# NOTE(review): this conclusion is fixed text read off the table above; it is
# not derived from the data and must be re-checked if the data changes.
print("e-Shop sells the maximum products by value and by quantity")

### 8. What was the total amount earned from the "Electronics" and "Clothing" categories from Flagship Stores?


In [None]:
# Total amount per product category (rows) and store type (columns), rounded to
# two decimals.
amount_by_category_and_store = Customer_Final.pivot_table(values = "total_amt",
                                                          index = "prod_cat",
                                                          columns = "Store_type",
                                                          aggfunc = "sum")
Store_group = amount_by_category_and_store.round(2)
Store_group

In [None]:
# Add up the "Flagship store" amounts of the Clothing and Electronics rows.
flagship_amounts = Store_group.loc[["Clothing","Electronics"],"Flagship store"]
Total_amt = flagship_amounts.sum()
print("the total amount earned from the Electronics and Clothing categories from Flagship Stores is ", Total_amt)

### 9. What was the total amount earned from "Male" customers under the "Electronics" category?


In [None]:
# Total amount per product category (rows) and gender (columns), rounded to two
# decimals.
amount_by_category_and_gender = Customer_Final.pivot_table(values = "total_amt",
                                                           index = "prod_cat",
                                                           columns = "Gender",
                                                           aggfunc = "sum")
Gender_group = amount_by_category_and_gender.round(2)
Gender_group

In [None]:
# Pick the single cell for Electronics bought by male ("M") customers.
Male_amt = Gender_group.at["Electronics","M"]
Male_amt

In [None]:
# Report the amount computed in the previous cell
print("the total amount earned from Males the Electronics category is ", Male_amt)

### 10. How many customers have more than 10 unique transactions, after removing all transactions which have any negative amounts?

In [None]:
# Keep only the rows with a strictly positive total amount (negative and zero
# amounts are both excluded).
has_positive_amount = Customer_Final["total_amt"] > 0
Pos_Trans = Customer_Final[has_positive_amount]
Pos_Trans

In [None]:
# Creating a dataframe that contains unique possitive transactions
Unique_Trans = Pos_Trans.groupby(["customer_Id","prod_cat","prod_subcat"])["transaction_id"].count().reset_index()
Unique_Trans

In [None]:
# Now finding the customers having unique transactions greater than 10
Unique_trans_count = Unique_Trans.groupby("customer_Id")["transaction_id"].count().reset_index()
Unique_trans_count

In [None]:
Unique_trans_count[Unique_trans_count["transaction_id"]>10]

In [None]:
print("There are no unique transactions greater than 10")

### 11. For all customers aged between 25 - 35, find out:


#### a. What was the total amount spent for “Electronics” and “Books” product categories?

In [None]:
# Reference date: ages are computed as of the moment the notebook runs.
now = pd.Timestamp('now')
# DOB is stored day-first (dd-mm-yyyy); values that are already datetimes pass
# through this call unchanged.
Customer_Final['DOB'] = pd.to_datetime(Customer_Final['DOB'], format='%d-%m-%Y')
# Safety net: shift any birth date that is not in the past back by 100 years.
# pd.DateOffset is used because year-based np.timedelta64 units are not
# supported by current pandas.
Customer_Final['DOB'] = Customer_Final['DOB'].where(Customer_Final['DOB'] < now, Customer_Final['DOB'] - pd.DateOffset(years=100))
# Age in completed years: year difference, minus one when this year's birthday
# has not happened yet. Missing DOB values give NaN.
dob = Customer_Final['DOB']
had_birthday = (dob.dt.month < now.month) | ((dob.dt.month == now.month) & (dob.dt.day <= now.day))
Customer_Final['AGE'] = (now.year - dob.dt.year - (~had_birthday).astype(int)).astype(float)

#### Since we need the customers aged between 25 and 35, create a new column 'Age_cat' that groups customers into age bands.

In [None]:
# Bin ages into labelled bands. The intervals are right-closed: (24, 35],
# (35, 46] and (46, 57]. include_lowest is deliberately left off, because it
# would place age 24 in the band labelled '25-35'. Ages outside 25-57 get NaN.
Customer_Final['Age_cat'] = pd.cut(Customer_Final['AGE'],bins=[24,35,46,57],labels=['25-35','36-46','47-57'])

In [None]:
# Display the merged dataset, now including the AGE and Age_cat columns
Customer_Final

In [None]:
# Total amount per (Age_cat, prod_cat) pair across the whole merged dataset.
by_age_and_category = Customer_Final.groupby(['Age_cat','prod_cat'])
Customer_25_35 = by_age_and_category['total_amt'].sum()
Customer_25_35

In [None]:
# Amounts for the Books and Electronics categories within the 25-35 age band
Customer_25_35.loc["25-35" , ["Books" , "Electronics"]]

In [None]:
# Sum of the two category amounts for the 25-35 band, rounded to two decimals
print("Total amount spent on 'Electronics' and 'Books' product categories is", 
      Customer_25_35.loc['25-35',['Books','Electronics']].sum().round(2))

#### b. What was the total amount spent by these customers between 1st Jan, 2014 and 1st Mar, 2014?

In [None]:
# Display the merged dataset
Customer_Final

In [None]:
# Keep the rows whose Age_cat is '25-35'.
in_25_35_band = Customer_Final['Age_cat'] == '25-35'
Customer_total_amount_25_35 = Customer_Final.loc[in_25_35_band]
Customer_total_amount_25_35

In [None]:
# Rows with a transaction date from 1st Jan 2014 to 1st Mar 2014, both ends
# included.
in_date_window = Customer_total_amount_25_35['tran_date'].between('2014-01-01', '2014-03-01')
total_amount = Customer_total_amount_25_35[in_date_window]
total_amount

In [None]:
# Sum of total_amt over the rows selected in the previous cell
print('The total amount spent by customers aged 25-35 between 1st Jan 2014 to 1st Mar 2014 is', total_amount['total_amt'].sum())