<h1 align='center'>Customer Analysis For Retail</h1>

# Business Problem

<b>A retail store needs to analyze its day-to-day transactions and keep track of its customers, who are spread across various locations, along with their purchases and returns across various categories.</b><br><br>
<b>Create a report and display the below calculated metrics, reports and inferences.</b>

# Import necessary libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import os

# Import the data set

In [None]:
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subfolders, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

In [None]:
# Load the three source tables in one pass; the order of the paths matches
# the order of the names on the left-hand side.
customer, prod_info, transaction = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/Customer.csv",
        "/kaggle/input/prod_cat_info.csv",
        "/kaggle/input/Transactions.csv",
    )
)

# Exploratory Data Analysis

In [None]:
# (rows, columns) of the customer table.
customer.shape

In [None]:
# (rows, columns) of the product category table.
prod_info.shape

In [None]:
# (rows, columns) of the transactions table.
transaction.shape

In [None]:
# Preview the first two customer rows.
customer.head(2)

In [None]:
# Preview the first two product category rows.
prod_info.head(2)

In [None]:
# Rename "prod_sub_cat_code" in 'prod_info' to "prod_subcat_code" so the key
# has the same name as in 'transaction' and the two tables can be merged on it.
prod_info = prod_info.rename(columns={"prod_sub_cat_code": "prod_subcat_code"})

In [None]:
# Preview the first five transaction rows.
transaction.head()

# 1. Merge the datasets Customers, Product Hierarchy and Transactions as Customer_Final

### Merge 'transaction' and 'prod_info' tables

In [None]:
# merge transaction and prod_info table and create a new table "prod_concat"
prod_concat = pd.merge(left=transaction, right=prod_info,on=["prod_cat_code","prod_subcat_code"],how="left")

In [None]:
prod_concat

In [None]:
prod_concat.isnull().sum()

### Merge 'customer' and 'prod_concat' tables

In [None]:
customer.head()

In [None]:
#merge "prod_concat" and "customer" table and create the final table "customer_final"
customer_final = pd.merge(left=prod_concat, right=customer,right_on="customer_Id", left_on="cust_id", how="left")

In [None]:
# Preview the first five rows of the merged table.
customer_final.head()

In [None]:
# (rows, columns) of the merged table.
customer_final.shape

In [None]:
# (rows, columns) of the original transactions table, for comparison.
transaction.shape

In [None]:
# NOTE: this message is printed unconditionally; compare the two shapes
# displayed above to confirm that the row counts really match.
print('''Rows of both the 'customer_final' and 'transaction' table are same. That means all the transactions done at the 
         Retail Store are present in the final table ''')

In [None]:
# Data type of every column in the merged table.
customer_final.dtypes

In [None]:
# Number of missing values in each column of the merged table.
customer_final.isnull().sum()

In [None]:
# converting "DOB" and "tran_date" from object dtype to dates
customer_final["DOB"] = pd.to_datetime(customer_final["DOB"], format="%d-%m-%Y")

In [None]:
customer_final['DOB'].head(10)

In [None]:
customer_final["tran_date"] = pd.to_datetime(customer_final["tran_date"])

In [None]:
customer_final["tran_date"].head(10)

## Checking for duplicate values


In [None]:
customer_final.duplicated().sum()

In [None]:
# dropping duplicate rows
customer_final.drop_duplicates(inplace=True)

In [None]:
customer_final.duplicated().sum()

# 2. Prepare a summary report for the merged data set.

### (a) Get the column names and their corresponding data types

In [None]:
# Column names of the "customer_final" dataframe
customer_final.columns

In [None]:
# Data types of all columns of the "customer_final" dataframe
customer_final.dtypes

### (b) Top/Bottom 10 observations


In [None]:
# Top 10 observations
customer_final.head(10)

In [None]:
# Bottom 10 observations
customer_final.tail(10)

### (c) “Five-number summary” for continuous variables (min, Q1, median, Q3 and max)

In [None]:
# Summary statistics from DataFrame.describe() with its default arguments;
# the min, 25%, 50%, 75% and max rows form the five-number summary.
customer_final.describe()

### (d) Frequency tables for all the categorical variables

In [None]:
# Summarise only the columns stored with the "object" dtype.
is_object_column = customer_final.dtypes == "object"
customer_final.loc[:, is_object_column].describe()

# (3) Generate histograms for all continuous variables and frequency bars for categorical variables

### Histogram of all continuous variables

In [None]:
conti_customer = customer_final.loc[:,['prod_subcat_code','prod_cat_code', 'Qty', 'Rate', 'Tax', 'total_amt']]

In [None]:
conti_customer.columns

In [None]:
for var in conti_customer.columns:
    conti_customer[var].plot(kind='hist')
    plt.title(var)
    plt.show()

### Bar chart of categorical variables

In [None]:
category_customer = customer_final.loc[:,customer_final.dtypes=='object']

In [None]:
category_customer.head()

In [None]:
plt.figure(figsize=(8,8))
sns.countplot(category_customer['Gender'])
plt.show()

In [None]:
plt.figure(figsize=(8,8))
sns.countplot(category_customer['Store_type'])
plt.xlabel('Store Type')
plt.show()

In [None]:
plt.figure(figsize=(8,8))
sns.countplot(category_customer['prod_cat'])
plt.xlabel('Product Category')
plt.show()

In [None]:
plt.figure(figsize=(8,8))
category_customer.groupby('prod_subcat')['prod_subcat'].count().plot(kind='barh')
plt.xlabel('Count')
plt.ylabel('Product Subcategory')
plt.show()

# (4) Calculate the following information using the merged dataset:

### (a) Time period of the available transaction data

In [None]:
customer_final.sort_values(by="tran_date")

In [None]:
min_date = customer_final["tran_date"].min()

In [None]:
max_date = customer_final["tran_date"].max()

In [None]:
print("Time period of the available transaction data is from "+ pd.Timestamp.strftime(min_date,format="%d-%m-%Y") + " to " + pd.Timestamp.strftime(max_date,format="%d-%m-%Y"))

### (b) Count of transactions where the total amount of transaction was negative

In [None]:
customer_final.head()

In [None]:
#count of transaction_ids where total_amt was negative
negative_transaction = customer_final.loc[customer_final["total_amt"] < 0,"transaction_id"].count()

In [None]:
print("Count of transactions where the total amount of transaction was negative is",negative_transaction)

# (5) Analyze which product categories are more popular among females vs male customers

In [None]:
#groupby the data set on the basis of "Gender" and "prod_cat"
product_gender = customer_final.groupby(["Gender","prod_cat"])[["Qty"]].sum().reset_index()

In [None]:
product_gender

In [None]:
#converting to pivot table for better view
product_gender.pivot(index="Gender",columns="prod_cat",values="Qty")

Products that are popular among males are:
<ul><li>Books</li>
    <li>Clothing</li>
    <li>Electronics</li>
    <li>Home and kitchen</li></ul>

Products that are popular among females are:
<ul><li>Bags</li>
    <li>Footwear</li>
    </ul>

# (6) Which City code has the maximum customers and what was the percentage of customers from that city?

In [None]:
customer_final.head(2)

In [None]:
customer_group = customer_final.groupby('city_code')['customer_Id'].count().sort_values(ascending =False)

In [None]:
customer_group

In [None]:
plt.figure(figsize=(8,5))
customer_group.plot(kind="bar")
plt.xlabel("City Code")
plt.ylabel("No. of customers")
plt.yticks(np.arange(0, 3500, step=500))
plt.show()

In [None]:
percentage = round((customer_group[4.0] / customer_group.sum()) * 100,2)

In [None]:
percentage

In [None]:
print("City code 4.0 has the maximum customers and the percentage of customers from that city is ",percentage)

# (7) Which store type sells the maximum products by value and by quantity?

In [None]:
customer_final.head(2)

In [None]:
customer_final.groupby("Store_type")["Qty","Rate"].sum().sort_values(by="Qty",ascending=False)

In [None]:
print('e-Shop store sell the maximum products by value and by quantity')

# (8) What was the total amount earned from the "Electronics" and "Clothing" categories from Flagship Stores?

In [None]:
store_group = round(customer_final.pivot_table(index = "prod_cat",columns="Store_type", values="total_amt", aggfunc='sum'),2)

In [None]:
store_group

In [None]:
store_group.loc[["Clothing","Electronics"],"Flagship store"]

In [None]:
# if we have to find total amount of both 'Clothing' and 'Electronics' from ' Flagship Store'
store_group.loc[["Clothing","Electronics"],"Flagship store"].sum()

# (9)  What was the total amount earned from "Male" customers under the "Electronics" category?

In [None]:
gender_group = round(customer_final.pivot_table(index = "prod_cat",columns="Gender", values="total_amt", aggfunc='sum'),2)

In [None]:
gender_group

In [None]:
male_earning = gender_group.loc["Electronics","M"]

In [None]:
print("The total amount earned from Male customers under the Electronics category is",male_earning)

# (10) How many customers have more than 10 unique transactions, after removing all transactions which have any negative amounts?

In [None]:
#creating a new dataframe that does not contain transactions with negative values
pos_trans = customer_final.loc[customer_final["total_amt"]>0,:]

In [None]:
pos_trans

In [None]:
# creating a dataframe that contains unique transactions 
unique_trans = pos_trans.groupby(['customer_Id','prod_cat','prod_subcat'])['transaction_id'].count().reset_index()

In [None]:
unique_trans

In [None]:
# now finding the customers which have unique transactions greater than 10
unique_trans_count = unique_trans.groupby('customer_Id')['transaction_id'].count().reset_index()

In [None]:
unique_trans_count.head()

In [None]:
unique_trans_count[unique_trans_count['transaction_id'] > 10]

In [None]:
print('There are no unique transactions greater than 10')

# (11) For all customers aged between 25-35, find out:

### (a) What was the total amount spent for 'Electronics' and 'Books' product categories?

#### Adding new column 'age' 

In [None]:
now = pd.Timestamp('now')
customer_final['DOB'] = pd.to_datetime(customer_final['DOB'], format='%m%d%y')    # 1
customer_final['DOB'] = customer_final['DOB'].where(customer_final['DOB'] < now, customer_final['DOB'] -  np.timedelta64(100, 'Y'))   # 2
customer_final['AGE'] = (now - customer_final['DOB']).astype('<m8[Y]')

In [None]:
customer_final.head()

#### We need to work with customers aged 25-35, so create a new column 'Age_cat' that groups customers into age bands

In [None]:
customer_final['Age_cat'] = pd.cut(customer_final['AGE'],bins=[24,35,46,57],labels=['25-35','36-46','47-57'],include_lowest=True)

In [None]:
customer_final.head()

In [None]:
# grouping the dataframe 'customer_final' on the basis of 'Age_cat' and 'prod_cat'
customer_25_35 = customer_final.groupby(['Age_cat','prod_cat'])['total_amt'].sum()

In [None]:
customer_25_35

In [None]:
customer_25_35.loc['25-35',['Books','Electronics']]

In [None]:
print("Total amount spent on 'Electronics' and 'Books' product categories is", 
      customer_25_35.loc['25-35',['Books','Electronics']].sum().round(2))

### (b) What was the total amount spent by these customers between 1st Jan 2014 to 1st Mar 2014?

In [None]:
customer_final.head()

In [None]:
# filtering out data that belongs to the 'age_cat' = 25-35
customer_total_amount_25_35 = customer_final[customer_final['Age_cat']=='25-35']

In [None]:
customer_total_amount_25_35.head()

In [None]:
# getting all the data with transaction date between 1st Jan 2014 to 1st Mar 2014?
total_amount = customer_total_amount_25_35[(customer_total_amount_25_35['tran_date'] >='2014-01-01') & (customer_total_amount_25_35['tran_date'] <='2014-03-01')]

In [None]:
total_amount

In [None]:
print('The total amount spent by customers aged 25-35 between 1st Jan 2014 to 1st Mar 2014 is',
      total_amount['total_amt'].sum())