## **Importing required libraries**

In [None]:
import os
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns


# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subfolders, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

In [None]:
# All three CSV files are located in the same Kaggle dataset directory.
_DATASET_DIR = '/kaggle/input/transactional-retail-dataset-of-electronics-store'

dirty_data_path = f'{_DATASET_DIR}/dirty_data.csv'      # path of dirty_data.csv
missing_data_path = f'{_DATASET_DIR}/missing_data.csv'  # path of missing_data.csv
warehouses_path = f'{_DATASET_DIR}/warehouses.csv'      # path of warehouses.csv

## **Reading csv file**

In [None]:
# Load dirty_data.csv into a DataFrame (default parsing options).
dirty_data = pd.read_csv(filepath_or_buffer=dirty_data_path)

In [None]:
# Preview the first five records of dirty_data.
dirty_data.head(n=5)

## **Displaying statistical info of dirty_data**

In [None]:
# Summary statistics of dirty_data as produced by DataFrame.describe().
dirty_data.describe()

In [None]:
# DataFrame.shape is a (number of rows, number of columns) tuple.
print(f'Shape of dirty_data is: {dirty_data.shape}')

## **Checking whether any column contains null values**

In [None]:
# Count the missing (NaN/None) entries in each column.
print(dirty_data.isna().sum())

**So no column contains a null value**

## **Analyzing datatype of each column**

In [None]:
# Show the dtype pandas inferred for every column when reading the CSV.
print(dirty_data.dtypes)

### **The data type of the date column is object instead of datetime, so let's fix this**

In [None]:
# Parse the 'date' column into pandas datetime values, replacing the column in place.
dirty_data['date'] = pd.to_datetime(dirty_data['date'])

In [None]:
# Confirm the dtype of the converted 'date' column.
print(dirty_data['date'].dtype)

In [None]:
# Re-print all column dtypes: after the pd.to_datetime conversion, the 'date'
# column should now be reported as datetime64 instead of object.
print(dirty_data.dtypes)

## **Analyzing order_id**

### **Checking whether order_id is unique or not**

In [None]:
# Report whether every value in the order_id column occurs exactly once.
print(
    "Data in order_id is unique"
    if dirty_data['order_id'].is_unique
    else "Data in order_id is not unique"
)

## **Analyzing nearest_warehouse**

### **Unique item in nearest_warehouse**

In [None]:
# List the distinct raw values of nearest_warehouse to spot inconsistent spellings.
dirty_data['nearest_warehouse'].unique()

### **Thompson and thompson are the same warehouse, but they appear as two different values because of letter case, so they need to be unified**

### **Similarly Nickolson and nickolson are same**

## **To solve this problem, convert the values in nearest_warehouse to title case**

In [None]:
# Normalise the letter case so that e.g. 'thompson' and 'Thompson' collapse
# into a single category.
dirty_data['nearest_warehouse'] = dirty_data['nearest_warehouse'].str.title()

In [None]:
# Re-check the distinct values after the title-case normalisation.
dirty_data['nearest_warehouse'].unique()

**Now this problem is solved**

## **Counting the records for each nearest_warehouse**

In [None]:
# Number of records per nearest_warehouse value; value_counts() returns the
# counts sorted from most to least frequent. Reused by the charts below.
warehouses = dirty_data['nearest_warehouse'].value_counts()
print(warehouses)

## **Visualizing data using bar chart**

In [None]:
# Bar chart of the number of records per nearest_warehouse value.
ax = warehouses.plot.bar(rot=1, color=['r', 'g', 'b'], figsize=(10, 5))
ax.set_title("Quantity of the warehouses\n", size=15)
ax.set_ylabel("Counts");

## **Visualizing data using pie chart**

In [None]:
# Pie chart of each warehouse's share of the records.
# The slice labels come from the value_counts index, so every slice is named
# after the warehouse it actually represents, whatever order the counts are in
# (the previous hard-coded list assumed an order and misspelled a name).
# NOTE(review): explode has three entries, so this assumes exactly three
# distinct warehouses - confirm against the unique() output.
plt.pie(warehouses,
        labels=warehouses.index,
        autopct="%0.1f%%",
        explode=[0,.01,0.1],
        startangle=60,radius=1)
plt.title("Quantity of warehouses", size=15)
plt.show()

### **It seems that Thompson is the nearest warehouse for most customers**

### **Analyzing order_price and order_total**
- order_price: Price before any discounts and delivery charges
- order_total: Price after all discounts and/or delivery charges are applied

## **Displaying Record whose order price is maximum**

In [None]:
# Select every row whose order_price equals the column maximum.
dirty_data.loc[dirty_data['order_price'].eq(dirty_data['order_price'].max())]

## **Displaying Record whose order price is minimum**

In [None]:
# Select every row whose order_price equals the column minimum.
dirty_data.loc[dirty_data['order_price'].eq(dirty_data['order_price'].min())]

## **Now checking whether the order_total(after all discount and delivery charges are applied) is correct or there is some problem in it**

In [None]:
# Show the four price-related columns side by side for the first 10 records.
dirty_data[['order_price', 'delivery_charges', 'coupon_discount', 'order_total']].head(10)

## **Computing and Displaying order_total price of first 10 records**

In [None]:
# Recompute the expected order_total of the first 10 records:
#     order_price reduced by coupon_discount percent, plus delivery_charges,
# rounded to 2 decimal places.
# The earlier loop multiplied order_price by the discount fraction (yielding
# the discount amount, not the discounted price) and then printed that value
# without the delivery charges. A coupon_discount of 0 needs no special case:
# the factor (1 - 0 / 100) is exactly 1.
first_ten = dirty_data.head(10)
discounted_price = first_ten['order_price'] * (1 - first_ten['coupon_discount'] / 100)
expected_totals = (discounted_price + first_ten['delivery_charges']).round(2)

# enumerate gives the positional index 0..9 used in the printed message.
for index, final_price in enumerate(expected_totals):
    print(f'Order total of record at index {index} is: {final_price} ')

### **It looks like there are errors in the order_total column of dirty_data, so let's fix them**

In [None]:
lis = []     # empty list that will hold the recomputed order totals

## **Computing order_total for every record and storing it in lis**

In [None]:
# Recompute order_total for every record as
#     order_price * (1 - coupon_discount / 100) + delivery_charges
# rounded to 2 decimal places, and store the results in `lis` in row order.
# The earlier loop used order_price * coupon_discount / 100 (the discount
# amount rather than the discounted price) and dropped the delivery charges
# for discounted orders. Rows with coupon_discount == 0 are handled by the same
# formula because the factor is exactly 1.
# Rebinding `lis` (instead of appending) keeps the cell idempotent: re-running
# it no longer doubles the list.
discounted_price = dirty_data['order_price'] * (1 - dirty_data['coupon_discount'] / 100)
lis = (discounted_price + dirty_data['delivery_charges']).round(2).tolist()


In [None]:
# Sanity check: the number of recomputed totals should equal the number of rows.
print(f'Items (Price) in list is: {len(lis)}')

### **Converting list to Series and set it in order_total**

In [None]:
# Overwrite order_total with the recomputed values.
# The Series is built on dirty_data's own index so the values are assigned
# positionally; a bare pd.Series(lis) would be aligned by label against a fresh
# 0..n-1 index and silently produce NaN for any non-matching row label.
dirty_data['order_total'] = pd.Series(lis, index=dirty_data.index)

**Now the problem with order_total is solved**

## **Removing outlier**
- Using IQR

### **Checking for Outlier in order_total**

In [None]:
# 25th and 75th percentiles of order_total; the IQR rule below builds on them.
Q1_order_total = dirty_data['order_total'].quantile(q=0.25)  # lower quartile
Q3_order_total = dirty_data['order_total'].quantile(q=0.75)  # upper quartile

In [None]:
# Print both quartiles, one per line.
print(
    f'Lower Quartile is: {Q1_order_total}',
    f'Upper Quartile is: {Q3_order_total}',
    sep='\n',
)

## **Finding the IQR (interquartile range)**

In [None]:
# Interquartile range: upper quartile minus lower quartile, i.e. the width of
# the interval between the 25th and 75th percentiles of order_total.

IQR_order_total = Q3_order_total - Q1_order_total    

In [None]:
# Display the interquartile range computed above.
print(f'Value of IQR_order_total is: {IQR_order_total}')

## **Finding lower and upper limit**

In [None]:
# Outlier limits: 1.5 * IQR below the lower quartile and above the upper quartile.
iqr_margin = 1.5 * IQR_order_total
lower_limit_order_total = Q1_order_total - iqr_margin
upper_limit_order_total = Q3_order_total + iqr_margin

In [None]:
# Print both limits, one per line.
print(
    f'Lower limit of order_total is: {lower_limit_order_total}',
    f'Upper limit of order_total is: {upper_limit_order_total}',
    sep='\n',
)

### **Any value less than the lower limit or greater than the upper limit is an outlier**

## **Displaying all outlier values**

In [None]:
# Rows whose order_total lies strictly below the lower limit or strictly above
# the upper limit.
outlier_dirty_data = dirty_data.loc[
    dirty_data['order_total'].lt(lower_limit_order_total)
    | dirty_data['order_total'].gt(upper_limit_order_total)
]

In [None]:
# Display the rows selected as outliers.
outlier_dirty_data

## **The records above are outliers, so remove them from the original data**

In [None]:
# Number of rows flagged as outliers.
print(f'Total outlier records in dirty_data is: {len(outlier_dirty_data)}')

### **Any value that lies between the lower limit and the upper limit is not an outlier**

In [None]:
# Keep every row whose order_total lies within [lower limit, upper limit].
# The bounds are inclusive (Series.between default) so that this is the exact
# complement of the outlier filter, which uses strict < / >: with strict
# comparisons here, a row lying exactly on a limit was dropped even though it
# was never counted as an outlier.
data_without_outlier = dirty_data[
    dirty_data['order_total'].between(lower_limit_order_total, upper_limit_order_total)
]

In [None]:
# Number of rows remaining after the outliers are excluded.
print(f'Total numbers of records without outliers is: {len(data_without_outlier)}')

## **Displaying top 5 records of data without outliers**

In [None]:
# Preview the first five records of the outlier-free data.
data_without_outlier.head(n=5)

## **Now analyzing how many customers are happy with the service**

In [None]:
# List the distinct values stored in is_happy_customer.
data_without_outlier['is_happy_customer'].unique()

In [None]:
# Number of records per is_happy_customer value (most frequent first);
# reused by the charts below.
satisfaction = data_without_outlier['is_happy_customer'].value_counts()
print(satisfaction)

## **Visualizing Data**

In [None]:
# Bar chart and pie chart of customer satisfaction, side by side.
# Labels and bar colours are derived from the value_counts index, so each
# bar/slice is named after the value it actually represents instead of relying
# on the order in which value_counts() happens to return the categories.
# The misspelled label 'Not Satifty' is corrected to 'Not Satisfy'.
# NOTE(review): assumes is_happy_customer holds boolean-like flags
# (truthy = happy customer) - confirm against the unique() output.
satisfaction_labels = ['Satisfy' if flag else 'Not Satisfy' for flag in satisfaction.index]
satisfaction_colors = ['g' if flag else 'r' for flag in satisfaction.index]

plt.figure(figsize=(15,5))

# Left panel: absolute counts.
plt.subplot(121)
satisfaction.plot(kind='bar', color=satisfaction_colors, rot=1)
plt.xticks(range(len(satisfaction_labels)), labels=satisfaction_labels)
plt.ylabel('Counts')
plt.title("Customer satisfaction Vs Not satisfaction\n", size=15)

# Right panel: percentage share; every slice except the first (largest) one is
# pulled out slightly.
plt.subplot(122)
plt.pie(satisfaction, labels=satisfaction_labels, autopct="%.1f%%",
        explode=[0] + [0.1] * (len(satisfaction) - 1), shadow=True)
plt.title("Percentage of Customer satisfaction Vs Not satisfaction\n", size=15)

plt.show()

## **70.3% of customers are satisfied with the service**