## Importing Libraries

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

## Importing Dataset

In [None]:
df=pd.read_csv("../input/ecommerce-uk-retailer/Ecommerce.csv",encoding= 'unicode_escape')

In [None]:
df

## Cleaning unwanted data

In [None]:
df[df.isnull().any(axis="columns")]

In [None]:
df.info()

In [None]:
null_col=df[df.isnull().any(axis="columns")].index

In [None]:
df.drop(null_col,inplace=True)

In [None]:
df.info()

In [None]:
df["InvoiceDate"]=pd.to_datetime(df["InvoiceDate"])
df["UnitPrice"]=df["UnitPrice"].astype('float')
df["CustomerID"]=df["CustomerID"].astype('int')


In [None]:
df.info()

In [None]:
neg_quantity=df[df['Quantity']<=0].index

In [None]:
df.drop(neg_quantity,inplace=True)

In [None]:
df

## 1. Perform Basic EDA

### a. Boxplot – All Numeric Variables

In [None]:
# One box plot per numeric column, side by side.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(10, 5))
panels = [("Quantity", 'Number of Quantities'), ("UnitPrice", "Price")]
for axis, (column, y_label) in zip(ax, panels):
    axis.boxplot(df[column])
    axis.set_xticklabels([column])
    axis.set_ylabel(y_label)
plt.show()

### b. Histogram – All Numeric Variables

In [None]:
# Keep rows with Quantity below 10000, then of those keep UnitPrice below 3000.
df1 = df.loc[df["Quantity"] < 10000]
df2 = df1.loc[df1["UnitPrice"] < 3000]

In [None]:
# Histograms of Quantity and UnitPrice on two side-by-side axes.
# NOTE(review): the bin edges start at 500 / 100, so values below the first
# edge are not drawn; confirm that showing only the upper tail is intended.
fig, ax = plt.subplots(1, 2, figsize=(10, 5))

ax[0].hist(df2["Quantity"], label="Quantity", bins=[500,1000,2000,3000,4000,5000,6000,7000,8000,9000])
ax[0].set_ylabel("Frequency")
ax[0].set_xlabel('Quantity')
ax[0].set_title("Quantity Distribution")
# A legend per axes: plt.legend() only targets the current (last) axes, which
# left the Quantity panel without its legend.
ax[0].legend()

ax[1].hist(df2["UnitPrice"], label="Price", bins=[100,500,1000,2000,3000,4000,5000,6000,7000,8000,9000])
ax[1].set_ylabel("Frequency")
ax[1].set_xlabel('Price')
ax[1].set_title("Price Distribution")
ax[1].legend()

plt.show()

### c. Distribution Plot – All Numeric Variables

In [None]:
# Keep rows of df2 with Quantity below 2000. The mask is built from df2 itself:
# a mask taken from df has a different index and only works through implicit
# re-alignment (pandas warns "Boolean Series key will be reindexed").
df3 = df2[df2["Quantity"] < 2000]


In [None]:
sns.distplot(df3["Quantity"])
plt.show()

In [None]:
sns.distplot(df3["UnitPrice"])
plt.show()

### d. Aggregation for all numerical Columns

In [None]:
# Summary statistics produced by DataFrame.describe() on the filtered frame df3.
df3.describe()

### e. Unique Values across all columns

In [None]:
# Number of distinct values in each column of df3.
df3.nunique()

### f. Duplicate values across all columns

In [None]:
# Rows of df3 that duplicated() flags, i.e. repeats of an earlier identical row.
df3[df3.duplicated()]

### g. Correlation – Heatmap - All Numeric Variables

In [None]:
# Correlation heatmap of the numeric columns only.
# DataFrame.corr() raises on text columns in pandas >= 2.0; selecting the
# numeric columns first gives the same matrix on old and new versions.
numeric_columns = df3.select_dtypes(include="number")
sns.heatmap(numeric_columns.corr(), annot=True)
plt.show()

### h. Regression Plot - All Numeric Variables

In [None]:
# Scatter plot with a fitted regression line: UnitPrice against Quantity.
sns.set_style('whitegrid')
sns.lmplot(data=df3, x='Quantity', y='UnitPrice')
plt.show()

### i. Bar Plot – Every Categorical Variable vs every Numerical Variable

In [None]:
df4=df3.groupby(['Country']).agg({'Quantity':"count"})

In [None]:
df4

In [None]:
plt.subplots(figsize=(10,6))
plt.barh(df4.index,df4['Quantity'])
plt.xticks(rotation="vertical")
plt.show()

In [None]:
df5=df3.groupby(['Country']).agg({'UnitPrice':"sum"})

In [None]:
df5

In [None]:
plt.subplots(figsize=(10,6))
plt.barh(df5.index,df5['UnitPrice'])
plt.xticks(rotation="vertical")
plt.show()

### j. Pair plot - All Numeric Variables

In [None]:
# Pairwise scatter plots of the numeric columns.
# No hue: 'UnitPrice' is continuous, so using it as hue creates one category per
# distinct price and removes UnitPrice itself from the grid.
sns.pairplot(df3)
plt.show()

### k. Line chart to show the trend of data - All Numeric/Date Variables

In [None]:
# Quantity of each row plotted against its invoice timestamp.
invoice_dates = df3["InvoiceDate"]
quantities = df3["Quantity"]
plt.plot(invoice_dates, quantities, linestyle='solid')
plt.show()

### l. Plot the skewness - All Numeric Variables

In [None]:
# Skewness of Quantity: histogram of values below 200 with median and mean marked.
# Uses a dedicated name instead of rebinding df4, which other cells also assign.
quantity_under_200 = df3[df3["Quantity"] < 200]
sns.distplot(quantity_under_200["Quantity"], kde=False)
plt.ylabel("Frequency")

median_quantity = round(quantity_under_200["Quantity"].median(), 2)
mean_quantity = round(quantity_under_200["Quantity"].mean(), 2)

# (label, value, colour) for each vertical reference line.
reference_lines = [
    ("median", median_quantity, 'darkmagenta'),
    ("mean", mean_quantity, 'darkorange'),
]
for name, measurement, color in reference_lines:
    plt.axvline(x=measurement, linestyle='--', linewidth=2.5, label='{0} at {1}'.format(name, measurement), c=color)
plt.legend()

plt.show()

Because the mean is greater than the median, the distribution of quantities is positively (right) skewed.

In [None]:
# Skewness of UnitPrice: histogram of values below 200 with median and mean marked.
# Uses a dedicated name instead of rebinding df4, which other cells also assign.
price_under_200 = df3[df3["UnitPrice"] < 200]
sns.distplot(price_under_200["UnitPrice"], kde=False)
plt.ylabel("Frequency")

median_price = round(price_under_200["UnitPrice"].median(), 2)
mean_price = round(price_under_200["UnitPrice"].mean(), 2)

# (label, value, colour) for each vertical reference line.
reference_lines = [
    ("median", median_price, 'darkmagenta'),
    ("mean", mean_price, 'darkorange'),
]
for name, measurement, color in reference_lines:
    plt.axvline(x=measurement, linestyle='--', linewidth=2.5, label='{0} at {1}'.format(name, measurement), c=color)
plt.legend()

plt.show()

Because the mean is greater than the median, the distribution of unit prices is positively (right) skewed.

## 2. Check for missing values in all columns and replace them with the appropriate metric (Mean/Median/Mode)

In [None]:
df_new=pd.read_csv("../input/ecommerce-uk-retailer/Ecommerce.csv",encoding= 'unicode_escape')

In [None]:
df_new

In [None]:
df_new.info()

In [None]:
df_new.isnull().sum()

In [None]:
df_new["CustomerID"]=df_new["CustomerID"].fillna(df_new["CustomerID"].mode()[0])

In [None]:
df_new.isnull().sum()

In [None]:
df_new["Description"]=df_new["Description"].fillna(df_new["Description"].mode()[0])

In [None]:
df_new.isnull().sum()

In [None]:
# Box plot of CustomerID after imputation.
# Passed as x= because the meaning of the first positional argument changed
# between seaborn versions (x before 0.12, data afterwards).
sns.boxplot(x=df_new["CustomerID"])
plt.show()

## 3. Remove duplicate rows

In [None]:
df_new.drop_duplicates(inplace=True)

In [None]:
df_new[df_new.duplicated()]

## 4. Remove rows which have negative values in Quantity column

In [None]:
index1=df_new[df_new["Quantity"]<0].index

In [None]:
df_new.drop(index1,inplace=True)

## 5. Add the columns - Month, Day and Hour for the invoice

In [None]:
df_new['InvoiceDate'] = df_new['InvoiceDate'].astype('datetime64[ns]')
df_new['Year'] = df_new.InvoiceDate.dt.year
df_new['Month'] = df_new.InvoiceDate.dt.month
df_new['Hour'] = df_new.InvoiceDate.dt.hour

In [None]:
df_new['day'] = df_new['InvoiceDate'].dt.day_name()

In [None]:
df_new.drop(['day'],axis=1)

## 6. How many orders made by the customers?

In [None]:
df_new['Quantity'].value_counts().sum()

In [None]:
orders=df_new.groupby(by=['CustomerID','Country'],sort=True)['InvoiceNo'].count().head()

In [None]:
orders

## 7. TOP 5 customers with higher number of orders

In [None]:
top5=df_new.groupby(by=['CustomerID'],as_index=False)['Quantity'].sum().head()

In [None]:
top5

In [None]:
plt.subplots(figsize=(15,6))
plt.bar(top5.CustomerID,top5.Quantity)
plt.xlabel('Customers ID')
plt.ylabel('Number of Orders')
plt.title('TOP 5 customers with higher number of orders')
plt.show()

## 8. How much money spent by the customers?

In [None]:
df_new["Revenue"]=df_new["Quantity"]*df_new["UnitPrice"]

In [None]:
df_new["Revenue"]

In [None]:
money_spent = df_new.groupby(by=['CustomerID','Country']).agg({"Revenue":sum})

In [None]:
money_spent.head()

## 9. Top 5 customers (with their country) by money spent

In [None]:
# Highest-revenue (customer, country) pairs first; keep the first five.
ranked_by_revenue = money_spent.sort_values('Revenue', ascending=False)
m = ranked_by_revenue.head(5)
m

## 10. How many orders per month?

In [None]:
orders_permonth=df_new.groupby(df_new["Month"]).agg({"InvoiceNo":"count"})

In [None]:
orders_permonth

In [None]:
plt.subplots(figsize=(10,6))
plt.bar(orders_permonth.index,orders_permonth.InvoiceNo)
plt.xlabel('Month of year')
plt.ylabel('Number of Orders')
plt.title('No. of Orders per month')
plt.show()

## 11. How many orders per day?

In [None]:
orders_perday=df_new.groupby(df_new["day"]).agg({"InvoiceNo":"count"})

In [None]:
orders_perday=orders_perday.sort_values('InvoiceNo',ascending=False)

In [None]:
plt.subplots(figsize=(10,6))
plt.bar(orders_perday.index,orders_perday.InvoiceNo)
plt.xlabel('Day')
plt.ylabel('Number of Orders')
plt.title('No. of Orders per day')
plt.show()

## 12. How many orders per hour?

In [None]:
df_new['Hour']=df_new['InvoiceDate'].dt.hour


In [None]:
df_new['Hour']

In [None]:
orders_perhour=df_new.groupby(df_new["Hour"]).agg({"InvoiceNo":"count"})

In [None]:
orders_perhour

In [None]:
plt.subplots(figsize=(10,6))
plt.bar(orders_perhour.index,orders_perhour.InvoiceNo)
plt.xlabel('Hour')
plt.ylabel('Number of Orders')
plt.title('No. of Orders per hor')
plt.show()

## 13. How many orders for each country?

In [None]:
orders_percountry=df_new.groupby(df_new["Country"]).agg({"InvoiceNo":"count"})

In [None]:
orders_percountry

## 14. Orders trend across months

In [None]:
# Trend plot: revenue per month (bars) with its month-over-month change (line).
# The monthly totals are computed once and reused. Every month is plotted; the
# earlier [1:] slices silently dropped the first month from both series.
monthly_revenue = df_new.groupby(by=['Month'])['Revenue'].sum()
months = monthly_revenue.index.tolist()

fig = plt.figure(figsize=(15, 7))
x1 = fig.add_subplot(111)

x1.set_xlabel('Month',fontsize=15)
x1.set_ylabel('Total Amount Spent ($)',fontsize=15)
x1.set_title('Total Amount Spent for Months',fontsize=15)

x1.bar(months,
       monthly_revenue,
       alpha=0.85,  # bar opacity; values below 1 make the bars more transparent
       label='Amount Spent by Month')

# Second y-axis sharing the same x-axis, used for the trend line.
x2 = x1.twinx()

x2.set_ylabel('Percentage Change of Total Amount Spent (%)',fontsize=15)

# pct_change() is the relative change from the previous month; the first month
# has no predecessor, so its NaN is shown as 0.
x2.plot(months,
        monthly_revenue.pct_change().fillna(0) * 100,
        label='Percentage change of total amount spent (%)',
        color='red')

x1.legend(loc='upper left')
x2.legend(loc='upper right')

fig.tight_layout()
plt.show()

## 15. How much money spent by each country?

In [None]:
money_percountry=df_new.groupby(df_new["Country"]).agg({"Revenue":"sum"})


In [None]:
money_percountry