## Libraries

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import re
import matplotlib.pyplot as plt

## Data loading and inspecting

In [None]:
# Location of the raw products CSV inside the Kaggle input directory.
DATA_PATH = '/kaggle/input/data-analysis-products-dataset/ProductsData.csv'

# The file is read with latin1 encoding, as in the original load call.
df = pd.read_csv(DATA_PATH, encoding='latin1')

In [None]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)

In [None]:
# Summarise the columns: dtypes and non-null counts.
df.info()

In [None]:
# Number of missing values in each column.
missing_per_column = df.isna().sum()
missing_per_column

It seems that most of the missing values are price values. About 25% of the records in the dataframe are incomplete.

## Data cleaning

In [None]:
# Columns whose raw values are wrapped in literal double quotes.
QUOTED_COLUMNS = [
    'Product_name',
    'Product_id',
    'Product_Category',
    'Professional_Publication',
    'Region_address',
    'Local_address',
]

# Remove one pair of wrapping quotes from either end of the column values.
# str.replace leaves values that do not match the pattern untouched, whereas
# str.extract would turn every unquoted value into NaN (and re-running the cell
# would then wipe the whole column, since the values are no longer quoted).
for column in QUOTED_COLUMNS:
    df[column] = df[column].str.replace(r'^"(.*?)"$', r'\1', regex=True)

# Replacing all empty / whitespace-only string values with NaN
df = df.replace(r'^\s*$', np.nan, regex=True)

# Treat product names containing a '?' as missing.
# NOTE(review): presumably these are names with characters that failed to decode - confirm.
has_question_mark = df['Product_name'].str.contains('?', regex=False, na=False)
df['Product_name'] = df['Product_name'].mask(has_question_mark)

# Strip spaces out of the price strings so they can be parsed as numbers later
df['price'] = df['price'].str.replace(' ', '', regex=False)

In [None]:
# Setting the correct data types.
# The conversion must happen BEFORE dropping incomplete rows: errors='coerce'
# turns every price that cannot be parsed into NaN, and converting after the
# dropna would leave those NaN prices in the frame used for the analysis.
df['price'] = pd.to_numeric(df['price'], errors='coerce')

# Dropping all records that have a missing value in any column
# (the number of dropped records depends on the cleaning steps above).
df = df.dropna()

## Exploratory Data Analysis

### Product analysis

In [None]:
sns.set_theme(style="darkgrid")

# Row count for each Professional_Publication value, drawn on an explicit axes.
fig, ax = plt.subplots(figsize=(15, 7))
sns.countplot(data=df, x='Professional_Publication', ax=ax)
ax.set_title(
    'Products sold per professional publication',
    fontdict={'fontsize': 16},
)

It seems that most products are sold to individuals rather than businesses

In [None]:
# One panel per Professional_Publication value; regions are listed in order of
# their overall row count so both panels share the same ordering.
g = sns.catplot(
    kind='count',
    data=df,
    y='Region_address',
    col='Professional_Publication',
    height=7,
    aspect=2,
    sharey=False,
    order=df['Region_address'].value_counts().index,
)

# Title each panel with the facet value it shows. Passing a fixed string as the
# template would give every panel the same title and hide which
# Professional_Publication value each one represents.
g.set_titles(
    col_template='{col_var}: {col_name}',
    fontdict={
        'fontsize': 16
    }
)

# The overall caption goes on the figure, lifted slightly so that it does not
# overlap the per-panel titles.
g.figure.suptitle('Products sold per region', fontsize=16, y=1.03)

It seems that most products, for both private and pro orders, are sold in Grand Casablanca. Beyond that, the two groups clearly differ in which region has the second-highest number of orders.

For private orders this is Rabat-Salé-Zemmour-Zaër, and for pro orders it is Tanger-Tétouan.

In [None]:
# Tall figure so that every Local_address label stays readable.
fig, ax = plt.subplots(figsize=(15, 25))

# Cities listed from most to fewest rows.
cities_by_count = df['Local_address'].value_counts().index
sns.countplot(data=df, y='Local_address', order=cities_by_count, ax=ax)

ax.set_title('Products sold per City', fontdict={'fontsize': 16})

It should come as no surprise that Casablanca is the city with the highest number of sold products, since Grand Casablanca was the region with the highest number of sold products.

### Price analysis

In [None]:
# Left panel: density of prices per Professional_Publication value.
# Right panel: bar plot of price per Professional_Publication value; seaborn's
# barplot aggregates with the mean by default.
fig, axs = plt.subplots(ncols=2, figsize=(20, 7))

sns.kdeplot(
    data=df,
    x='price',
    hue='Professional_Publication',
    ax=axs[0]
)
sns.barplot(
    data=df,
    x='Professional_Publication',
    y='price',
    ax=axs[1]
)

axs[0].set_title('Price distribution by professional publication')
# The bar plot shows an average, not a distribution, so it gets its own title
# instead of repeating the one used for the density plot.
axs[1].set_title('Mean price by professional publication')


In [None]:
# Five rows with the highest price, most expensive first.
rows_by_price_desc = df.sort_values(by='price', ascending=False)
rows_by_price_desc.head(5)

The plot shows us that most of the pro orders are in the lower price range. The private orders also have their main distribution at the lower end, but they include the highest-priced orders as well.

The table confirms this by showing us that the 5 highest priced orders have all been private orders for villas

In [None]:
# Sum of price per region, largest total first; the bar order follows this row order.
region_totals = (
    df.groupby(['Region_address'])['price']
    .sum()
    .reset_index()
    .sort_values(by=['price'], ascending=False)
)

fig, ax = plt.subplots(figsize=(15, 7))
sns.barplot(data=region_totals, x='price', y='Region_address', ax=ax)
ax.set_title('Total price of orders by region', fontdict={'fontsize': 16})