
### Data Analysis Project using Python Modules

**Steps involved in Data Analysis Project Life Cycle**
1. Data Collection  
2. Data Understanding  
3. Data Cleaning  
4. Data Modelling  
5. Data Visualization & Analysis  



### Importing Necessary Python Modules
```python
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
```


In [None]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt



### 1. Data Collection
Reading dataset using `read_csv()`


In [None]:

# Read the CSV file at the hard-coded path into `df`, then preview its first five rows.
csv_path = r"/mnt/data/cleaned.csv"
df = pd.read_csv(csv_path)
df.head(n=5)



### 2. Data Understanding
Checking structure and summary of the dataset


In [None]:

# Preview the first 10 rows of the DataFrame.
df.head(10)


In [None]:

# Preview the last 10 rows of the DataFrame.
df.tail(10)


In [None]:

# List the column labels of the DataFrame.
df.columns


In [None]:

# Dimensions of the DataFrame as a (rows, columns) tuple.
df.shape


In [None]:

# Number of missing values in each column.
df.isnull().sum()


In [None]:

# Number of rows that repeat an earlier row in every column.
df.duplicated().sum()


In [None]:

# Number of distinct values in each column (missing values are not counted by default).
df.nunique()


In [None]:

# Data type of each column.
df.dtypes


In [None]:

# Print a summary of the DataFrame: columns, non-null counts, dtypes and memory usage.
df.info()


In [None]:

# Summary statistics per column; by default describe() covers the numeric columns.
df.describe()


### Descriptive Analysis on `unitprice`

In [None]:

# Summary statistics for the `unitprice` column alone.
df['unitprice'].describe()


In [None]:

# Individual descriptive statistics for `unitprice`.
# A notebook cell only displays the value of its last expression, so seven
# separate calls would show just the variance and discard the rest. Computing
# them in one aggregation returns a single Series that displays all of them.
df['unitprice'].agg(['count', 'mean', 'min', 'max', 'median', 'std', 'var'])



### 5. Data Visualization and Analysis


#### 1. Best Year of Sales

In [None]:

# Count the non-missing `item_id` entries recorded for each year.
year_group = df['item_id'].groupby(df['year']).count()

# Bars show the per-year counts; the line drawn over them traces the same values.
ax = plt.gca()
ax.bar(year_group.index, year_group.values)
ax.plot(year_group.index, year_group.values)
ax.set_xlabel('Year')
ax.set_ylabel('Number of Sales')
ax.set_title('Sales per Year')
plt.show()


#### 2. Best Month of Sales

In [None]:

# Count the non-missing `item_id` entries recorded for each month value.
month_group = df['item_id'].groupby(df['month']).count()

# Line chart of the per-month counts.
ax = plt.gca()
ax.plot(month_group.index, month_group.values)
ax.set_xlabel('Month')
ax.set_ylabel('Number of Sales')
ax.set_title('Sales per Month')
plt.show()


#### 3. Total Sales by Year

In [None]:

# Sum the `amount` column within each year.
year_sales = df['amount'].groupby(df['year']).sum()

# Bars show the yearly totals; the line drawn over them traces the same values.
ax = plt.gca()
ax.bar(year_sales.index, year_sales.values)
ax.plot(year_sales.index, year_sales.values)
ax.set_xlabel('Year')
ax.set_ylabel('Total Sales')
ax.set_title('Total Sales per Year')
plt.show()


#### 4. Top 10 Brands (2015-2018)

In [None]:

# Keep only rows whose year lies in 2015-2018 (both ends included).
# `filtered` is reused by later cells, so its name is kept.
filtered = df[df['year'].between(2015, 2018)]

# Ten brands with the largest summed `amount` in that window.
brand_group = (
    filtered.groupby('brand')['amount']
    .sum()
    .nlargest(10)
)

ax = plt.gca()
ax.bar(brand_group.index, brand_group.values)
plt.xticks(rotation=90)  # brand names are long; turn them vertical
ax.set_title('Top 10 Brands')
ax.set_ylabel('Total Sales')
plt.show()


#### 5. Top 5 Brands Each Year (2016-2018)

In [None]:

# One panel per year, side by side: the five brands with the largest summed
# `amount` among that year's rows.
years = [2016, 2017, 2018]
fig, axs = plt.subplots(nrows=1, ncols=3, figsize=(15, 4))

for ax, yr in zip(axs, years):
    rows_for_year = df.loc[df['year'] == yr]
    top5 = rows_for_year.groupby('brand')['amount'].sum().nlargest(5)
    ax.bar(top5.index, top5.values)
    ax.set_title(f'Top 5 Brands {yr}')
    ax.tick_params(axis='x', rotation=90)  # vertical brand labels

plt.show()


#### 6. Top Categories (2015-2018)

In [None]:

# Ten categories with the largest summed `amount` among the rows of `filtered`
# (the 2015-2018 subset built in the Top 10 Brands cell).
category_group = (
    filtered.groupby('category')['amount']
    .sum()
    .nlargest(10)
)

ax = plt.gca()
ax.bar(category_group.index, category_group.values)
plt.xticks(rotation=90)  # vertical category labels
ax.set_title('Top Categories')
plt.show()


#### 7. Rating Distribution

In [None]:

# How many rows carry each distinct `rating` value.
ratings = df['rating'].value_counts()

# Bar chart: one bar per rating value, height = number of occurrences.
ax = plt.gca()
ax.bar(ratings.index, ratings.values)
ax.set_xlabel('Rating')
ax.set_ylabel('Frequency')
ax.set_title('Rating Distribution')
plt.show()


#### 8. Top Rated Brands (2015-2018)

In [None]:

# Ten brands with the highest mean `rating` among the rows of `filtered`
# (the 2015-2018 subset built in the Top 10 Brands cell).
brand_rating = (
    filtered.groupby('brand')['rating']
    .mean()
    .nlargest(10)
)

ax = plt.gca()
ax.bar(brand_rating.index, brand_rating.values)
plt.xticks(rotation=90)  # vertical brand labels
ax.set_title('Top Rated Brands')
plt.show()


#### 9. Category Sales Percentage (Top 5 Categories)

In [None]:

# Five categories with the largest summed `amount` over the whole dataset.
top_categories = (
    df.groupby('category')['amount']
    .sum()
    .nlargest(5)
)

# Only these five totals are passed to pie(), so each wedge's percentage is
# relative to their combined total.
ax = plt.gca()
ax.pie(top_categories.values, labels=top_categories.index, autopct='%.2f%%')
ax.set_title('Category Sales Percentage')
plt.show()


#### 10. Brand Sales Percentage (Top 5 Brands)

In [None]:

# Five brands with the largest summed `amount` over the whole dataset.
top_brands = (
    df.groupby('brand')['amount']
    .sum()
    .nlargest(5)
)

# Only these five totals are passed to pie(), so each wedge's percentage is
# relative to their combined total.
ax = plt.gca()
ax.pie(top_brands.values, labels=top_brands.index, autopct='%.2f%%')
ax.set_title('Brand Sales Percentage')
plt.show()


#### 11. Gender-wise Distribution

In [None]:

# Number of rows for each distinct `gender` value.
gender_dist = df['gender'].value_counts()

# Pie chart of those counts, labelled by gender, with one-decimal percentages.
ax = plt.gca()
ax.pie(gender_dist.values, labels=gender_dist.index, autopct='%.1f%%')
ax.set_title('Gender Distribution')
plt.show()


#### 12. Gender vs Category Preferences

In [None]:

# Total `quantity` for every (gender, category) pair.
gender_category = df.groupby(['gender','category'])['quantity'].sum()
# groupby() leaves out rows whose key is missing, but unique() would still
# report NaN as a value; drop it so the loop never looks up a gender that is
# absent from `gender_category` (which would raise KeyError).
genders = df['gender'].dropna().unique()

# One pie chart per gender, split by category.
for gender in genders:
    by_category = gender_category[gender]
    plt.pie(by_category, labels=by_category.index, autopct='%.1f%%')
    plt.title(f'{gender} Preferences')
    plt.show()
