# **DATA CLEANING**
---

In [None]:
# Importing modules
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Sample records, one list per column. The third 'Age' entry is missing
# (None) and the last row repeats the first one.
data = dict(
    Name=['Alice', 'Bob', 'Charlie', 'David', 'Eve', 'Alice'],
    Age=[20, 21, None, 23, 22, 20],
    Grade=['A', 'B', 'C', 'B', 'A', 'A'],
)


In [None]:
# Build the DataFrame from the column dict and show it before any cleaning.
df = pd.DataFrame(data)
print("Original DataFrame:", df, sep="\n")

In [None]:
# Impute missing ages with the mean of the 'Age' column.
df['Age'] = df['Age'].fillna(df['Age'].mean())
print(df)

In [None]:
# Alternative strategy: replace any still-missing ages with 0.
# NOTE: once the mean-imputation cell above has run, no missing ages remain,
# so this leaves the frame unchanged.
df['Age'] = df['Age'].fillna(0)
print(df)

In [None]:
# Remove rows that repeat an earlier row in every column, keeping the first
# occurrence; the frame is modified in place.
df.drop_duplicates(subset=None, keep='first', inplace=True)
print(df)

In [None]:
# Convert the 'Age' column to integers.
# NOTE: the cast discards any fractional part (e.g. from a mean-imputed age)
# and raises if missing values are still present.
df = df.astype({'Age': 'int'})
print(df)

In [None]:
# Dataset

# The plots further down read the 'Date', 'Sales', 'Product_Category' and
# 'Quantity' columns of this file.
df1 = pd.read_csv('sales_data.csv')


# Cleaning data

df1.drop_duplicates(inplace=True)
# Renumber the remaining rows 1..n. The index is sized from the data so the
# assignment works for any number of rows left after de-duplication (a fixed
# five-element list raises a length-mismatch ValueError otherwise).
df1.index = pd.RangeIndex(start=1, stop=len(df1) + 1)
print(df1)


# Graph between Sales and Date

# One figure, two side-by-side panels: sales over time and sales by category.
fig, ax = plt.subplots(1, 2, figsize=(12, 8))

sns.lineplot(x='Date', y='Sales', ax=ax[0], data=df1, marker='o', markersize=8)
ax[0].set_title('Sales over Time')
ax[0].set_xlabel('Date')
ax[0].set_ylabel('Sales')
ax[0].grid()
# Rotate the tick labels the plot already produced instead of overwriting
# them positionally from the raw column, which mislabels the axis whenever
# the dates repeat or are not plotted at positions 0..n-1.
ax[0].tick_params(axis='x', labelrotation=45)

# Graph between Sales and Category

# errorbar=None is the documented way to draw bars without error bars.
sns.barplot(x='Product_Category', y='Sales', ax=ax[1], data=df1, errorbar=None)
ax[1].set_title('Sales by Category')
ax[1].set_xlabel('Category')
ax[1].set_ylabel('Sales')
ax[1].grid(axis='y')
# Target the right-hand panel explicitly rather than relying on pyplot's
# implicit "current axes".
ax[1].tick_params(axis='x', labelrotation=45)

fig.tight_layout()
fig.savefig('sales.png', dpi=300)
plt.show()


# Heatmap

# Pairwise correlations, restricted to the numeric columns of the sales data.
corr_matrix = df1.corr(numeric_only=True)

# Draw on an explicit figure/axes pair; colours are centred on zero.
heat_fig, heat_ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    corr_matrix,
    annot=True,
    cmap='coolwarm',
    center=0,
    ax=heat_ax,
)
heat_ax.set_title('Correlation Heatmap')
heat_fig.tight_layout()
heat_fig.savefig('heatmap.png', dpi=300)
plt.show()


# Regression Plot

# Scatter of Quantity against Sales with a fitted regression line, drawn on
# an explicit figure/axes pair and saved to disk before being shown.
reg_fig, reg_ax = plt.subplots(figsize=(8, 6))
sns.regplot(x='Sales', y='Quantity', data=df1, color='red', ax=reg_ax)
reg_ax.set_title('Regression Plot')
reg_ax.set_xlabel('Sales')
reg_ax.set_ylabel('Quantity')
reg_ax.grid()
reg_fig.savefig('regression.png', dpi=300)
plt.show()