# Package Setup

In [1]:
import pandas as pd


# Data Import

In [2]:
# Read the week-1 sales CSV into a DataFrame and report its (rows, columns) shape.
# NOTE(review): this is an absolute path on one specific machine; point it at
# your own copy of the file when running the notebook elsewhere.
csv_path = "/home/dragon/GIT/robertnesterodhiambo-Data-analysis/school_of_statistics/Python/week1/sales_data_week1_500rows.csv"
data = pd.read_csv(csv_path)
print("Shape of the dataset:", data.shape)



Shape of the dataset: (500, 7)


# Display First 5 Rows

In [3]:

# Preview the top five records of the loaded frame
first_rows = data.head(5)
print(first_rows)

   CustomerID         Name   Age     Product  Purchase_Amount Purchase Date  \
0        1000  Steve Davis  51.0      Laptop              NaN    2024-04-20   
1        1001  Jane Miller  36.0      Tablet          1805.62    2024-12-22   
2        1002    Bob Smith  46.0      Tablet           168.44    2024-04-20   
3        1003   Emma Brown  51.0  Smartphone              NaN    2024-01-28   
4        1004  Sara Miller  50.0      Tablet           267.39    2024-03-15   

  Region  
0  South  
1  South  
2  South  
3   West  
4  South  


# Check Missing Values

In [4]:
# Tally the null entries column by column, then print them under a heading
null_counts = data.isnull().sum()
print("\nMissing values per column:")
print(null_counts)


Missing values per column:
CustomerID           0
Name                 4
Age                 21
Product             87
Purchase_Amount     26
Purchase Date        0
Region             102
dtype: int64


# Drop rows where 'Name' or 'Product' is missing

In [5]:

# Discard every record that lacks a 'Name' or a 'Product' value,
# then display the null counts that remain in the other columns.
required_columns = ['Name', 'Product']
data = data.dropna(subset=required_columns)
data.isnull().sum()

CustomerID          0
Name                0
Age                17
Product             0
Purchase_Amount    22
Purchase Date       0
Region             80
dtype: int64

# Fill missing 'Region' with 'Unknown'

In [6]:
# Substitute the placeholder 'Unknown' wherever 'Region' is absent
region_filled = data['Region'].fillna('Unknown')
data['Region'] = region_filled

# Fill missing 'Purchase_Amount' with the mean

In [7]:
# Impute absent 'Purchase_Amount' values with the column average.
# Series.mean() skips NaN by default, so only observed amounts feed the average.
amounts = data['Purchase_Amount']
mean_amount = amounts.mean()
data['Purchase_Amount'] = amounts.fillna(mean_amount)

# Convert 'Purchase Date' to datetime format

In [8]:


# Parse the 'Purchase Date' strings into datetimes; errors='coerce' turns
# any value that cannot be parsed into NaT instead of raising.
parsed_dates = pd.to_datetime(data['Purchase Date'], errors='coerce')
data['Purchase Date'] = parsed_dates


# Create new column 'Purchase_Year'

In [9]:
# Derive 'Purchase_Year' by extracting the calendar year from each purchase date
purchase_dates = data['Purchase Date']
data['Purchase_Year'] = purchase_dates.dt.year


# Rename all columns to lowercase and replace spaces with underscores


In [10]:

# Normalise the column labels: lowercase first, then swap spaces for underscores.
# The bare `data` at the end displays the resulting frame.
lowercased_labels = data.columns.str.lower()
data.columns = lowercased_labels.str.replace(' ', '_')
data

Unnamed: 0,customerid,name,age,product,purchase_amount,purchase_date,region,purchase_year
0,1000,Steve Davis,51.0,Laptop,1057.789098,2024-04-20,South,2024
1,1001,Jane Miller,36.0,Tablet,1805.620000,2024-12-22,South,2024
2,1002,Bob Smith,46.0,Tablet,168.440000,2024-04-20,South,2024
3,1003,Emma Brown,51.0,Smartphone,1057.789098,2024-01-28,West,2024
4,1004,Sara Miller,50.0,Tablet,267.390000,2024-03-15,South,2024
...,...,...,...,...,...,...,...,...
492,1492,Sara Johnson,33.0,Tablet,966.200000,2024-07-20,East,2024
493,1493,Emma Davis,36.0,Smartphone,317.660000,2024-10-19,North,2024
495,1495,Tom Wilson,32.0,Smartphone,1304.230000,2024-07-24,South,2024
496,1496,Sara Miller,59.0,Tablet,672.670000,2024-04-20,East,2024


# Rename 'purchase_amount' to 'amount_usd'

In [11]:
# Give the amount column a name that states its currency: purchase_amount -> amount_usd
column_renames = {'purchase_amount': 'amount_usd'}
data = data.rename(columns=column_renames)

# Filter rows where amount_usd > 1000

In [12]:
# Keep only the purchases whose amount_usd is strictly greater than 1000
is_over_1000 = data['amount_usd'] > 1000
filtered_data = data[is_over_1000]

# Sort the filtered data by amount_usd in descending order

In [13]:
# Order the filtered purchases from largest to smallest amount_usd and print
# them under a heading (sep="\n" puts the frame on the line after the heading).
filtered_data = filtered_data.sort_values('amount_usd', ascending=False)
print("\nFiltered and sorted data (amount_usd > 1000):", filtered_data, sep="\n")


Filtered and sorted data (amount_usd > 1000):
     customerid           name   age     product  amount_usd purchase_date  \
469        1469     Sara Davis  23.0  Smartphone     1991.76    2024-07-22   
435        1435       Jane Ali   NaN      Laptop     1987.35    2024-02-09   
315        1315        Bob Lee  57.0  Smartphone     1979.56    2024-10-19   
258        1258     Emma Brown  22.0      Laptop     1975.28    2024-01-13   
310        1310      Linda Lee  50.0      Laptop     1974.03    2024-08-20   
..          ...            ...   ...         ...         ...           ...   
342        1342  Linda Johnson  26.0  Headphones     1035.87    2024-12-21   
67         1067    Linda White  19.0  Headphones     1032.27    2024-09-16   
386        1386      Bob Smith  29.0      Laptop     1014.13    2024-09-27   
203        1203     Mike Smith  26.0  Smartphone     1007.85    2024-05-15   
232        1232     Tom Wilson  51.0  Smartphone     1001.15    2024-07-14   

      region  pu

# Group by region to get total purchases and average purchase amount

In [14]:

# Summarise purchases per region:
#   total_purchases    -> count of non-null customerid values in the region
#   average_amount_usd -> mean of amount_usd in the region
region_metrics = {
    'total_purchases': ('customerid', 'count'),
    'average_amount_usd': ('amount_usd', 'mean'),
}
agg_data = data.groupby('region').agg(**region_metrics).reset_index()
print("\nAggregated data by region:", agg_data, sep="\n")


Aggregated data by region:
    region  total_purchases  average_amount_usd
0     East               92         1073.592026
1    North               79         1154.316652
2    South               73          944.079814
3  Unknown               80         1065.436910
4     West               86         1041.619408


# Create 'category' column based on amount_usd

In [15]:
# Create 'category' column based on amount_usd
def categorize(amount, high_threshold=1000, medium_threshold=500):
    """Label a purchase amount as "High", "Medium", or "Low".

    Args:
        amount: Numeric purchase amount to classify.
        high_threshold: Smallest amount labelled "High". Defaults to 1000.
        medium_threshold: Smallest amount labelled "Medium". Defaults to 500.

    Returns:
        "High" when ``amount >= high_threshold``, "Medium" when
        ``amount >= medium_threshold``, otherwise "Low".

    Note:
        Both bounds are inclusive. A NaN amount fails every comparison and
        therefore falls through to "Low".
    """
    if amount >= high_threshold:
        return "High"
    if amount >= medium_threshold:
        return "Medium"
    return "Low"

# Label every row by running its amount_usd through categorize()
usd_amounts = data['amount_usd']
data['category'] = usd_amounts.apply(categorize)

# Save the cleaned and transformed DataFrame to a new CSV

In [16]:
# Persist the cleaned, transformed frame as CSV (row index omitted) and confirm
output_csv = "cleaned_data.csv"
data.to_csv(output_csv, index=False)
print("\nData successfully saved to cleaned_data.csv")



Data successfully saved to cleaned_data.csv
