In [6]:
import pandas as pd
import numpy as np

In [7]:
data = {
 "Customer_ID": [101, 102, 103, 104, 105, 106, 107, 107],
 "Name": ["Alice", "Bob", "Charlie", "David", None, "Eve", "Frank", "Frank"],
 "Age": [25, 30, np.nan, 28, 32, 29, None, 40],
 "Email": ["alice@mail.com", "bob@mail.com", "charlie@mail.com", None, 
"eve@mail.com", "eve@mail.com", "frank@mail.com", "frank@mail.com"],
 "Join_Date": ["2024-01-01", "2024-02-15", "2024-03-20", "2024-04-10", None, 
"2024-06-05", "2024-07-22", "2024-07-22"],
 "Spending ($)": [500, np.nan, 700, 400, 600, 350, 800, 800]
}
df = pd.DataFrame(data)
print(df)

   Customer_ID     Name   Age             Email   Join_Date  Spending ($)
0          101    Alice  25.0    alice@mail.com  2024-01-01         500.0
1          102      Bob  30.0      bob@mail.com  2024-02-15           NaN
2          103  Charlie   NaN  charlie@mail.com  2024-03-20         700.0
3          104    David  28.0              None  2024-04-10         400.0
4          105     None  32.0      eve@mail.com        None         600.0
5          106      Eve  29.0      eve@mail.com  2024-06-05         350.0
6          107    Frank   NaN    frank@mail.com  2024-07-22         800.0
7          107    Frank  40.0    frank@mail.com  2024-07-22         800.0


In [8]:
print(df.isnull().sum())

Customer_ID     0
Name            1
Age             2
Email           1
Join_Date       1
Spending ($)    1
dtype: int64


In [9]:
df = df.dropna(subset=["Email"])
print(df)

   Customer_ID     Name   Age             Email   Join_Date  Spending ($)
0          101    Alice  25.0    alice@mail.com  2024-01-01         500.0
1          102      Bob  30.0      bob@mail.com  2024-02-15           NaN
2          103  Charlie   NaN  charlie@mail.com  2024-03-20         700.0
4          105     None  32.0      eve@mail.com        None         600.0
5          106      Eve  29.0      eve@mail.com  2024-06-05         350.0
6          107    Frank   NaN    frank@mail.com  2024-07-22         800.0
7          107    Frank  40.0    frank@mail.com  2024-07-22         800.0


In [10]:
df["Age"].fillna(df["Age"].mean(), inplace=True) # Fill Age with average
df["Spending ($)"].fillna(df["Spending ($)"].median(), inplace=True) # Fill Spending with median
print(df)

   Customer_ID     Name   Age             Email   Join_Date  Spending ($)
0          101    Alice  25.0    alice@mail.com  2024-01-01         500.0
1          102      Bob  30.0      bob@mail.com  2024-02-15         650.0
2          103  Charlie  31.2  charlie@mail.com  2024-03-20         700.0
4          105     None  32.0      eve@mail.com        None         600.0
5          106      Eve  29.0      eve@mail.com  2024-06-05         350.0
6          107    Frank  31.2    frank@mail.com  2024-07-22         800.0
7          107    Frank  40.0    frank@mail.com  2024-07-22         800.0


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Age"].fillna(df["Age"].mean(), inplace=True) # Fill Age with average
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Spending ($)"].fillna(df["Spending ($)"].median(), inplace=True) # Fill Spending with median


In [11]:
df["Join_Date"] = pd.to_datetime(df["Join_Date"])
print(df.dtypes)

Customer_ID              int64
Name                    object
Age                    float64
Email                   object
Join_Date       datetime64[ns]
Spending ($)           float64
dtype: object


In [12]:
df = df.drop_duplicates()
print(df)

   Customer_ID     Name   Age             Email  Join_Date  Spending ($)
0          101    Alice  25.0    alice@mail.com 2024-01-01         500.0
1          102      Bob  30.0      bob@mail.com 2024-02-15         650.0
2          103  Charlie  31.2  charlie@mail.com 2024-03-20         700.0
4          105     None  32.0      eve@mail.com        NaT         600.0
5          106      Eve  29.0      eve@mail.com 2024-06-05         350.0
6          107    Frank  31.2    frank@mail.com 2024-07-22         800.0
7          107    Frank  40.0    frank@mail.com 2024-07-22         800.0


In [13]:
df = df.rename(columns={"Spending ($)": "Total_Spending", "Join_Date": 
"Registration_Date"})
print(df.head())

   Customer_ID     Name   Age             Email Registration_Date  \
0          101    Alice  25.0    alice@mail.com        2024-01-01   
1          102      Bob  30.0      bob@mail.com        2024-02-15   
2          103  Charlie  31.2  charlie@mail.com        2024-03-20   
4          105     None  32.0      eve@mail.com               NaT   
5          106      Eve  29.0      eve@mail.com        2024-06-05   

   Total_Spending  
0           500.0  
1           650.0  
2           700.0  
4           600.0  
5           350.0  
