In [1]:
import pandas as pd

# Demo frame for duplicate detection, written one record per line so the
# repeated (A, B) pairs are easy to spot: rows 2 and 4 repeat rows 0 and 1.
df = pd.DataFrame(
    [
        ('apple', 1),
        ('banana', 2),
        ('apple', 1),
        ('orange', 3),
        ('banana', 2),
        ('apple', 3),
    ],
    columns=['A', 'B'],
)

df

Unnamed: 0,A,B
0,apple,1
1,banana,2
2,apple,1
3,orange,3
4,banana,2
5,apple,3


In [2]:
# Boolean mask over the rows: True where the whole row (every column) repeats
# an earlier row. The first occurrence of each repeated row stays False.
df.duplicated()

0    False
1    False
2     True
3    False
4     True
5    False
dtype: bool

In [3]:
# Show only the rows flagged as exact repeats of an earlier row.
df.loc[df.duplicated()]

Unnamed: 0,A,B
2,apple,1
4,banana,2


In [4]:
# Invert the mask: keep the first occurrence of every distinct row and drop the repeats.
df.loc[~df.duplicated()]

Unnamed: 0,A,B
0,apple,1
1,banana,2
3,orange,3
5,apple,3


In [5]:
# Compare on column 'A' only: a row is flagged when its 'A' value already
# appeared in an earlier row, whatever its 'B' value is.
# Left as the cell's last expression (no print wrapper) so the result is shown
# as the cell output, like the other duplicated() cells in this notebook.
df.duplicated(subset=['A'])

0    False
1    False
2     True
3    False
4     True
5     True
dtype: bool


In [8]:
# Rows whose 'A' value was already seen in an earlier row.
df.loc[df.duplicated(subset=['A'])]

Unnamed: 0,A,B
2,apple,1
4,banana,2
5,apple,3


In [9]:
# Rows whose 'B' value was already seen in an earlier row.
df.loc[df.duplicated(subset=['B'])]

Unnamed: 0,A,B
2,apple,1
4,banana,2
5,apple,3


In [13]:
# keep='last' flips the marking: the final occurrence of each repeated row
# stays False and the earlier copies are flagged instead.
df.duplicated(keep='last')

0     True
1     True
2    False
3    False
4    False
5    False
dtype: bool

In [14]:
# Rows flagged under keep='last', i.e. the earlier copies of each repeated row.
df.loc[df.duplicated(keep='last')]

Unnamed: 0,A,B
0,apple,1
1,banana,2


In [17]:
# Same 'A'-only comparison with keep='first' spelled out: every occurrence of
# an 'A' value after its first one is selected.
df.loc[df.duplicated(subset=['A'], keep='first')]

Unnamed: 0,A,B
2,apple,1
4,banana,2
5,apple,3


In [18]:
# Default (keep='first'): the first occurrence of each repeated row is treated as a non-duplicate; later occurrences are flagged.

In [20]:
# Contrast with the default: under keep='last' the final occurrence of each
# repeated row stays False and the earlier copies are flagged.
# (Same call as the earlier keep='last' cell in this notebook.)
df.duplicated(keep='last')

0     True
1     True
2    False
3    False
4    False
5    False
dtype: bool

In [21]:
# Select the rows flagged under keep='last' (the earlier copies of each repeated row).
df.loc[df.duplicated(keep='last')]

Unnamed: 0,A,B
0,apple,1
1,banana,2


In [22]:
import pandas as pd

# Toy transaction log: several rows share the same Product/Customer/Amount and
# differ only in TransactionID and Timestamp.
# Note: this rebinds `df`, replacing the fruit frame used in the cells above.
data = {
    'TransactionID': list(range(1, 9)),
    'Product': ['A', 'A', 'B', 'A', 'B', 'B', 'A', 'C'],
    'Customer': ['X', 'X', 'Y', 'X', 'Y', 'Z', 'X', 'Z'],
    'Amount': [100, 100, 150, 100, 150, 200, 100, 200],
    'Timestamp': pd.to_datetime([
        '2021-01-01 10:00',
        '2021-01-01 11:00',
        '2021-01-01 12:00',
        '2021-01-02 09:30',
        '2021-01-02 12:30',
        '2021-01-02 13:00',
        '2021-01-03 10:30',
        '2021-01-03 11:00',
    ]),
}

df = pd.DataFrame(data)
df

Unnamed: 0,TransactionID,Product,Customer,Amount,Timestamp
0,1,A,X,100,2021-01-01 10:00:00
1,2,A,X,100,2021-01-01 11:00:00
2,3,B,Y,150,2021-01-01 12:00:00
3,4,A,X,100,2021-01-02 09:30:00
4,5,B,Y,150,2021-01-02 12:30:00
5,6,B,Z,200,2021-01-02 13:00:00
6,7,A,X,100,2021-01-03 10:30:00
7,8,C,Z,200,2021-01-03 11:00:00


In [23]:
# Put the transactions in chronological order so that the per-group diff in the
# next cell measures the time since the previous matching transaction.
# For this sample the rows are already chronological, so the displayed frame is unchanged.
df = df.sort_values(by='Timestamp')
df

Unnamed: 0,TransactionID,Product,Customer,Amount,Timestamp
0,1,A,X,100,2021-01-01 10:00:00
1,2,A,X,100,2021-01-01 11:00:00
2,3,B,Y,150,2021-01-01 12:00:00
3,4,A,X,100,2021-01-02 09:30:00
4,5,B,Y,150,2021-01-02 12:30:00
5,6,B,Z,200,2021-01-02 13:00:00
6,7,A,X,100,2021-01-03 10:30:00
7,8,C,Z,200,2021-01-03 11:00:00


In [24]:
# Within each (Product, Customer, Amount) group, measure the gap to the previous
# transaction of that group. The first row of every group has no predecessor
# and therefore gets NaT.
df['TimeDiff'] = (
    df
    .groupby(['Product', 'Customer', 'Amount'])['Timestamp']
    .diff()
)

df

Unnamed: 0,TransactionID,Product,Customer,Amount,Timestamp,TimeDiff
0,1,A,X,100,2021-01-01 10:00:00,NaT
1,2,A,X,100,2021-01-01 11:00:00,0 days 01:00:00
2,3,B,Y,150,2021-01-01 12:00:00,NaT
3,4,A,X,100,2021-01-02 09:30:00,0 days 22:30:00
4,5,B,Y,150,2021-01-02 12:30:00,1 days 00:30:00
5,6,B,Z,200,2021-01-02 13:00:00,NaT
6,7,A,X,100,2021-01-03 10:30:00,1 days 01:00:00
7,8,C,Z,200,2021-01-03 11:00:00,NaT


In [25]:

# Flag a transaction as a duplicate when the gap to the previous transaction in
# its group (TimeDiff) is strictly less than 24 hours. Rows whose TimeDiff is
# NaT (the first of their group) compare as False.
df['IsDuplicate'] = df['TimeDiff'] < pd.Timedelta(hours=24)

df


Unnamed: 0,TransactionID,Product,Customer,Amount,Timestamp,TimeDiff,IsDuplicate
0,1,A,X,100,2021-01-01 10:00:00,NaT,False
1,2,A,X,100,2021-01-01 11:00:00,0 days 01:00:00,True
2,3,B,Y,150,2021-01-01 12:00:00,NaT,False
3,4,A,X,100,2021-01-02 09:30:00,0 days 22:30:00,True
4,5,B,Y,150,2021-01-02 12:30:00,1 days 00:30:00,False
5,6,B,Z,200,2021-01-02 13:00:00,NaT,False
6,7,A,X,100,2021-01-03 10:30:00,1 days 01:00:00,False
7,8,C,Z,200,2021-01-03 11:00:00,NaT,False


In [27]:
# Count missing values per column. TimeDiff is NaT for the first transaction of
# each group, while IsDuplicate has no gaps because the 24-hour comparison
# evaluated to False for those rows.
df.isna().sum()

TransactionID    0
Product          0
Customer         0
Amount           0
Timestamp        0
TimeDiff         4
IsDuplicate      0
dtype: int64