In [1]:
!pip install fuzzywuzzy

import pandas as pd
import numpy as np

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from scipy.sparse import csr_matrix, hstack

from fuzzywuzzy import process





# 1. Cleaning Customer Data

In [2]:
df = pd.read_csv("a.case_cust_data_mod_v1.csv")
df.head()

Unnamed: 0,customer_unique_key,customer_salary,customer_age_year,customer_pincode,customer_city
0,3207157596,23154.0,27,122505.0,Delhi
1,3114584060,88419.0,47,411048.0,Pune
2,3343473429,73334.0,37,500081.0,Hyderabad
3,2956923386,54173.0,35,411014.0,Pune
4,3211707503,43183.0,41,560100.0,Bengaluru


In [3]:
df.isna().sum()

customer_unique_key     0
customer_salary        34
customer_age_year       0
customer_pincode       44
customer_city           1
dtype: int64

### a. Handling Out of Range Values - Age

In [4]:
df["customer_age_year"].describe()

count    29871.000000
mean        32.610224
std         57.979449
min      -9876.000000
25%         28.000000
50%         32.000000
75%         37.000000
max        190.000000
Name: customer_age_year, dtype: float64

In [5]:
out_of_range_age = df.loc[(df["customer_age_year"] < 15) | (df["customer_age_year"] > 90)].index
df.loc[out_of_range_age]

Unnamed: 0,customer_unique_key,customer_salary,customer_age_year,customer_pincode,customer_city
17,2364497836,70261.0,145,560093.0,Bengaluru
27,2819474678,27601.0,190,122001.0,Delhi
29,2425746314,52798.0,-22,411038.0,Pune
47,2895599819,33105.0,-10,411014.0,Pune
63,3199676021,36429.0,-10,122002.0,Delhi
80,2627947350,51111.0,-10,201307.0,Delhi
102,2408865558,68079.0,-10,110027.0,Delhi
125,2810364118,57550.0,-10,560016.0,Bengaluru
166,3217837462,34861.0,-10,603103.0,Chennai
206,2831377903,98632.0,-10,122001.0,Delhi


In [6]:
average_age = df["customer_age_year"].mean().round().astype(int)
df.loc[out_of_range_age, "customer_age_year"] = average_age

In [7]:
df.loc[out_of_range_age]

Unnamed: 0,customer_unique_key,customer_salary,customer_age_year,customer_pincode,customer_city
17,2364497836,70261.0,33,560093.0,Bengaluru
27,2819474678,27601.0,33,122001.0,Delhi
29,2425746314,52798.0,33,411038.0,Pune
47,2895599819,33105.0,33,411014.0,Pune
63,3199676021,36429.0,33,122002.0,Delhi
80,2627947350,51111.0,33,201307.0,Delhi
102,2408865558,68079.0,33,110027.0,Delhi
125,2810364118,57550.0,33,560016.0,Bengaluru
166,3217837462,34861.0,33,603103.0,Chennai
206,2831377903,98632.0,33,122001.0,Delhi


### b. Data Quality Check - City

In [8]:
df["customer_city"].value_counts()

Bengaluru       6094
Delhi           6088
Hyderabad       4025
Mumbai          2885
Chennai         2326
                ... 
Bhatinda           1
Kanpur City        1
Prakasam           1
Saran              1
Nagapattinam       1
Name: customer_city, Length: 195, dtype: int64

### c. Handling Missing Values - City

In [9]:
display(df.loc[df["customer_city"].isna()])
df.loc[df["customer_pincode"] == 396155.0, "customer_city"] = "Valsad"

Unnamed: 0,customer_unique_key,customer_salary,customer_age_year,customer_pincode,customer_city
10480,2979907847,43397.0,37,396155.0,


In [10]:
df.isna().sum()

customer_unique_key     0
customer_salary        34
customer_age_year       0
customer_pincode       44
customer_city           0
dtype: int64

### d. Handling Missing Values - Pincode

In [11]:
missing_pincode_index = df.loc[df["customer_pincode"].isna()].index
missing_salary_index = df.loc[df["customer_salary"].isna()].index

In [12]:
def fill_pincode(group):
    most_frequent_pincode = group['customer_pincode'].mode().iloc[0]
    group['customer_pincode'].fillna(most_frequent_pincode, inplace=True)
    return group

In [13]:
df = df.groupby('customer_city').apply(fill_pincode)

To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  df = df.groupby('customer_city').apply(fill_pincode)


In [14]:
df.isna().sum()

customer_unique_key     0
customer_salary        34
customer_age_year       0
customer_pincode        0
customer_city           0
dtype: int64

### e. Handling Out of Range Values - Pincode

In [15]:
df["customer_pincode"].describe()

count     29871.000000
mean     423852.473436
std      178695.140880
min           0.000000
25%      302019.000000
50%      500016.000000
75%      560066.000000
max      855107.000000
Name: customer_pincode, dtype: float64

In [16]:
out_of_range_pincode = df[df["customer_pincode"] < 100000].index
df.loc[out_of_range_pincode].count()

customer_unique_key    40
customer_salary        40
customer_age_year      40
customer_pincode       40
customer_city          40
dtype: int64

In [17]:
def replace_out_of_range_pincodes(df, city_col, pincode_col):
    """
    Replaces out-of-range pincode values with the median pincode for each city.
    
    Parameters:
    df (pd.DataFrame): The input DataFrame.
    city_col (str): The column name for the city.
    pincode_col (str): The column name for the pincode.
    
    Returns:
    pd.DataFrame: The DataFrame with out-of-range pincodes replaced.
    """
    # Calculate the median pincode for each city
    median_pincodes = df.groupby(city_col)[pincode_col].median()
    
    # Define a function to apply the median replacement
    def replace_pincode(row):
        city = row[city_col]
        pincode = row[pincode_col]
        median_pincode = median_pincodes.loc[city]
        first_digit = int(str(int(median_pincode))[0])  # Extract the first digit
        lower_bound = first_digit * 100000
        upper_bound = (first_digit + 1) * 100000 - 1
        if not (lower_bound <= pincode <= upper_bound):
            return median_pincode
        return (int(pincode))
    
    # Apply the function to the dataframe
    df[pincode_col] = df.apply(replace_pincode, axis=1)
    
    return df

# Example usage
city_col = 'customer_city'
pincode_col = 'customer_pincode'

df = replace_out_of_range_pincodes(df, city_col, pincode_col)

In [18]:
df.loc[out_of_range_pincode]

Unnamed: 0,customer_unique_key,customer_salary,customer_age_year,customer_pincode,customer_city
898,3039276973,44789.0,52,122002.0,Delhi
1724,2943292611,33299.0,30,122002.0,Delhi
1871,3216955709,94463.0,45,122002.0,Delhi
2061,3187341356,40495.0,37,122002.0,Delhi
2429,3206603016,33692.0,29,500050.0,Hyderabad
3744,2420530102,46383.0,41,122002.0,Delhi
4799,2765914746,38294.0,34,122002.0,Delhi
5425,2823320181,36858.0,39,411016.0,Pune
6005,2731412773,0.0,39,560064.0,Bengaluru
6338,2572568420,53498.0,32,570009.5,Mysore


In [19]:
pincode_check = df.groupby("customer_city")["customer_pincode"].agg({'min', 'max'}).reset_index()
pincode_check.loc[(pincode_check["max"] - pincode_check["min"]) > 99999]

Unnamed: 0,customer_city,min,max


### f. Handling Missing Values - Salary

In [20]:
df[df['customer_salary'].isna()]

Unnamed: 0,customer_unique_key,customer_salary,customer_age_year,customer_pincode,customer_city
586,3204783683,,33,414001.0,Ahmednagar
1116,2435464864,,34,500081.0,Hyderabad
4803,2802959041,,30,560103.0,Bengaluru
5346,2439666369,,30,122002.0,Delhi
6237,2620446467,,23,122002.0,Delhi
6575,3156561965,,29,600096.0,Chennai
6960,2371606970,,40,560103.0,Bengaluru
7148,3054557462,,33,122002.0,Delhi
7548,2484249391,,32,500081.0,Hyderabad
7894,3188345911,,31,600096.0,Chennai


In [21]:
df_salary_missing = df[df['customer_salary'].isna()]
df_salary_not_missing = df.dropna(subset=['customer_salary'])

X = df_salary_not_missing[['customer_age_year', 'customer_city']]
y = df_salary_not_missing['customer_salary']

# One-hot encode the customer_city
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), ['customer_city'])
    ],
    remainder='passthrough'
)

# Create a pipeline that first transforms the data, then fits the model
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
model.fit(X_train, y_train)

# Predict missing customer_salary values
X_missing = df_salary_missing[['customer_age_year', 'customer_city']]
predicted_salaries = model.predict(X_missing)

# Fill NaN values in customer_salary with predicted values
df.loc[df['customer_salary'].isna(), 'customer_salary'] = predicted_salaries

In [22]:
df.loc[missing_salary_index]

Unnamed: 0,customer_unique_key,customer_salary,customer_age_year,customer_pincode,customer_city
586,3204783683,45587.587014,33,414001.0,Ahmednagar
1116,2435464864,61253.455452,34,500081.0,Hyderabad
4803,2802959041,55837.069213,30,560103.0,Bengaluru
5346,2439666369,48432.892961,30,122002.0,Delhi
6237,2620446467,34161.438855,23,122002.0,Delhi
6575,3156561965,45868.447584,29,600096.0,Chennai
6960,2371606970,76224.860793,40,560103.0,Bengaluru
7148,3054557462,54549.230435,33,122002.0,Delhi
7548,2484249391,57175.897136,32,500081.0,Hyderabad
7894,3188345911,49946.0059,31,600096.0,Chennai


In [23]:
df["customer_salary"].describe()

count    2.987100e+04
mean     5.526679e+04
std      9.654519e+04
min     -1.000000e+06
25%      3.147500e+04
50%      4.160700e+04
75%      6.126150e+04
max      1.117007e+07
Name: customer_salary, dtype: float64

### g. Handling Out of Range Values - Salary

In [24]:
out_of_range_salary_index = df.loc[df["customer_salary"] < 10000].index
df.loc[out_of_range_salary_index]

Unnamed: 0,customer_unique_key,customer_salary,customer_age_year,customer_pincode,customer_city
684,3180067238,0.0,34,400708.0,Mumbai
1077,3098117558,0.0,34,400708.0,Mumbai
2411,2451850283,0.0,36,560103.0,Bengaluru
2613,2543091720,67.0,35,122002.0,Delhi
2716,2696333264,0.0,48,560103.0,Bengaluru
5722,2933203728,0.0,35,560103.0,Bengaluru
6005,2731412773,0.0,39,560064.0,Bengaluru
6198,2765261174,0.0,42,560103.0,Bengaluru
6614,2998138862,0.0,34,122002.0,Delhi
7041,2389244590,0.0,42,411001.0,Pune


In [25]:
df["customer_salary"] = abs(df["customer_salary"])
df.loc[df["customer_salary"] < 500, "customer_salary"] = df.loc[df["customer_salary"] < 500, "customer_salary"] * 0
df.loc[out_of_range_salary_index]

Unnamed: 0,customer_unique_key,customer_salary,customer_age_year,customer_pincode,customer_city
684,3180067238,0.0,34,400708.0,Mumbai
1077,3098117558,0.0,34,400708.0,Mumbai
2411,2451850283,0.0,36,560103.0,Bengaluru
2613,2543091720,0.0,35,122002.0,Delhi
2716,2696333264,0.0,48,560103.0,Bengaluru
5722,2933203728,0.0,35,560103.0,Bengaluru
6005,2731412773,0.0,39,560064.0,Bengaluru
6198,2765261174,0.0,42,560103.0,Bengaluru
6614,2998138862,0.0,34,122002.0,Delhi
7041,2389244590,0.0,42,411001.0,Pune


In [26]:
df.loc[df["customer_salary"] > 10000000]

Unnamed: 0,customer_unique_key,customer_salary,customer_age_year,customer_pincode,customer_city
24303,3185398160,11170068.0,26,141012.0,Ludhiana City


In [27]:
df.to_csv("cleaned_data/customer_data_cleaned.csv", index=False)

# 2. Cleaning Txn Data

In [28]:
df = pd.read_csv("b.case_data_sc_txn_mod_v1.csv")
df.head()

Unnamed: 0,transaction_id,customer_unique_key,transaction_status,transaction_merchant_category,transaction_amount,transaction_date,transaction_time
0,10512,2869834181,Fail,Shopping,8078.0,01-08-2021,11:06:14
1,88884,2523046473,Fail,Online Food,2138.0,01-08-2021,11:40:54
2,170010,2905056225,Fail,Shopping,122.0,01-08-2021,14:24:33
3,191864,3631953987,Fail,Shopping,288.0,01-08-2021,14:39:53
4,183790,2683540497,Fail,Shopping,3561.0,01-08-2021,17:31:17


In [29]:
df.dtypes

transaction_id                     int64
customer_unique_key                int64
transaction_status                object
transaction_merchant_category     object
transaction_amount               float64
transaction_date                  object
transaction_time                  object
dtype: object

In [30]:
df.isna().sum()

transaction_id                      0
customer_unique_key                 0
transaction_status                  0
transaction_merchant_category    1617
transaction_amount                 10
transaction_date                    0
transaction_time                    0
dtype: int64

### a. Handling Missing Values - Transaction Amount

In [31]:
df.loc[df["transaction_amount"].isna()]

Unnamed: 0,transaction_id,customer_unique_key,transaction_status,transaction_merchant_category,transaction_amount,transaction_date,transaction_time
20,982380,2871007133,Fail,Misc,,03-08-2021,10:08:25
89,2206310,2697690375,Fail,Shopping,,05-08-2021,14:01:43
160,7299520,3538707176,Fail,Shopping,,07-08-2021,23:02:26
320,23265920,3541396186,Fail,Online Food,,11-08-2021,21:10:30
51918,1048472480,2647087795,Success,Ecom,,03-10-2021,17:13:58
52059,1314852616,3810673294,Success,Grocery,,03-10-2021,19:20:47
52498,1012462500,3064615229,Success,Ecom,,04-10-2021,02:54:01
52499,3089053838,3133532984,Success,Travel,,04-10-2021,02:56:49
106338,3878541896,3238672575,Success,Grocery,,26-11-2021,16:52:12
111420,7762199775,3690266985,Success,Shopping,,06-07-2021,17:13:04


In [32]:
df.dropna(subset=['transaction_amount'], inplace=True)

In [33]:
df.isna().sum()

transaction_id                      0
customer_unique_key                 0
transaction_status                  0
transaction_merchant_category    1617
transaction_amount                  0
transaction_date                    0
transaction_time                    0
dtype: int64

### b. Data Cleaning / Combining Categories - Transaction Merchant Category

In [34]:
print(df['transaction_merchant_category'].unique())

['Shopping' 'Online Food' 'Grocery' 'Shopp$#2ing' 'Other' 'Ecom' 'Misc'
 'Gro%cery' 'ShOpping' 'Telecom' 'OnlinE Food' 'ShoppinG' 'ShoPping'
 'EcoM' 'EcoN' 'Travel' 'Shopping12$#' 'Shopping#$5' 'Fashion'
 'Shopping^$%' 'Shopping$%' 'Online Food$%7' 'Ecom$%^' nan 'Ec&om'
 'Gr^5ocery' 'E#%com' 'Te#$lecom' 'Shopping&%$' 'Shop#$%ping'
 'Fashion#$45' 'Shopping@24$' 'Travel$%' 'Other&%$' 'Fashion&%$'
 'Shopping#$#' 'Online Food#$#' 'Shopping#@#' 'Ecom#$#']


In [35]:
df['transaction_merchant_category'].value_counts()

Ecom              46478
Shopping          29942
Other             27397
Online Food       22187
Misc              10070
Fashion            7617
Travel             6879
Grocery            4014
Telecom            3904
Shopping&%$           3
ShoppinG              2
Online Food#$#        2
Gr^5ocery             1
Other&%$              1
Travel$%              1
Shopping@24$          1
Fashion&%$            1
Fashion#$45           1
Shop#$%ping           1
Shopping#$#           1
Shopping#@#           1
Te#$lecom             1
E#%com                1
Shopping^$%           1
Ec&om                 1
Ecom$%^               1
Online Food$%7        1
Shopping$%            1
Shopping#$5           1
Shopping12$#          1
EcoN                  1
EcoM                  1
ShoPping              1
OnlinE Food           1
ShOpping              1
Gro%cery              1
Shopp$#2ing           1
Ecom#$#               1
Name: transaction_merchant_category, dtype: int64

In [36]:
df.loc[df["transaction_merchant_category"].isna()]

Unnamed: 0,transaction_id,customer_unique_key,transaction_status,transaction_merchant_category,transaction_amount,transaction_date,transaction_time
380,15891600,2383149500,Fail,,331.0,13-08-2021,11:19:56
571,7302519,3185437722,Fail,,472.0,15-08-2021,14:22:54
643,33739496,2610203098,Fail,,14669.0,16-08-2021,14:33:46
948,37827096,2977301403,Fail,,417.0,19-08-2021,14:15:37
956,25884656,2615097990,Fail,,9650.0,19-08-2021,16:28:42
...,...,...,...,...,...,...,...
159906,4491692520,2673696941,Success,,575.0,04-05-2021,22:06:10
159968,12958137000,3207529517,Success,,2122.0,04-05-2021,1900-01-01T00:04:37
160101,11061679680,2987317832,Success,,8089.0,04-05-2021,1900-01-01T03:29:32
160122,10009308417,3202082340,Success,,1352.0,04-05-2021,1900-01-01T03:51:49


In [37]:
def standardize_categories(df, column, valid_categories, threshold=80):
    """
    Standardize categories in a DataFrame column using fuzzy matching.
    
    Parameters:
    df (pd.DataFrame): The input DataFrame.
    column (str): The column name to standardize.
    valid_categories (list): List of valid category names.
    threshold (int): The matching threshold for fuzzy matching.
    
    Returns:
    pd.DataFrame: The DataFrame with standardized categories.
    """
    # Replace NaN values with a placeholder
    df[column].fillna('Unknown', inplace=True)
    
    # Create a dictionary to map incorrect categories to correct ones
    category_mapping = {}
    
    # Iterate over unique values in the column
    for category in df[column].unique():
        # Find the best match in the valid categories
        match, score = process.extractOne(category, valid_categories)
        # If the score is above the threshold, map it to the best match
        if score >= threshold:
            category_mapping[category] = match
        else:
            category_mapping[category] = category  # Keep the original if no good match is found
    
    # Replace the values in the column using the mapping dictionary
    df[column] = df[column].replace(category_mapping)
    
    # Revert the placeholder back to NaN if needed
    df[column].replace('Unknown', pd.NA, inplace=True)
    
    return df

# List of valid categories
valid_categories = ['Ecom', 'Shopping', 'Other', 'Online Food', 'Misc', 
                    'Fashion', 'Travel', 'Grocery', 'Telecom']

# Standardize the categories in the 'transaction_merchant_category' column
df = standardize_categories(df, 'transaction_merchant_category', valid_categories)

# Strategy 1: Replace NaN values with 'Other'
df['transaction_merchant_category'].fillna('Unknown', inplace=True)

# Output the unique values in the 'transaction_merchant_category' column to verify the changes
print(df['transaction_merchant_category'].unique())

['Shopping' 'Online Food' 'Grocery' 'Other' 'Ecom' 'Misc' 'Telecom' 'EcoN'
 'Travel' 'Fashion' 'Unknown']


In [38]:
# Combine 'Ecom' and 'EcoN' into 'Ecom'
df['transaction_merchant_category'] = df['transaction_merchant_category'].replace({'EcoN': 'Ecom'})
df['transaction_merchant_category'].unique()

array(['Shopping', 'Online Food', 'Grocery', 'Other', 'Ecom', 'Misc',
       'Telecom', 'Travel', 'Fashion', 'Unknown'], dtype=object)

In [39]:
df['transaction_merchant_category'].value_counts()

Ecom           46485
Shopping       29958
Other          27398
Online Food    22191
Misc           10070
Fashion         7619
Travel          6880
Grocery         4016
Telecom         3904
Unknown         1617
Name: transaction_merchant_category, dtype: int64

In [40]:
df.isna().sum(), df.columns

(transaction_id                   0
 customer_unique_key              0
 transaction_status               0
 transaction_merchant_category    0
 transaction_amount               0
 transaction_date                 0
 transaction_time                 0
 dtype: int64,
 Index(['transaction_id', 'customer_unique_key', 'transaction_status',
        'transaction_merchant_category', 'transaction_amount',
        'transaction_date', 'transaction_time'],
       dtype='object'))

### c. Handling Out of Range Values - Transaction Amount

In [41]:
df["transaction_amount"].sort_values(ascending=False).head(25)

122626    9999999.0
122521    9999999.0
122572    9999999.0
122619    9999999.0
122544    9999999.0
122709    9999999.0
122793    9999999.0
122835    9999999.0
122595    9999999.0
122756    9999999.0
6252       201330.0
3210       198595.0
3256       190709.0
4653       177050.0
11792      177037.0
6028       177037.0
143887     158684.0
78692      158684.0
4650       154667.0
11093      154667.0
9558       153523.0
1655       153523.0
128380     148982.0
47693      148982.0
80906      147951.0
Name: transaction_amount, dtype: float64

In [42]:
df["transaction_amount"].sort_values().head(25)

928      -9999999.0
800      -9999999.0
554      -9999999.0
501      -9999000.0
122527    -999999.0
51707      -99999.0
39936      -99999.0
39984      -99999.0
106418     -99999.0
40174      -99999.0
51797      -99999.0
428         -9999.0
12619           1.0
3943            1.0
7309            1.0
7313            1.0
5763            1.0
3935            1.0
7328            1.0
11625           1.0
12091           1.0
7343            1.0
12095           1.0
5037            1.0
792             1.0
Name: transaction_amount, dtype: float64

In [43]:
df.loc[df["transaction_amount"] < 0]

Unnamed: 0,transaction_id,customer_unique_key,transaction_status,transaction_merchant_category,transaction_amount,transaction_date,transaction_time
428,15217540,3157198630,Fail,Ecom,-9999.0,13-08-2021,18:51:45
501,17600631,3470866048,Fail,Shopping,-9999000.0,14-08-2021,16:30:12
554,30246184,3931416708,Fail,Shopping,-9999999.0,15-08-2021,09:14:22
800,53316000,3233544252,Fail,Shopping,-9999999.0,17-08-2021,19:47:39
928,11338304,3275234777,Fail,Ecom,-9999999.0,19-08-2021,09:55:20
39936,1654551464,2368864072,Success,Shopping,-99999.0,17-09-2021,21:06:22
39984,3085079844,3415052334,Success,Ecom,-99999.0,17-09-2021,22:00:07
40174,3210665040,3020556447,Success,Telecom,-99999.0,18-09-2021,10:17:34
51707,883758519,3049723091,Success,Ecom,-99999.0,03-10-2021,14:28:40
51797,4069847430,2430365993,Success,Other,-99999.0,03-10-2021,15:26:19


In [44]:
df.shape

(160138, 7)

In [45]:
df = df[~((df['transaction_amount'] < 0) | (df['transaction_amount'] == 9999999))]
df.shape

(160116, 7)

In [46]:
df.loc[df["transaction_amount"] < 0]

Unnamed: 0,transaction_id,customer_unique_key,transaction_status,transaction_merchant_category,transaction_amount,transaction_date,transaction_time


### d. Data Quality Checks - Status / Date / Time columns

In [47]:
df["transaction_status"].unique()

array(['Fail', 'Success'], dtype=object)

In [48]:
df.iloc[[122527]]

Unnamed: 0,transaction_id,customer_unique_key,transaction_status,transaction_merchant_category,transaction_amount,transaction_date,transaction_time
122551,2728709340,3025930415,Success,Online Food,869.0,13-06-2021,1900-01-01T02:03:41


In [49]:
df["transaction_date"].dtype, df['transaction_time'].dtype

(dtype('O'), dtype('O'))

In [50]:
# Convert 'transaction_date' column to datetime.date
df['transaction_date'] = pd.to_datetime(df['transaction_date']).dt.date

# Convert 'transaction_time' column to datetime.time
df['transaction_time'] = pd.to_datetime(df['transaction_time']).dt.time

df["transaction_date"].dtype, df['transaction_time'].dtype

  df['transaction_date'] = pd.to_datetime(df['transaction_date']).dt.date


(dtype('O'), dtype('O'))

In [51]:
df.iloc[[122527]]

Unnamed: 0,transaction_id,customer_unique_key,transaction_status,transaction_merchant_category,transaction_amount,transaction_date,transaction_time
122551,2728709340,3025930415,Success,Online Food,869.0,2021-06-13,02:03:41


In [52]:
df.to_csv("cleaned_data/transaction_data_cleaned.csv", index=False)