In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

df = pd.read_csv("transactions.csv")
df.head()

df.info()



"""# **Step 2 — Feature Engineering: Create Data Features**"""

# Date Columns

df['TransactionDate'] = pd.to_datetime(df['TransactionDate'])

df['DayOfWeek'] = df['TransactionDate'].dt.dayofweek
df['Month'] = df['TransactionDate'].dt.month
df['Day'] = df['TransactionDate'].dt.day

df[['TransactionDate','DayOfWeek','Month','Day']].head()

# Choose Target

df['HighAmount'] = (df['Amount'] > 2000).astype(int)
df[['Amount','HighAmount']].head()

# Select Features

features = ['Age', 'Amount', 'Gender', 'ProductCategory', 'PaymentType',
            'DayOfWeek', 'Month', 'Day']
target = 'HighAmount'

"""# **Logistic Regression Model :**
### **Y = P(x) = (e^(β0 + β1x1)) / (1 + e^(β0 + β1x1))**
"""

# Split

X_train, X_test, y_train, y_test = train_test_split(
    df[features], df[target], test_size=0.2, random_state=42)



"""# **Step 3 — Create Preprocessing Pipeline**"""

numeric_features = ['Age', 'Amount', 'DayOfWeek', 'Month', 'Day']
categorical_features = ['Gender', 'ProductCategory', 'PaymentType']

numeric_transformer = Pipeline([
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline([
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features)
])



"""# **Step 4 — Final Pipeline**"""

Final_pipeline = Pipeline([
    ('preprocess', preprocessor),
    ('classifier', LogisticRegression())
])



"""# **Step 5 — Train the model**"""

Final_pipeline.fit(X_train, y_train)

print("Fitted pipeline.")



"""# **Step 6 — Evaluate**"""

from sklearn.metrics import roc_auc_score, average_precision_score

proba = Final_pipeline.predict_proba(X_test)[:,1]
print("Accuracy:", round(Final_pipeline.score(X_test, y_test),3))
print("AUROC:", round(roc_auc_score(y_test, proba),3))
print("AUPRC:", round(average_precision_score(y_test, proba),3))

from sklearn.metrics import confusion_matrix, classification_report
pred = (proba >= 0.5).astype(int)
print(confusion_matrix(y_test, pred))

print(classification_report(y_test, pred, zero_division=0))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1200 entries, 0 to 1199
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   TransactionID    1200 non-null   object 
 1   CustomerID       1200 non-null   object 
 2   Age              1200 non-null   int64  
 3   Gender           1200 non-null   object 
 4   ProductCategory  1200 non-null   object 
 5   PaymentType      1200 non-null   object 
 6   Amount           1200 non-null   float64
 7   TransactionDate  1200 non-null   object 
dtypes: float64(1), int64(1), object(6)
memory usage: 75.1+ KB
Fitted pipeline.
Accuracy: 0.979
AUROC: 0.997
AUPRC: 1.0
[[ 27   2]
 [  3 208]]
              precision    recall  f1-score   support

           0       0.90      0.93      0.92        29
           1       0.99      0.99      0.99       211

    accuracy                           0.98       240
   macro avg       0.95      0.96      0.95       240
weighted avg       

In [13]:
import os 
os.listdir()

['.ipynb_checkpoints', 'Data Pipeline .ipynb', 'transactions.csv']

In [14]:
os.getcwd()


'C:\\Users\\HP\\New folder'

In [16]:
df[['TransactionDate','DayOfWeek','Month','Day']].head()

Unnamed: 0,TransactionDate,DayOfWeek,Month,Day
0,2024-12-02,0,12,2
1,2024-12-21,5,12,21
2,2025-05-19,0,5,19
3,2024-12-15,6,12,15
4,2025-02-06,3,2,6
