# In[None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from scipy.sparse import csr_matrix, vstack, hstack
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
import lightgbm as lgb
from sklearn.metrics import classification_report, roc_auc_score
from collections import Counter

# Function to process data in chunks and convert to sparse matrix
def process_data_chunks(file_path, chunk_size, scaler, encoder, sample_frac, is_training=True,
                        *, include_time_features=False):
    """Stream a CSV file in chunks and assemble one sparse feature matrix.

    Each chunk is optionally down-sampled, its numerical columns are passed
    through ``scaler.transform`` and its categorical columns through
    ``encoder.transform``; the per-chunk results are stacked column-wise and
    then row-wise.

    Args:
        file_path: Path of the CSV file to read.
        chunk_size: Number of rows read per chunk.
        scaler: Already-fitted object exposing ``transform`` for the columns
            ``amt``, ``lat``, ``long`` and ``city_pop``.
        encoder: Already-fitted object exposing ``transform`` for the columns
            ``merchant``, ``category`` and ``gender``; its output is stacked
            with ``scipy.sparse.hstack``.
        sample_frac: Fraction of every chunk that is kept when
            ``is_training`` is true. Ignored otherwise.
        is_training: When true, each chunk is sampled with ``sample_frac``
            and a fixed ``random_state=42``, so the selection is reproducible
            for a given chunk length.
        include_time_features: When true, the hour and day of week parsed
            from ``trans_date_trans_time`` (format ``%d-%m-%Y %H:%M``) are
            inserted as two extra columns between the numerical and the
            encoded columns. Defaults to false, which keeps the original
            column layout.

    Returns:
        A scipy sparse matrix with one row per retained input row.

    Raises:
        ValueError: If no rows are left after reading and sampling.
    """
    numerical_features = ['amt', 'lat', 'long', 'city_pop']
    categorical_features = ['merchant', 'category', 'gender']
    sparse_chunks = []

    # The context manager closes the file handle even if a transform raises.
    with pd.read_csv(file_path, chunksize=chunk_size) as reader:
        for chunk in reader:
            if is_training:
                chunk = chunk.sample(frac=sample_frac, random_state=42)

            # A short tail chunk can round down to zero sampled rows; it adds
            # nothing to the result and would make the transforms fail.
            if chunk.empty:
                continue

            blocks = [csr_matrix(scaler.transform(chunk[numerical_features]))]

            if include_time_features:
                # Parse the timestamp once and derive both features from it.
                timestamps = pd.to_datetime(
                    chunk['trans_date_trans_time'], format="%d-%m-%Y %H:%M"
                )
                time_frame = pd.DataFrame({
                    'trans_hour': timestamps.dt.hour,
                    'trans_day_of_week': timestamps.dt.dayofweek,
                })
                blocks.append(csr_matrix(time_frame.to_numpy(dtype=float)))

            blocks.append(encoder.transform(chunk[categorical_features]))
            sparse_chunks.append(hstack(blocks))

    if not sparse_chunks:
        raise ValueError(f"No rows available to build features from {file_path!r}")

    # Concatenate all per-chunk matrices into a single sparse matrix.
    return vstack(sparse_chunks)

# Load a small head sample that is used only to fit the scaler and encoder.
# NOTE(review): the sample comes from 'Test1.csv' rather than the training file and
# covers just 1000 rows; categories missing from it are encoded as all-zero rows
# because of handle_unknown='ignore' -- confirm this is intended.
initial_chunk = pd.read_csv('Test1.csv', nrows=1000)

# Initialize and fit scaler and encoder
numerical_features = ['amt', 'lat', 'long', 'city_pop']
categorical_features = ['merchant', 'category', 'gender']
scaler = StandardScaler()
# Sparse output is the encoder's default. The explicit `sparse=` keyword was renamed
# to `sparse_output` in newer scikit-learn releases, so it is left out to keep this
# line working across versions.
encoder = OneHotEncoder(handle_unknown='ignore')
scaler.fit(initial_chunk[numerical_features])
encoder.fit(initial_chunk[categorical_features])

# Process the training dataset in chunks with a sample fraction
train_file_path = 'fraudtrain.csv'
chunk_size = 500  # smaller chunk size
sample_frac = 0.8  # keep 80% of each training chunk
X_train = process_data_chunks(train_file_path, chunk_size, scaler, encoder, sample_frac, is_training=True)

# Load the training labels so that they line up row-for-row with X_train.
# process_data_chunks samples every chunk separately with random_state=42, so the
# labels are read with the same chunk size and sampled the same way; a seeded
# sample of a chunk of a given length always selects the same row positions.
with pd.read_csv(train_file_path, usecols=['is_fraud'], chunksize=chunk_size) as label_reader:
    y_train = pd.concat(
        [part['is_fraud'].sample(frac=sample_frac, random_state=42) for part in label_reader],
        ignore_index=True,
    )

if X_train.shape[0] != len(y_train):
    raise ValueError(
        f"Feature/label row mismatch: {X_train.shape[0]} feature rows vs {len(y_train)} labels"
    )

# Handle imbalanced data with SMOTE
print("Original training dataset shape:", Counter(y_train))
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)
print("Resampled training dataset shape:", Counter(y_train_res))

# Train LightGBM Classifier
model = lgb.LGBMClassifier(random_state=42, n_estimators=100, learning_rate=0.1, max_depth=3)
model.fit(X_train_res, y_train_res)

# Process the testing dataset in chunks without sampling
test_file_path = 'fraudtest.csv'
X_test = process_data_chunks(test_file_path, chunk_size, scaler, encoder, sample_frac=1.0, is_training=False)

# Load target variable for testing without sampling
test_target = pd.read_csv(test_file_path, usecols=['is_fraud'])
y_test = test_target['is_fraud']

# Predict on the test set: hard labels for the report, positive-class
# probabilities for the ranking metric.
y_pred = model.predict(X_test)
y_score = model.predict_proba(X_test)[:, 1]

# Evaluate the model
print(classification_report(y_test, y_pred))
# ROC AUC measures ranking quality, so it is computed from scores, not 0/1 labels.
print(f'ROC AUC Score: {roc_auc_score(y_test, y_score)}')
