In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from imblearn.under_sampling import RandomUnderSampler
from scipy.sparse import hstack

In [2]:
data = pd.read_csv('final_combined.csv')

In [3]:
data.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,...,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,IsFraud,GeneratedDescription,NeutralDescription
0,0,687728,2019-10-20 14:41:47,3590736522064285,"Reichert, Rowe and Mraz",shopping_net,213.34,Kimberly,Gonzalez,F,...,4074,"Scientist, audiological",1975-12-20,f9ae9fecd1d34a688940a198eba3c94d,1350744107,34.12298,-92.069393,0,"On October 20, 2019, Kimberly Gonzalez, a 43-y...",",,, and the location linked with the merchant ..."
1,1,399382,2019-06-30 20:40:01,4378993458389626,"Dare, Fritsch and Zboncak",health_fitness,19.28,Travis,Hayes,M,...,3807,Surgeon,1999-10-25,813fe5007c3fcda8beeff248fc33934b,1341088801,43.500503,-71.711188,0,"Transaction on June 30, 2019, at 20:40:01: A c...","Transaction on June 30, 2019, at 20:40:01: A c..."
2,2,135829,2019-03-15 23:31:10,4659625317833446400,Gerlach Inc,shopping_net,1094.12,Richard,Moore,M,...,10085,Hospital doctor,1964-11-18,896b9f6baea6ebb23fb2c32f04e512d0,1331854270,40.419938,-87.884809,1,"Transaction Alert: On March 15, 2019, at 23:31...","of $1,094.12 was recorded at Gerlach Inc, a le..."
3,3,51834,2019-01-31 12:55:44,2383461948823908,Schoen Ltd,kids_pets,42.76,Patrick,Vaughan,M,...,1892,Audiological scientist,1956-03-02,29618f988b5399c823e79fd14059a0f4,1328014544,43.545333,-71.283994,0,Patrick Vaughan made a purchase of $42.76 at S...,Patrick Vaughan made a purchase of $42.76 at S...
4,4,349639,2019-06-13 12:08:46,3531129874770000,Schiller Ltd,personal_care,8.58,Shelby,Mitchell,F,...,5895,"Scientist, marine",1975-07-13,ca13f52b435a5e0acca6d7ae062e0cd8,1339589326,42.940987,-73.498177,0,"On June 13, 2019, Shelby Mitchell made a modes...",of the transaction. Keep the summary extremely...


In [4]:
data['IsFraud'].value_counts()

IsFraud
0    10419
1     2980
Name: count, dtype: int64

In [5]:
exclude_columns = ['GeneratedDescription', 'IsFraud']
feature_columns = [col for col in data.columns if col not in exclude_columns]
X = data[feature_columns]
y = data['IsFraud']

In [6]:
X.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,NeutralDescription
0,0,687728,2019-10-20 14:41:47,3590736522064285,"Reichert, Rowe and Mraz",shopping_net,213.34,Kimberly,Gonzalez,F,...,34.5091,-92.4828,4074,"Scientist, audiological",1975-12-20,f9ae9fecd1d34a688940a198eba3c94d,1350744107,34.12298,-92.069393,",,, and the location linked with the merchant ..."
1,1,399382,2019-06-30 20:40:01,4378993458389626,"Dare, Fritsch and Zboncak",health_fitness,19.28,Travis,Hayes,M,...,43.9742,-71.1503,3807,Surgeon,1999-10-25,813fe5007c3fcda8beeff248fc33934b,1341088801,43.500503,-71.711188,"Transaction on June 30, 2019, at 20:40:01: A c..."
2,2,135829,2019-03-15 23:31:10,4659625317833446400,Gerlach Inc,shopping_net,1094.12,Richard,Moore,M,...,39.6591,-87.4208,10085,Hospital doctor,1964-11-18,896b9f6baea6ebb23fb2c32f04e512d0,1331854270,40.419938,-87.884809,"of $1,094.12 was recorded at Gerlach Inc, a le..."
3,3,51834,2019-01-31 12:55:44,2383461948823908,Schoen Ltd,kids_pets,42.76,Patrick,Vaughan,M,...,43.745,-70.9092,1892,Audiological scientist,1956-03-02,29618f988b5399c823e79fd14059a0f4,1328014544,43.545333,-71.283994,Patrick Vaughan made a purchase of $42.76 at S...
4,4,349639,2019-06-13 12:08:46,3531129874770000,Schiller Ltd,personal_care,8.58,Shelby,Mitchell,F,...,43.8065,-73.0882,5895,"Scientist, marine",1975-07-13,ca13f52b435a5e0acca6d7ae062e0cd8,1339589326,42.940987,-73.498177,of the transaction. Keep the summary extremely...


In [7]:
y.value_counts()

IsFraud
0    10419
1     2980
Name: count, dtype: int64

In [8]:
rus = RandomUnderSampler(sampling_strategy=0.4, random_state=42)
X_resampled, y_resampled = rus.fit_resample(X, y)

In [9]:
print(X_resampled.shape[0])

10430


In [10]:
X_resampled.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,NeutralDescription
3455,3456,114003,2019-03-05 15:49:45,180049032966888,Wuckert-Goldner,home,75.7,Michael,Flores,M,...,46.4959,-90.4383,795,Television/film/video producer,1986-04-15,bba0896e411a7c24028d8d4f9c326e11,1330962585,47.491476,-91.22375,: Michael Flores made a purchase at Wuckert-Go...
11685,11686,28890,2019-01-17 23:32:42,4586260469584,Stark-Batz,entertainment,28.36,Melody,Thompson,F,...,40.1362,-95.2138,1631,Architect,1953-01-20,42ea67e3f4e1cbcb91059dbe0a5d1d02,1326843162,40.168422,-95.451159,Melody Thompson enjoyed a delightful evening i...
3121,3122,1119444,2020-04-11 09:27:17,4715741951931168768,"Robel, Cummerata and Prosacco",gas_transport,43.93,Robert,Drake,M,...,36.6966,-96.7869,471,Sub,1941-03-30,9f773e15d813d960ee235ffd58fd23ff,1365672437,37.375919,-95.819717,": Robert Drake Purchase Location: Robel, Cumme..."
11939,11940,798729,2019-12-03 20:51:04,38588538868506,Wuckert-Goldner,home,26.53,Jacqueline,Curry,F,...,30.1886,-103.2214,498,Lexicographer,1990-11-23,ba74059058f321815c5cb3b2e34b865b,1354567864,29.844679,-104.087083,"$26.53, transaction was made online at Wuckert..."
6543,6544,1165176,2020-05-01 09:50:33,4247921790666,Rodriguez Group,gas_transport,77.53,Judith,Moss,F,...,39.537,-83.455,22305,Television floor manager,1939-03-09,e37a4577f6bb466ebc7a678cbe2a96da,1367401833,40.218795,-82.932704,", Judith Moss's transaction on May 1, 2020, fo..."


In [11]:
y_resampled.value_counts()

IsFraud
0    7450
1    2980
Name: count, dtype: int64

In [12]:
tfidf = TfidfVectorizer(max_features=1000)
X_text_tfidf = tfidf.fit_transform(X_resampled['NeutralDescription'])


In [13]:
X_resampled = X_resampled.drop(columns=['NeutralDescription'])
X_encoded = pd.get_dummies(X_resampled, drop_first=True)
X_encoded_sparse = hstack([X_text_tfidf, X_encoded.astype(float)])
# Check the shape of the combined feature matrix
print("Shape of the final feature matrix:", X_encoded_sparse.shape)

# Number of features is the number of columns in the combined matrix
print("Total number of features after hstack:", X_encoded_sparse.shape[1])

Shape of the final feature matrix: (10430, 26771)
Total number of features after hstack: 26771


In [None]:
from sklearn.decomposition import TruncatedSVD
svd = TruncatedSVD(n_components=100)  # Adjust n_components as needed
X_reduced = svd.fit_transform(X_encoded_sparse)


In [14]:
X_train, X_test, y_train, y_test = train_test_split(X_encoded_sparse, y_resampled, test_size=0.3, random_state=42, stratify=y_resampled)


In [15]:
regularizations = ['l1', 'l2', None]
results = {}

In [16]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np

def train_and_test_logistic_regression(X_train, X_test, y_train, y_test, penalty=None, solver='lbfgs', max_iter=1000):
    try:
        # Initialize and train the model
        model = LogisticRegression(penalty=penalty, solver=solver, max_iter=max_iter, class_weight='balanced', random_state=42)
        model.fit(X_train, y_train)
        
        # Make predictions and return classification report
        y_pred = model.predict(X_test)
        report = classification_report(y_test, y_pred, output_dict=True)
        
        # Return model and report for coefficient extraction
        return model, report
    except Exception as e:
        print(f"Error training model with penalty={penalty}: {e}")
        return None, None


# Scale the data
scaler = StandardScaler(with_mean=False)
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train models with different penalties
results = {}
models = {}

# No regularization
models['none'], results['none'] = train_and_test_logistic_regression(X_train_scaled, X_test_scaled, y_train, y_test, penalty=None, solver='lbfgs')

# L1 regularization
models['l1'], results['l1'] = train_and_test_logistic_regression(X_train_scaled, X_test_scaled, y_train, y_test, penalty='l1', solver='liblinear', max_iter=5000)

# L2 regularization
models['l2'], results['l2'] = train_and_test_logistic_regression(X_train_scaled, X_test_scaled, y_train, y_test, penalty='l2', solver='lbfgs', max_iter=1000)

# Create a DataFrame to store classification results
results_df = pd.DataFrame({k: pd.Series(v['macro avg']) for k, v in results.items() if v is not None})
print(results_df)
results_df.to_csv('logistic_regression_results.csv', index=False)
print("Results saved to logistic_regression_results.csv")
pd.set_option('display.max_rows', None)
# Print coefficients for L1 and L2 models
feature_names = [f'Feature_{i}' for i in range(X_train_scaled.shape[1])]  # Replace with actual feature names if available
for reg_type in ['l1', 'l2']:
    if models[reg_type] is not None:
        print(f"\nCoefficients for {reg_type.upper()} regularization:")
        coefficients = models[reg_type].coef_[0]  # Coefficients for the first class (binary classification)
        coeff_df = pd.DataFrame({'Feature': feature_names, 'Coefficient': coefficients})
        print(coeff_df.sort_values(by='Coefficient', ascending=False))



                  none           l1           l2
precision     0.973195     0.985352     0.974419
recall        0.956040     0.971588     0.947427
f1-score      0.964127     0.978163     0.959737
support    3129.000000  3129.000000  3129.000000
Results saved to logistic_regression_results.csv

Coefficients for L1 regularization:
             Feature   Coefficient
1003    Feature_1003  3.327831e+00
762      Feature_762  1.441048e+00
124      Feature_124  1.407017e+00
763      Feature_763  1.152537e+00
312      Feature_312  7.461179e-01
734      Feature_734  7.292909e-01
849      Feature_849  7.128658e-01
714      Feature_714  6.057081e-01
165      Feature_165  5.936864e-01
306      Feature_306  5.710365e-01
735      Feature_735  5.464939e-01
12134  Feature_12134  4.999568e-01
893      Feature_893  4.737949e-01
895      Feature_895  4.439680e-01
468      Feature_468  4.120246e-01
215      Feature_215  4.072055e-01
432      Feature_432  4.034689e-01
835      Feature_835  3.881205e-01
850 

In [17]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd

def train_and_test_logistic_regression(X_train, X_test, y_train, y_test, penalty=None, solver='lbfgs', max_iter=1000):
    
    try:
        
        model = LogisticRegression(penalty=penalty, solver=solver, max_iter=max_iter, class_weight='balanced', random_state=42)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        return classification_report(y_test, y_pred, output_dict=True)
    except Exception as e:
        print(f"Error training model with penalty={penalty}: {e}")
        return None


scaler = StandardScaler(with_mean=False)
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


results = {}
results['none'] = train_and_test_logistic_regression(X_train_scaled, X_test_scaled, y_train, y_test, penalty=None, solver='lbfgs')
results['l1'] = train_and_test_logistic_regression(X_train_scaled, X_test_scaled, y_train, y_test, penalty='l1', solver='liblinear', max_iter=5000)
results['l2'] = train_and_test_logistic_regression(X_train_scaled, X_test_scaled, y_train, y_test, penalty='l2', solver='lbfgs', max_iter=1000)
results_df = pd.DataFrame({k: pd.Series(v['macro avg']) for k, v in results.items() if v is not None})
print(results_df)
results_df.to_csv('logistic_regression_results.csv', index=False)
print("Results saved to logistic_regression_results.csv")


                  none           l1           l2
precision     0.973195     0.985352     0.974419
recall        0.956040     0.971588     0.947427
f1-score      0.964127     0.978163     0.959737
support    3129.000000  3129.000000  3129.000000
Results saved to logistic_regression_results.csv


In [18]:
from sklearn.model_selection import cross_val_score
cv_results = {}


for reg in regularizations:
    if reg:
        solver = 'liblinear' if reg == 'l1' else 'saga'  
        model = LogisticRegression(penalty=reg, solver=solver, max_iter=5000, class_weight='balanced', random_state=42)
    else:
        model = LogisticRegression(penalty=None, solver='lbfgs', max_iter=5000, class_weight='balanced', random_state=42)
    cv_scores = cross_val_score(model, X_train_scaled, y_train, cv=5, scoring='f1_macro')
    cv_results[reg or 'none'] = {
        'cv_scores': cv_scores,
        'mean_cv_score': cv_scores.mean()
    }
for reg, results in cv_results.items():
    print(f"Regularization: {reg or 'none'}")
    print(f"Cross-validation scores: {results['cv_scores']}")
    print(f"Mean CV Score: {results['mean_cv_score']:.4f}")
    print("-" * 40)


Regularization: l1
Cross-validation scores: [0.97799444 0.975283   0.96665884 0.98212844 0.97191612]
Mean CV Score: 0.9748
----------------------------------------
Regularization: l2
Cross-validation scores: [0.96661657 0.9564783  0.95193226 0.95854981 0.9573144 ]
Mean CV Score: 0.9582
----------------------------------------
Regularization: none
Cross-validation scores: [0.97217247 0.96832026 0.95765298 0.96577783 0.96650492]
Mean CV Score: 0.9661
----------------------------------------


In [19]:
from sklearn.model_selection import cross_val_score
cv_results = {}


for reg in regularizations:
    if reg:
        solver = 'liblinear' if reg == 'l1' else 'saga'  
        model = LogisticRegression(penalty=reg, solver=solver, max_iter=5000, class_weight='balanced', random_state=42)
    else:
        model = LogisticRegression(penalty=None, solver='lbfgs', max_iter=5000, class_weight='balanced', random_state=42)
    cv_scores = cross_val_score(model, X_test, y_test, cv=5, scoring='f1_macro')
    cv_results[reg or 'none'] = {
        'cv_scores': cv_scores,
        'mean_cv_score': cv_scores.mean()
    }
for reg, results in cv_results.items():
    print(f"Regularization: {reg or 'none'}")
    print(f"Cross-validation scores: {results['cv_scores']}")
    print(f"Mean CV Score: {results['mean_cv_score']:.4f}")
    print("-" * 40)




Regularization: l1
Cross-validation scores: [0.96443182 0.96467446 0.96225888 0.9511833  0.97241349]
Mean CV Score: 0.9630
----------------------------------------
Regularization: l2
Cross-validation scores: [0.416589   0.416589   0.416589   0.416589   0.41697761]
Mean CV Score: 0.4167
----------------------------------------
Regularization: none
Cross-validation scores: [0.416589   0.416589   0.416589   0.416589   0.41697761]
Mean CV Score: 0.4167
----------------------------------------


In [20]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Define the logistic regression model
model = LogisticRegression(penalty='l2', solver='saga', max_iter=5000, class_weight='balanced', random_state=42)

# Perform cross-validation on test data
# Here, X_test and y_test should already be defined in the notebook
cv_scores = cross_val_score(model, X_test, y_test, cv=5, scoring='f1_macro')  # Using F1-macro for imbalanced classes

# Print cross-validation results
print("Cross-Validation Scores on Test Data:", cv_scores)
print("Mean Cross-Validation Score:", cv_scores.mean())
print("Standard Deviation of CV Scores:", cv_scores.std())


Cross-Validation Scores on Test Data: [0.416589   0.416589   0.416589   0.416589   0.41697761]
Mean Cross-Validation Score: 0.41666672462477916
Standard Deviation of CV Scores: 0.0001554436577596796
