# **Data collection**

In [11]:
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix,classification_report
import pandas as pd



# Load the input (X) and output target data (Y)
X = pd.read_csv('/content/X.csv')
Y = pd.read_csv('/content/Y.csv')

# Check the first few rows of each to understand the structure
X.head(), Y.head()


(                                 ID  Column0  Column1  Column2   Column3  \
 0  982273a473c6b975daba8969983e4d1d      0.0     1902   6897.0       NaN   
 1  1adcc94821b675c9bff2140936920465      0.0     2495   3876.0  0.678139   
 2  d329b1c3ae56df6ca255f4e690879eb8      0.0      441   1970.0  0.678139   
 3  1fa819006a76a06625f63d0acd6ee3d7      0.0     1307    116.0  0.001462   
 4  ce3ff871e4f9a8bca9f05a420d575aa1      0.0     1559   2501.0  0.678139   
 
     Column4   Column5   Column6   Column7   Column8  ...  Column12  Column13  \
 0       NaN       NaN -0.407939 -0.015607 -0.774979  ...         0         0   
 1  0.701403 -0.007468 -0.371375 -0.015606  0.488362  ...         0         0   
 2 -0.577162 -0.007469 -0.407939 -0.015607 -0.774979  ...         1         1   
 3  0.062121 -0.007469 -0.407939 -0.015607 -0.774979  ...         1         1   
 4  0.701403 -0.007468 -0.407939 -0.015607 -0.050910  ...         1         0   
 
    Column14  Column15  Column16  Column17  Colu

# **Data splitting**

In [5]:
from sklearn.model_selection import train_test_split

data = pd.merge(X, Y, on='ID')

data = data.drop(columns=['ID'])

X = data.drop(columns=['target'])
Y = data['target']

# Split the data into training and testing sets (80:20 ratio)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

X_train.shape, X_test.shape, Y_train.shape, Y_test.shape


((41873, 22), (10469, 22), (41873,), (10469,))

# **pipeline with scaling, imputation, and logistic regression**

In [6]:

pipeline = Pipeline([
    ('scaler', StandardScaler()),  # Feature scaling
    ('imputer', SimpleImputer(strategy='mean')),  # Impute missing values with the mean
    ('logistic_regression', LogisticRegression(random_state=42, max_iter=1000))  # Logistic Regression model
])

#Hyperparameter tuning using GridSearchCV
param_grid = {
    'logistic_regression__C': [0.1, 1, 10, 100],  # Regularization strength
    'logistic_regression__solver': ['liblinear', 'saga']  # Solver variations
}

grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, Y_train)



# **Evaluate the best model found by GridSearchCV**

In [14]:
best_model = grid_search.best_estimator_
Y_pred = best_model.predict(X_test)

accuracy = accuracy_score(Y_test, Y_pred)
precision = precision_score(Y_test, Y_pred)
recall = recall_score(Y_test, Y_pred)
f1 = f1_score(Y_test, Y_pred)
conf_matrix = confusion_matrix(Y_test, Y_pred)


print(f'Best Hyperparameters: {grid_search.best_params_}')

print('Confusion Matrix:')
print(conf_matrix)

report = classification_report(Y_test, Y_pred)
print('Classification Report:')
print(report)

cross_val_scores = cross_val_score(best_model, X_train, Y_train, cv=5)
print(f'Cross-Validation Accuracy Scores: {cross_val_scores}')
print(f'Mean Cross-Validation Accuracy: {cross_val_scores.mean()}')

Best Hyperparameters: {'logistic_regression__C': 10, 'logistic_regression__solver': 'liblinear'}
Confusion Matrix:
[[9277  241]
 [ 112  839]]
Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.97      0.98      9518
           1       0.78      0.88      0.83       951

    accuracy                           0.97     10469
   macro avg       0.88      0.93      0.90     10469
weighted avg       0.97      0.97      0.97     10469

Cross-Validation Accuracy Scores: [0.97050746 0.96823881 0.9678806  0.96859326 0.97122044]
Mean Cross-Validation Accuracy: 0.9692881149542473
