In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


In [2]:
# Step 1: Load the Data
file_path = '/Users/rohanpadaya/Desktop/pankit_assign/Stock_data_part1.csv'  
stock_data_1 = pd.read_csv(file_path, low_memory=False)


In [3]:
stock_data_1.head()

Unnamed: 0,PERMNO,date,SHRCD,TICKER,COMNAM,PERMCO,BIDLO,ASKHI,PRC,VOL,RET,BID,ASK,SHROUT,OPENPRC,NUMTRD,sprtrn
0,10026,2019-08-20,11.0,JJSF,J & J SNACK FOODS CORP,7976,190.97,196.74001,191.13,136698.0,-0.020298,191.0,191.13,18841.0,194.7,1903.0,-0.007915
1,10026,2019-08-21,11.0,JJSF,J & J SNACK FOODS CORP,7976,188.5025,192.56,189.35001,101583.0,-0.009313,189.17,189.35001,18841.0,191.98,2252.0,0.008247
2,10026,2019-08-22,11.0,JJSF,J & J SNACK FOODS CORP,7976,187.98,190.39,189.32001,92198.0,-0.000158,189.32001,189.5,18841.0,188.89999,1805.0,-0.000506
3,10026,2019-08-23,11.0,JJSF,J & J SNACK FOODS CORP,7976,185.59,190.35001,186.14999,75522.0,-0.016744,186.14,186.28,18841.0,189.22,1629.0,-0.025946
4,10026,2019-08-26,11.0,JJSF,J & J SNACK FOODS CORP,7976,186.7,191.39999,191.23,81788.0,0.02729,191.03999,191.25,18841.0,187.11,2070.0,0.010983


In [4]:
# Step 2: Convert RET to numeric, coercing errors
stock_data_1['RET'] = pd.to_numeric(stock_data_1['RET'], errors='coerce')

# Check for any other object types that should be numeric
stock_data_1['SHRCD'] = pd.to_numeric(stock_data_1['SHRCD'], errors='coerce')

# Step 3: Convert 'date' column to datetime
stock_data_1['date'] = pd.to_datetime(stock_data_1['date'])


In [5]:
stock_data_1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1938801 entries, 0 to 1938800
Data columns (total 17 columns):
 #   Column   Dtype         
---  ------   -----         
 0   PERMNO   int64         
 1   date     datetime64[ns]
 2   SHRCD    float64       
 3   TICKER   object        
 4   COMNAM   object        
 5   PERMCO   int64         
 6   BIDLO    float64       
 7   ASKHI    float64       
 8   PRC      float64       
 9   VOL      float64       
 10  RET      float64       
 11  BID      float64       
 12  ASK      float64       
 13  SHROUT   float64       
 14  OPENPRC  float64       
 15  NUMTRD   float64       
 16  sprtrn   float64       
dtypes: datetime64[ns](1), float64(12), int64(2), object(2)
memory usage: 251.5+ MB


In [6]:
# Drop rows with NaN in critical columns (RET, PRC, BID, ASK)
stock_data_1_cleaned = stock_data_1.dropna(subset=['RET', 'PRC', 'BID', 'ASK', 'TICKER'])

# Drop the NUMTRD column due to excessive missing values
stock_data_1_cleaned = stock_data_1_cleaned.drop(columns=['NUMTRD'])

# Consider dropping or filling OPENPRC based on its necessity
# Example: Drop OPENPRC if not essential
stock_data_1_cleaned = stock_data_1_cleaned.drop(columns=['OPENPRC'])

# Handle remaining columns with moderate NaN values
# Example: Drop rows with missing BIDLO, ASKHI, VOL
stock_data_1_cleaned = stock_data_1_cleaned.dropna(subset=['BIDLO', 'ASKHI', 'VOL'])

# Optionally fill missing values in columns with very few NaNs, like SHRCD, COMNAM, and SHROUT
# For example, filling with the mode or a placeholder value
stock_data_1_cleaned['SHRCD'].fillna(stock_data_1_cleaned['SHRCD'].mode()[0], inplace=True)
stock_data_1_cleaned['COMNAM'].fillna("Unknown", inplace=True)
stock_data_1_cleaned['SHROUT'].fillna(stock_data_1_cleaned['SHROUT'].median(), inplace=True)


In [7]:
# Check the shape of the cleaned data
print("Shape of the cleaned data:", stock_data_1_cleaned.shape)


Shape of the cleaned data: (1926738, 15)


In [8]:
nan_counts_2 = stock_data_1_cleaned.isna().sum()

# Display the number of NaN values for each column
print("Number of NaN values in each column:")
print(nan_counts_2)

Number of NaN values in each column:
PERMNO    0
date      0
SHRCD     0
TICKER    0
COMNAM    0
PERMCO    0
BIDLO     0
ASKHI     0
PRC       0
VOL       0
RET       0
BID       0
ASK       0
SHROUT    0
sprtrn    0
dtype: int64


In [9]:
stock_data_1_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1926738 entries, 0 to 1938800
Data columns (total 15 columns):
 #   Column  Dtype         
---  ------  -----         
 0   PERMNO  int64         
 1   date    datetime64[ns]
 2   SHRCD   float64       
 3   TICKER  object        
 4   COMNAM  object        
 5   PERMCO  int64         
 6   BIDLO   float64       
 7   ASKHI   float64       
 8   PRC     float64       
 9   VOL     float64       
 10  RET     float64       
 11  BID     float64       
 12  ASK     float64       
 13  SHROUT  float64       
 14  sprtrn  float64       
dtypes: datetime64[ns](1), float64(10), int64(2), object(2)
memory usage: 235.2+ MB


In [10]:
stock_data_1_cleaned.describe()

Unnamed: 0,PERMNO,date,SHRCD,PERMCO,BIDLO,ASKHI,PRC,VOL,RET,BID,ASK,SHROUT,sprtrn
count,1926738.0,1926738,1926738.0,1926738.0,1926738.0,1926738.0,1926738.0,1926738.0,1926738.0,1926738.0,1926738.0,1926738.0,1926738.0
mean,46621.52,2020-02-19 12:30:49.444605440,32.39542,41875.18,78.59034,80.30824,78.38405,1240891.0,0.0008651649,79.39028,79.55532,99728.11,0.0007987086
min,10026.0,2019-08-20 00:00:00,11.0,5.0,0.0003,0.026,-652.53,0.0,-0.878571,0.01,0.0229,9.0,-0.119841
25%,15973.0,2019-11-18 00:00:00,11.0,30492.0,8.85,9.3,8.13,20083.0,-0.011527,9.03,9.1,7000.0,-0.004011
50%,19007.0,2020-02-20 00:00:00,12.0,52127.0,21.55,22.16445,21.25,147858.0,0.000336,21.81,21.92,27471.0,0.00187
75%,86536.0,2020-05-21 00:00:00,73.0,54987.0,42.46,43.6599,42.68,698692.2,0.011823,43.01,43.11,75468.0,0.007695
max,93436.0,2020-08-20 00:00:00,75.0,57370.0,344550.0,347400.0,344970.0,1003256000.0,10.25182,345140.0,345589.0,9308301.0,0.093828
std,34335.17,,27.63045,17090.54,3485.893,3538.367,3512.855,5868059.0,0.05131279,3511.437,3516.463,336067.5,0.02085427


In [11]:
import pandas as pd

# Filter data for the start and end dates
start_date = '2020-02-14'
end_date = '2020-03-20'

# Extract prices at the start and end dates
prc_start = stock_data_1_cleaned[stock_data_1_cleaned['date'] == start_date][['PERMNO', 'PRC']].rename(columns={'PRC': 'PRC_start'})
prc_end = stock_data_1_cleaned[stock_data_1_cleaned['date'] == end_date][['PERMNO', 'PRC']].rename(columns={'PRC': 'PRC_end'})

# Merge start and end prices on PERMNO
price_change_df = pd.merge(prc_start, prc_end, on='PERMNO', how='inner')

# Create the 'price_increased' column (1 if price increased, otherwise 0)
price_change_df['price_increased'] = (price_change_df['PRC_end'] > price_change_df['PRC_start']).astype(int)

# Verify the created price_increased column
print(price_change_df.head())


   PERMNO  PRC_start  PRC_end  price_increased
0   10026     173.59   123.80                0
1   10028       2.38     2.60                1
2   10032      73.27    43.00                0
3   10044       8.48     3.77                0
4   10051      23.88    13.00                0


In [12]:
price_change_df

Unnamed: 0,PERMNO,PRC_start,PRC_end,price_increased
0,10026,173.59000,123.80,0
1,10028,2.38000,2.60,1
2,10032,73.27000,43.00,0
3,10044,8.48000,3.77,0
4,10051,23.88000,13.00,0
...,...,...,...,...
7537,93426,33.77000,17.95,0
7538,93427,64.99000,47.78,0
7539,93429,125.63000,79.69,0
7540,93434,2.27000,2.19,0


In [13]:
# Count the number of price increases
price_increases = price_change_df['price_increased'].sum()

# Count the total number of records
total_records = price_change_df['price_increased'].count()

# Print the results
print(f"Number of price increases: {price_increases}")
print(f"Total number of records: {total_records}")
print(f"Percentage of records with price increase: {price_increases / total_records * 100:.2f}%")


Number of price increases: 465
Total number of records: 7542
Percentage of records with price increase: 6.17%


In [14]:
# Merge the price_change_df with the main dataset to get the price_increased column
stock_data_1_cleaned = pd.merge(stock_data_1_cleaned, price_change_df[['PERMNO', 'price_increased']], on='PERMNO', how='left')

# Fill NaN values in price_increased with 0 (indicating no price increase)
stock_data_1_cleaned['price_increased'].fillna(0, inplace=True)

# Verify that the merge and fill are correct
print(stock_data_1_cleaned[['PERMNO', 'date', 'PRC', 'price_increased']].dropna().head())


   PERMNO       date        PRC  price_increased
0   10026 2019-08-20  191.13000              0.0
1   10026 2019-08-21  189.35001              0.0
2   10026 2019-08-22  189.32001              0.0
3   10026 2019-08-23  186.14999              0.0
4   10026 2019-08-26  191.23000              0.0


In [15]:
# Check the unique values in the 'price_increased' column
unique_values = stock_data_1_cleaned['price_increased'].unique()
print(f"Unique values in 'price_increased': {unique_values}")

# Ensure there are no NaN values in the 'price_increased' column
num_nans = stock_data_1_cleaned['price_increased'].isna().sum()
print(f"Number of NaN values in 'price_increased': {num_nans}")

# Count the number of records for each value (0 or 1) in 'price_increased'
value_counts = stock_data_1_cleaned['price_increased'].value_counts()
print("Count of each value in 'price_increased':")
print(value_counts)

# Display the first few rows to visually verify the values in 'price_increased'
print(stock_data_1_cleaned[['PERMNO', 'date', 'PRC', 'price_increased']].head())


Unique values in 'price_increased': [0. 1.]
Number of NaN values in 'price_increased': 0
Count of each value in 'price_increased':
price_increased
0.0    1814803
1.0     111935
Name: count, dtype: int64
   PERMNO       date        PRC  price_increased
0   10026 2019-08-20  191.13000              0.0
1   10026 2019-08-21  189.35001              0.0
2   10026 2019-08-22  189.32001              0.0
3   10026 2019-08-23  186.14999              0.0
4   10026 2019-08-26  191.23000              0.0


In [16]:
from sklearn.model_selection import train_test_split

# Select the relevant features
features = ['BIDLO', 'ASKHI', 'PRC', 'VOL', 'RET', 'BID', 'ASK', 'SHROUT', 'sprtrn', 'SHRCD']
X = stock_data_1_cleaned[features]
y = stock_data_1_cleaned['price_increased']

# Split the data into training and testing sets (stratify to maintain balance in target variable)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


In [17]:
from sklearn.preprocessing import StandardScaler

# Normalize the continuous features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Convert scaled arrays back to DataFrame
X_train_scaled = pd.DataFrame(X_train_scaled, columns=features, index=X_train.index)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=features, index=X_test.index)


In [18]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report

# Fit the logistic regression model
logreg = LogisticRegression(random_state=42)
logreg.fit(X_train_scaled, y_train)

# Make predictions on the test set
y_pred = logreg.predict(X_test_scaled)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.9416916657151458
Precision: 0.29901960784313725
Recall: 0.0027247956403269754

Classification Report:
               precision    recall  f1-score   support

         0.0       0.94      1.00      0.97    362961
         1.0       0.30      0.00      0.01     22387

    accuracy                           0.94    385348
   macro avg       0.62      0.50      0.49    385348
weighted avg       0.90      0.94      0.91    385348



In [19]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report

def evaluate_model(y_test, y_pred):
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    
    print(f'Accuracy: {accuracy}')
    print(f'Precision: {precision}')
    print(f'Recall: {recall}')
    print("\nClassification Report:\n", classification_report(y_test, y_pred))


In [20]:
from sklearn.linear_model import LogisticRegression

# Logistic Regression with regularization
logreg = LogisticRegression(random_state=42, solver='liblinear')
logreg.fit(X_train_scaled, y_train)
y_pred_logreg = logreg.predict(X_test_scaled)

# Evaluate the Logistic Regression model
print("Logistic Regression:")
evaluate_model(y_test, y_pred_logreg)


Logistic Regression:
Accuracy: 0.9416916657151458
Precision: 0.29901960784313725
Recall: 0.0027247956403269754

Classification Report:
               precision    recall  f1-score   support

         0.0       0.94      1.00      0.97    362961
         1.0       0.30      0.00      0.01     22387

    accuracy                           0.94    385348
   macro avg       0.62      0.50      0.49    385348
weighted avg       0.90      0.94      0.91    385348



In [21]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report

# Assume 'filtered_data' is your data after processing
# Sample 10% of the data
sample_data = stock_data_1_cleaned.sample(frac=0.1, random_state=42)

# Select features and target
features = ['BIDLO', 'ASKHI', 'PRC', 'VOL', 'RET', 'BID', 'ASK', 'SHROUT', 'sprtrn', 'SHRCD']
X_sample = sample_data[features]
y_sample = sample_data['price_increased']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_sample, y_sample, test_size=0.2, random_state=42, stratify=y_sample)

# Normalize the continuous features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Convert scaled arrays back to DataFrame
X_train_scaled = pd.DataFrame(X_train_scaled, columns=features, index=X_train.index)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=features, index=X_test.index)


In [22]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report

# Function to evaluate models
def evaluate_model(model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    
    print(f'Model: {model.__class__.__name__}')
    print(f'Accuracy: {accuracy}')
    print(f'Precision: {precision}')
    print(f'Recall: {recall}')
    print("\nClassification Report:\n", classification_report(y_test, y_pred))
    print('-' * 50)


In [23]:
# Run Random Forest Classifier
rf = RandomForestClassifier(random_state=42, n_jobs=-1)
evaluate_model(rf, X_train_scaled, y_train, X_test_scaled, y_test)


Model: RandomForestClassifier
Accuracy: 0.956455170624108
Precision: 0.8404384896467723
Recall: 0.30844881537773805

Classification Report:
               precision    recall  f1-score   support

         0.0       0.96      1.00      0.98     36298
         1.0       0.84      0.31      0.45      2237

    accuracy                           0.96     38535
   macro avg       0.90      0.65      0.71     38535
weighted avg       0.95      0.96      0.95     38535

--------------------------------------------------


In [24]:
# Run Gradient Boosting Classifier
gb = GradientBoostingClassifier(random_state=42)
evaluate_model(gb, X_train_scaled, y_train, X_test_scaled, y_test)


Model: GradientBoostingClassifier
Accuracy: 0.9451926819774231
Precision: 0.6971608832807571
Recall: 0.09879302637460885

Classification Report:
               precision    recall  f1-score   support

         0.0       0.95      1.00      0.97     36298
         1.0       0.70      0.10      0.17      2237

    accuracy                           0.95     38535
   macro avg       0.82      0.55      0.57     38535
weighted avg       0.93      0.95      0.93     38535

--------------------------------------------------


In [25]:
# Run K-Nearest Neighbors
knn = KNeighborsClassifier(n_jobs=-1)
evaluate_model(knn, X_train_scaled, y_train, X_test_scaled, y_test)


Model: KNeighborsClassifier
Accuracy: 0.9405475541715324
Precision: 0.43601895734597157
Recall: 0.08225301743406348

Classification Report:
               precision    recall  f1-score   support

         0.0       0.95      0.99      0.97     36298
         1.0       0.44      0.08      0.14      2237

    accuracy                           0.94     38535
   macro avg       0.69      0.54      0.55     38535
weighted avg       0.92      0.94      0.92     38535

--------------------------------------------------


In [26]:
from sklearn.svm import SVC

# SVM with linear kernel and limited iterations
svm = SVC(kernel='linear', max_iter=1000, random_state=42)
evaluate_model(svm, X_train_scaled, y_train, X_test_scaled, y_test)




Model: SVC
Accuracy: 0.1444401193719995
Precision: 0.058219768845954804
Recall: 0.9052302190433617

Classification Report:
               precision    recall  f1-score   support

         0.0       0.94      0.10      0.18     36298
         1.0       0.06      0.91      0.11      2237

    accuracy                           0.14     38535
   macro avg       0.50      0.50      0.14     38535
weighted avg       0.89      0.14      0.17     38535

--------------------------------------------------


In [30]:
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd

# Assume you have your filtered_data ready from previous steps

# Use a smaller sample of data for quick testing
sampled_data = stock_data_1_cleaned.sample(n=10000, random_state=42)

# Split the sample data into features and target
X_sample = sampled_data[features]
y_sample = sampled_data['price_increased']

# Split into training and testing sets
X_train_sample, X_test_sample, y_train_sample, y_test_sample = train_test_split(X_sample, y_sample, test_size=0.2, random_state=42, stratify=y_sample)

# Normalize the continuous features
scaler = StandardScaler()
X_train_sample_scaled = scaler.fit_transform(X_train_sample)
X_test_sample_scaled = scaler.transform(X_test_sample)


In [33]:
# Update the models to include class weights
logreg = LogisticRegression(random_state=42, class_weight='balanced')
rf = RandomForestClassifier(random_state=42, class_weight='balanced')
knn = KNeighborsClassifier()  # KNN does not have class weights
svc = SVC(probability=True, random_state=42, class_weight='balanced')

# Create a Voting Classifier with soft voting and class-weighted models
voting_clf = VotingClassifier(
    estimators=[('lr', logreg), ('rf', rf), ('knn', knn), ('svc', svc)],
    voting='soft'
)

# Fit the Voting Classifier on the training data
voting_clf.fit(X_train_sample_scaled, y_train_sample)

# Make predictions on the test set
y_pred_voting = voting_clf.predict(X_test_sample_scaled)

# Evaluate the model
accuracy = accuracy_score(y_test_sample, y_pred_voting)
precision = precision_score(y_test_sample, y_pred_voting)
recall = recall_score(y_test_sample, y_pred_voting)

print(f'Ensemble Voting Classifier with Class Weights - Accuracy: {accuracy}')
print(f'Ensemble Voting Classifier with Class Weights - Precision: {precision}')
print(f'Ensemble Voting Classifier with Class Weights - Recall: {recall}')
print("\nClassification Report:\n", classification_report(y_test_sample, y_pred_voting))


Ensemble Voting Classifier with Class Weights - Accuracy: 0.9475
Ensemble Voting Classifier with Class Weights - Precision: 0.5
Ensemble Voting Classifier with Class Weights - Recall: 0.06666666666666667

Classification Report:
               precision    recall  f1-score   support

         0.0       0.95      1.00      0.97      1895
         1.0       0.50      0.07      0.12       105

    accuracy                           0.95      2000
   macro avg       0.73      0.53      0.55      2000
weighted avg       0.93      0.95      0.93      2000



In [34]:
from imblearn.over_sampling import SMOTE

# Apply SMOTE to the training data
smote = SMOTE(random_state=42)
X_train_sample_resampled, y_train_sample_resampled = smote.fit_resample(X_train_sample_scaled, y_train_sample)

# Fit the Voting Classifier on the resampled training data
voting_clf.fit(X_train_sample_resampled, y_train_sample_resampled)

# Make predictions on the test set
y_pred_voting = voting_clf.predict(X_test_sample_scaled)

# Evaluate the model
accuracy = accuracy_score(y_test_sample, y_pred_voting)
precision = precision_score(y_test_sample, y_pred_voting)
recall = recall_score(y_test_sample, y_pred_voting)

print(f'Ensemble Voting Classifier with SMOTE - Accuracy: {accuracy}')
print(f'Ensemble Voting Classifier with SMOTE - Precision: {precision}')
print(f'Ensemble Voting Classifier with SMOTE - Recall: {recall}')
print("\nClassification Report:\n", classification_report(y_test_sample, y_pred_voting))


Ensemble Voting Classifier with SMOTE - Accuracy: 0.838
Ensemble Voting Classifier with SMOTE - Precision: 0.15457413249211358
Ensemble Voting Classifier with SMOTE - Recall: 0.4666666666666667

Classification Report:
               precision    recall  f1-score   support

         0.0       0.97      0.86      0.91      1895
         1.0       0.15      0.47      0.23       105

    accuracy                           0.84      2000
   macro avg       0.56      0.66      0.57      2000
weighted avg       0.92      0.84      0.87      2000



In [35]:
import pandas as pd

# Create a DataFrame to store the metrics
model_metrics = pd.DataFrame({
    'Model': [
        'Logistic Regression', 
        'Random Forest', 
        'Gradient Boosting', 
        'K-Nearest Neighbors', 
        'Support Vector Machine',
        'Voting Classifier',
        'Voting Classifier with Class Weights',
        'Voting Classifier with SMOTE'
    ],
    'Accuracy': [
        0.9417,  # Logistic Regression
        0.9565,  # Random Forest
        0.9454,  # Gradient Boosting
        0.9405,  # K-Nearest Neighbors
        0.1444,  # Support Vector Machine
        0.9475,  # Voting Classifier
        0.9475,  # Voting Classifier with Class Weights
        0.8380   # Voting Classifier with SMOTE
    ],
    'Precision': [
        0.2990,  # Logistic Regression
        0.8404,  # Random Forest
        0.6863,  # Gradient Boosting
        0.4360,  # K-Nearest Neighbors
        0.0582,  # Support Vector Machine
        0.5000,  # Voting Classifier
        0.5000,  # Voting Classifier with Class Weights
        0.1546   # Voting Classifier with SMOTE
    ],
    'Recall': [
        0.0027,  # Logistic Regression
        0.3084,  # Random Forest
        0.1121,  # Gradient Boosting
        0.0823,  # K-Nearest Neighbors
        0.9052,  # Support Vector Machine
        0.0667,  # Voting Classifier
        0.0667,  # Voting Classifier with Class Weights
        0.4667   # Voting Classifier with SMOTE
    ]
})

# Display the table
print(model_metrics)


                                  Model  Accuracy  Precision  Recall
0                   Logistic Regression    0.9417     0.2990  0.0027
1                         Random Forest    0.9565     0.8404  0.3084
2                     Gradient Boosting    0.9454     0.6863  0.1121
3                   K-Nearest Neighbors    0.9405     0.4360  0.0823
4                Support Vector Machine    0.1444     0.0582  0.9052
5                     Voting Classifier    0.9475     0.5000  0.0667
6  Voting Classifier with Class Weights    0.9475     0.5000  0.0667
7          Voting Classifier with SMOTE    0.8380     0.1546  0.4667
