In [16]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import os

In [17]:
# Step 1: Load dataset
# Read the feature table from a path relative to the notebook's working directory.
df = pd.read_csv(
    filepath_or_buffer="../Data/company_top4_features.csv",
)

In [18]:
# Step 2: Separate features and target
# The target is the 'status_label' column; every remaining column is a feature.
y = df['status_label']
X = df.drop('status_label', axis=1)

In [19]:
# Step 3: Feature scaling
# NOTE: the scaler's statistics are computed over every row of X, i.e. before
# any train/test split has been made.
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

# Wrap the scaled array in a DataFrame again so the original column names
# survive when the data is written to CSV.
X_scaled_df = pd.DataFrame(data=X_scaled, columns=X.columns)

In [20]:
# Step 4: Train-test split
# The four split variables written below are not created by any earlier cell
# of this pass (the steps jump from 3 to 5), so on a fresh kernel this cell
# would stop with a NameError. The split is restored here. test_size=0.2
# matches the 2400/600 row counts in the recorded output; stratify and
# random_state mirror the Step 7 split used later in this notebook.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled_df, y, test_size=0.2, stratify=y, random_state=42
)

# Step 5: Save all splits to CSVs
output_dir = "../Data/preprocessed"
os.makedirs(output_dir, exist_ok=True)

# index=False keeps the row index out of the files, so each CSV holds only
# the feature / target columns.
X_train.to_csv(os.path.join(output_dir, "X_train.csv"), index=False)
X_test.to_csv(os.path.join(output_dir, "X_test.csv"), index=False)
y_train.to_csv(os.path.join(output_dir, "y_train.csv"), index=False)
y_test.to_csv(os.path.join(output_dir, "y_test.csv"), index=False)

In [21]:
# Step 6: Print status
# Confirm completion, then report the shape of each split in a fixed order.
print("✅ Preprocessing complete and saved as CSV.")
for split_name, split in (
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print(f"🔹 {split_name} shape: {split.shape}")

✅ Preprocessing complete and saved as CSV.
🔹 X_train shape: (2400, 4)
🔹 X_test shape: (600, 4)
🔹 y_train shape: (2400,)
🔹 y_test shape: (600,)


In [22]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import os
import joblib

In [23]:
# Step 1: Load dataset
# Re-read the feature table from disk so this pass starts from the raw file.
source_csv = "../Data/company_top4_features.csv"
df = pd.read_csv(source_csv)

In [24]:
# Step 2: Drop duplicates
# Keep the first occurrence of each fully identical row (pandas default).
df = df.drop_duplicates()

In [25]:
# Step 3: Check and fill missing values
if df.isnull().values.any():
    print("❗ Missing values found. Filling with mean.")
    # numeric_only=True restricts the means to numeric columns. Without it,
    # pandas >= 2.0 raises a TypeError as soon as the frame holds a
    # non-numeric column, so this branch would crash exactly when needed.
    # NOTE(review): missing values in non-numeric columns (possibly
    # 'status_label' - dtype not visible here) are left as NaN; decide
    # separately how those rows should be handled.
    column_means = df.mean(numeric_only=True)
    df = df.fillna(column_means)
else:
    print("✅ No missing values.")

✅ No missing values.


In [26]:
# Step 4: Shuffle the dataset
# Sampling 100% of the rows without replacement is a permutation; the fixed
# seed makes the order reproducible. The old index is discarded so rows are
# renumbered 0..n-1 in their new order.
shuffled_rows = df.sample(frac=1, random_state=42)
df = shuffled_rows.reset_index(drop=True)

In [27]:
# Step 5: Separate features and target
# 'status_label' is the target; all other columns form the feature matrix.
target_column = 'status_label'
y = df[target_column]
X = df.drop(target_column, axis=1)

In [28]:
# Step 6: Scale features
# NOTE: the scaler is fitted on every row of X here, before the train/test
# split is made.
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)


In [29]:
# Step 7: Train-test split
# Split the *unscaled* features so that scaling statistics can be learned
# from the training rows alone. Stratifying on y keeps the class proportions
# the same in both partitions; the seed makes the split reproducible.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Step 6 fitted the scaler on the full dataset, which leaks test-set mean and
# variance into the training features and into the saved scaler. Refit on the
# training partition only and apply that same transform to both partitions.
# `scaler` is rebound, so the scaler saved in Step 9 is the train-only one.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)


In [30]:
# Step 8: Save as CSV (optional: for later inspection)
os.makedirs("../Data/preprocessed", exist_ok=True)

# Map each destination file to the frame written there. The feature arrays
# get the original column names back; the targets are wrapped in a
# single-column frame. Dict order preserves the original write order.
frames_by_path = {
    "../Data/preprocessed/X_train.csv": pd.DataFrame(X_train, columns=X.columns),
    "../Data/preprocessed/X_test.csv": pd.DataFrame(X_test, columns=X.columns),
    "../Data/preprocessed/y_train.csv": pd.DataFrame(y_train),
    "../Data/preprocessed/y_test.csv": pd.DataFrame(y_test),
}
for csv_path, frame in frames_by_path.items():
    # index=False: do not write the row index as an extra column.
    frame.to_csv(csv_path, index=False)

print("✅ Preprocessed CSV files saved in: Data/preprocessed/")

✅ Preprocessed CSV files saved in: Data/preprocessed/


In [31]:
# Step 9: Save the scaler so app.py can load it
# Create the target directory if needed, then serialise the fitted scaler
# object with joblib.
os.makedirs(name="../App/model", exist_ok=True)
joblib.dump(value=scaler, filename="../App/model/scaler.pkl")
print("✅ scaler.pkl saved in App/model/")

✅ scaler.pkl saved in App/model/
