### Import libraries

In [1]:
# Optional one-time setup: uncomment the line below to install the notebook's
# dependencies. NOTE(review): consider `%pip install` instead of `!pip install`
# so the packages are installed into the running kernel's environment.
#!pip install numpy pandas scikit-learn seaborn matplotlib

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Preprocess libs
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
# Model
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

#Evaluation
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

### 1. Load Dataset and summarize it

In [None]:
# Load the dataset (Change 'dataset.csv' to your actual file)
df = pd.read_csv("dataset.csv")

# Display the first 5 rows
print("First 5 rows of the dataset:\n", df.head())

# Dataset information (data types, missing values, etc.)
# df.info() writes its report itself and returns None, so it must not be
# wrapped in print() -- that would append a stray "None" line to the output.
print("\nDataset Info:")
df.info()

# Summary statistics (Numerical columns)
print("\nSummary Statistics:")
print(df.describe())

# Checking for missing values
print("\nMissing values in each column:\n", df.isnull().sum())

# Correlation heatmap (for numerical features)
# Correlation is only defined for numerical data. Since pandas 2.0,
# DataFrame.corr() no longer drops text columns silently and raises instead,
# so the numerical (and boolean) columns are selected explicitly first.
numeric_df = df.select_dtypes(include=["number", "bool"])
if not numeric_df.columns.empty:
    plt.figure(figsize=(10,6))
    sns.heatmap(numeric_df.corr(), annot=True, cmap="coolwarm")
    plt.title("Feature Correlation Heatmap")
    plt.show()
else:
    # Nothing to correlate; an empty matrix would make the heatmap call fail.
    print("\nNo numerical columns found - skipping correlation heatmap.")


### 2. Preprocess data (Missing values and scaling)

In [None]:
# Name of the column to predict (Change 'target' to your actual target column name)
TARGET_COL = "target"

# Handle missing values
# Numerical columns: fill with the column mean. numeric_only=True is needed
# because pandas >= 2.0 raises a TypeError when asked to average text columns.
df = df.fillna(df.mean(numeric_only=True))

# Remaining (categorical) columns: fill with the most frequent value.
# df.mode() has no rows when there is no data to count, so guard the lookup
# of its first row instead of failing with an IndexError.
column_modes = df.mode()
if not column_modes.empty:
    df = df.fillna(column_modes.iloc[0])

# Encode categorical columns
# Each fitted encoder is kept (keyed by column name) so the same
# string -> integer mapping can be reused or inverted later; previously only
# the encoder of the last column survived the loop.
label_encoders = {}
for col in df.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Define Features (X) and Target (y)
X = df.drop(TARGET_COL, axis=1)
y = df[TARGET_COL]

# Split into training and testing datasets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature Scaling (for better model performance)
# The scaler is fitted on the training split only and then applied to the
# test split, so test-set statistics do not influence the scaling.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

print("\nData Preprocessing Completed ✅")

### 3. Model training

In [None]:
# Pick the estimator from the number of distinct target values:
# at most 10 distinct values -> treat the target as class labels,
# more than 10               -> treat it as a continuous quantity.
if y.nunique() <= 10:
    print("\nApplying Classification Model")
    model = LogisticRegression()
else:
    print("\nApplying Regression Model")
    model = LinearRegression()

# Fit the chosen estimator on the training split
model.fit(X_train, y_train)

print("\nModel Training Completed ✅")

### 4. Model evaluation

In [None]:
# Make predictions
y_pred = model.predict(X_test)

# Evaluate performance
# The metric family is chosen from the estimator that was actually trained
# rather than by re-deriving the decision from the target column. This keeps
# the report consistent with `model` even if a different estimator (e.g. one
# of the tree or random-forest models imported above) is substituted.
is_regression_model = isinstance(
    model, (LinearRegression, DecisionTreeRegressor, RandomForestRegressor)
)

if is_regression_model:  # Regression Evaluation
    print("\nRegression Model Evaluation:")
    print("Mean Absolute Error:", mean_absolute_error(y_test, y_pred))
    print("Mean Squared Error:", mean_squared_error(y_test, y_pred))
    print("R² Score:", r2_score(y_test, y_pred))
else:  # Classification Evaluation
    print("\nClassification Model Evaluation:")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("\nClassification Report:\n", classification_report(y_test, y_pred))

### 5. Inference