In [17]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score

# Load and preprocess the data
def load_and_preprocess_data(csv_file):
    # Load the dataset
    df = pd.read_csv(csv_file)
    df = df.drop('path_to_video', axis=1)

    # Check for missing values and handle them
    df.fillna(0, inplace=True)  # Replace missing values with 0 (or another suitable method)

    # Feature columns (all except 'label' column)
    features = df.drop('label', axis=1)

    # Target column
    target = df['label']

    # Encode labels (if 'label' is categorical)
    label_encoder = LabelEncoder()
    target = label_encoder.fit_transform(target)

    # Feature scaling
    scaler = StandardScaler()
    features_scaled = scaler.fit_transform(features)

    return features_scaled, target

# Define a simple neural network model
class NeuralNetwork(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(NeuralNetwork, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)  # Input to hidden layer
        self.fc2 = nn.Linear(hidden_size, output_size)  # Hidden to output layer
        self.relu = nn.ReLU()  # Activation function
        self.softmax = nn.Softmax(dim=1)  # Softmax for multi-class classification

    def forward(self, x):
        x = self.fc1(x)  # Pass input through first layer
        x = self.relu(x)  # Apply activation function
        x = self.fc2(x)  # Pass through second layer
        x = self.softmax(x)  # Apply softmax to get probabilities
        return x

# Train and evaluate the neural network
def train_and_evaluate_neural_network(X_train, X_test, y_train, y_test, input_size, output_size):
    # Convert to PyTorch tensors
    train_data = TensorDataset(torch.tensor(X_train, dtype=torch.float32), torch.tensor(y_train, dtype=torch.long))
    test_data = TensorDataset(torch.tensor(X_test, dtype=torch.float32), torch.tensor(y_test, dtype=torch.long))

    # DataLoader for batching
    train_loader = DataLoader(train_data, batch_size=64, shuffle=True)
    test_loader = DataLoader(test_data, batch_size=64, shuffle=False)

    # Model initialization
    model = NeuralNetwork(input_size=input_size, hidden_size=64, output_size=output_size)

    # Loss function and optimizer
    criterion = nn.CrossEntropyLoss()  # For multi-class classification
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    # Training loop
    for epoch in range(3000):  # You can increase the number of epochs for better results
        model.train()
        running_loss = 0.0
        for inputs, labels in train_loader:
            # Zero the gradients
            optimizer.zero_grad()

            # Forward pass
            outputs = model(inputs)
            loss = criterion(outputs, labels)

            # Backward pass
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

        # Print the training loss for every epoch
        print(f"Epoch {epoch+1}, Loss: {running_loss / len(train_loader)}")

    # Evaluation
    model.eval()
    y_pred = []
    y_true = []
    with torch.no_grad():
        for inputs, labels in test_loader:
            outputs = model(inputs)
            _, predicted = torch.max(outputs, 1)
            y_pred.extend(predicted.numpy())
            y_true.extend(labels.numpy())

    # Calculate accuracy
    accuracy = accuracy_score(y_true, y_pred)
    print(f"Accuracy: {accuracy * 100:.2f}%")

# Main function
def main(csv_file):
    # Load and preprocess the data
    features_scaled, target = load_and_preprocess_data(csv_file)

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(features_scaled, target, test_size=0.2, random_state=42)

    # Define input and output sizes
    input_size = X_train.shape[1]  # Number of features
    output_size = len(set(y_train))  # Number of unique labels

    # Train and evaluate the neural network
    train_and_evaluate_neural_network(X_train, X_test, y_train, y_test, input_size, output_size)

if __name__ == "__main__":
    # Take the CSV filename as input
    csv_file = "d2_3_model_dataset.csv"  # Change to your file path
    main(csv_file)


Epoch 1, Loss: 2.5637348095575967
Epoch 2, Loss: 2.5615748365720115
Epoch 3, Loss: 2.5590114990870156
Epoch 4, Loss: 2.5563306411107383
Epoch 5, Loss: 2.552879750728607
Epoch 6, Loss: 2.548805356025696
Epoch 7, Loss: 2.544330875078837
Epoch 8, Loss: 2.5395086208979287
Epoch 9, Loss: 2.5343643029530845
Epoch 10, Loss: 2.529638727506002
Epoch 11, Loss: 2.5248722235361734
Epoch 12, Loss: 2.520796259244283
Epoch 13, Loss: 2.517789642016093
Epoch 14, Loss: 2.5151509443918862
Epoch 15, Loss: 2.513604243596395
Epoch 16, Loss: 2.513405442237854
Epoch 17, Loss: 2.512046496073405
Epoch 18, Loss: 2.511172612508138
Epoch 19, Loss: 2.5105121731758118
Epoch 20, Loss: 2.5102263490358987
Epoch 21, Loss: 2.5101393858591714
Epoch 22, Loss: 2.509417454401652
Epoch 23, Loss: 2.5097548961639404
Epoch 24, Loss: 2.5089226762453714
Epoch 25, Loss: 2.509220023949941
Epoch 26, Loss: 2.5088528593381247
Epoch 27, Loss: 2.5086361368497214
Epoch 28, Loss: 2.5088334878285727
Epoch 29, Loss: 2.5082243283589682
Epoch 

In [20]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

def load_and_preprocess_data(csv_file):
    # Load the dataset
    df = pd.read_csv(csv_file)
    df = df.drop('path_to_video', axis=1)

    # Check for missing values and handle them
    df.fillna(0, inplace=True)  # Replace missing values with 0 (or another suitable method)

    # Feature columns (all except 'label' column)
    features = df.drop('label', axis=1)

    # Target column
    target = df['label']

    # Encode labels (if 'label' is categorical)
    label_encoder = LabelEncoder()
    target = label_encoder.fit_transform(target)

    # Feature scaling
    scaler = StandardScaler()
    features_scaled = scaler.fit_transform(features)

    return features_scaled, target

def train_and_evaluate_models(X_train, X_test, y_train, y_test):
    models = {
        'Logistic Regression': LogisticRegression(),
        'Random Forest': RandomForestClassifier(),
        'SVM': SVC(),
        'KNN': KNeighborsClassifier(),
        'XGBoost': XGBClassifier(),
        'Bagging (Logistic Regression)': BaggingClassifier(estimator=LogisticRegression()),
        'Bagging (Random Forest)': BaggingClassifier(estimator=RandomForestClassifier()),
        'AdaBoost (Logistic Regression)': AdaBoostClassifier(estimator=LogisticRegression()),
        'AdaBoost (Random Forest)': AdaBoostClassifier(estimator=RandomForestClassifier()),
        'Gradient Boosting': GradientBoostingClassifier()
    }

    for model_name, model in models.items():
        print(f"Training {model_name}...")
        # Train the model
        model.fit(X_train, y_train)
        
        # Make predictions
        y_pred = model.predict(X_test)
        
        # Evaluate and print accuracy
        accuracy = accuracy_score(y_test, y_pred)
        print(f"{model_name} Accuracy: {accuracy:.4f}")

def main(csv_file):
    # Load and preprocess the data
    features_scaled, target = load_and_preprocess_data(csv_file)

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(features_scaled, target, test_size=0.2, random_state=42)

    # Train and evaluate all models
    train_and_evaluate_models(X_train, X_test, y_train, y_test)

if __name__ == "__main__":
    # Take the CSV filename as input
    csv_file = "d2_model_dataset.csv"
    main(csv_file)


Training Logistic Regression...
Logistic Regression Accuracy: 0.5053
Training Random Forest...
Random Forest Accuracy: 0.5158
Training SVM...
SVM Accuracy: 0.5158
Training KNN...
KNN Accuracy: 0.4579
Training XGBoost...
XGBoost Accuracy: 0.5053
Training Bagging (Logistic Regression)...
Bagging (Logistic Regression) Accuracy: 0.5053
Training Bagging (Random Forest)...
Bagging (Random Forest) Accuracy: 0.5316
Training AdaBoost (Logistic Regression)...
AdaBoost (Logistic Regression) Accuracy: 0.5421
Training AdaBoost (Random Forest)...




AdaBoost (Random Forest) Accuracy: 0.5000
Training Gradient Boosting...
Gradient Boosting Accuracy: 0.4947
