In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt


# Load your dataset (replace 'data.csv' with your dataset file)
data = pd.read_csv('newestest_itineraries4.csv')

# Extract the feature labels from the first row
feature_labels = data.columns[1:].values  # Exclude the first column (target) for feature labels

# Set the first column as the target variable
X = data.iloc[:, 1:]  # Exclude the first column (target) for features
y = data.iloc[:, 0]  # Use the first column as the target variable

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create an initial model (you can choose a different model based on your dataset)
model = RandomForestClassifier()

# Store the original feature list
remaining_features = list(X.columns)

# Define variables to store feature rankings and their corresponding scores
feature_rankings = {}
best_features = []

# Set the number of features you want to select (in this case, 10)
num_features_to_select = 10

while len(remaining_features) > num_features_to_select:
    scores = []

    # Train and evaluate the model with each remaining feature removed one at a time
    for feature in remaining_features:
        features_to_use = best_features + [feature]

        # Fit the model with the selected features
        model.fit(X_train[features_to_use], y_train)

        # Evaluate the model on the test set
        score = model.score(X_test[features_to_use], y_test)
        scores.append((feature, score))

    # Find the feature with the lowest score (least important)
    worst_feature, worst_score = min(scores, key=lambda x: x[1])

    # Remove the least important feature from the remaining features
    remaining_features.remove(worst_feature)

    # Store the feature ranking
    feature_rankings[worst_feature] = worst_score

    # Store the best features so far
    best_features = [f for f in best_features if f != worst_feature]

# Select the top N features based on their scores
selected_features = list(sorted(feature_rankings, key=feature_rankings.get, reverse=True)[:num_features_to_select])

print("Selected features:", selected_features)




# Sort the feature rankings by importance score
sorted_feature_rankings = {k: v for k, v in sorted(feature_rankings.items(), key=lambda item: item[1])}

# Create a bar plot
plt.figure(figsize=(10, 6))
plt.barh(range(len(sorted_feature_rankings)), list(sorted_feature_rankings.values()), align='center')
plt.yticks(range(len(sorted_feature_rankings)), list(sorted_feature_rankings.keys()))
plt.xlabel('Model Score (Higher is better)')
plt.title('Feature Importance Scores in Backward Feature Selection')
plt.show()



ValueError: Unknown label type: continuous. Maybe you are trying to fit a classifier, which expects discrete classes on a regression target with continuous values.