In [6]:
import pandas as pd
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

# Synthesize a reproducible binary-classification dataset:
# 1000 rows, 20 columns (5 informative, 2 redundant, none repeated).
raw_features, raw_labels = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=5,
    n_redundant=2,
    n_repeated=0,
    n_classes=2,
    random_state=42,
)

# Wrap the arrays in pandas objects so columns can be addressed by name
feature_names = [f'feature_{idx}' for idx in range(raw_features.shape[1])]
X = pd.DataFrame(raw_features, columns=feature_names)
y = pd.Series(raw_labels)

# Hold out 30% of the rows for evaluation (fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

In [7]:
# Inspect the feature DataFrame; as the last expression in the cell it is rendered as the cell output.
X

Unnamed: 0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,feature_11,feature_12,feature_13,feature_14,feature_15,feature_16,feature_17,feature_18,feature_19
0,0.529210,-0.972022,-0.295516,0.672895,-2.140412,0.519489,-1.317799,1.759005,0.162605,1.447476,-0.325251,-0.282609,-1.955408,0.798477,0.824515,-2.090719,-0.334851,-0.603923,0.727371,-1.852710
1,0.535595,0.966618,0.170555,0.410832,0.537282,-0.967972,1.738992,0.418544,1.567679,0.871440,-0.958516,-0.511782,0.508186,0.193465,-0.177357,0.670755,-0.716473,-1.937975,-1.910503,0.748482
2,1.767060,0.327039,-0.303488,-0.925785,2.192353,0.244383,1.371141,0.443986,-0.042151,2.027416,-1.320659,0.567247,1.170551,-0.973415,0.411144,0.432581,0.889554,-0.280509,-0.926557,0.033526
3,2.853623,-0.235000,-0.944577,1.283783,0.496723,-0.474935,2.168761,-0.034107,1.901865,-0.016975,-2.504227,-0.652267,0.561371,0.957518,-0.285188,3.088123,2.601770,0.283617,-3.102956,-0.108371
4,0.810864,0.291173,0.937615,-1.167519,-0.790165,-1.592483,-1.024950,0.350575,1.191931,1.027453,-2.051382,1.319422,1.769745,-0.381271,0.901685,3.684520,0.520599,-0.150470,-5.084620,-0.302753
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,-0.088668,-0.810558,-2.242222,-0.265867,-1.142734,0.723973,-0.967865,-0.058947,0.120655,2.589444,-1.403526,0.898903,0.419401,0.813636,-0.231163,0.563229,0.241741,0.047034,-2.095307,-0.652022
996,-1.253805,0.521450,0.003975,0.305564,2.387438,0.456502,-1.573914,0.062260,-0.629057,-0.915646,0.796963,-0.250560,1.291395,-1.476464,1.092103,0.406067,-0.857739,-0.812465,1.180697,0.196742
997,-0.188366,-1.141359,-0.115483,-2.014538,0.325390,-0.120447,1.212384,-1.444330,0.098030,-1.963558,2.697508,-1.385534,1.170583,0.744590,-0.742280,-0.049831,2.045893,-0.995350,3.145994,-0.716169
998,-0.141926,-1.354691,-0.877001,-1.166829,-0.908797,-1.020155,1.327971,-0.537510,-0.429582,3.444892,1.554090,-0.715426,1.866809,2.135082,-0.578387,-1.478774,-0.902250,-0.522308,0.971001,-1.358010


In [8]:
from sklearn.ensemble import RandomForestClassifier

# Baseline: fit a 100-tree random forest on every available training column.
# `fit` returns the estimator itself, so construction and training can be chained.
rf = RandomForestClassifier(random_state=42, n_estimators=100).fit(X_train, y_train)

# Mean accuracy on the held-out split, kept for comparison with the reduced model
accuracy_before = rf.score(X_test, y_test)
print(f'Accuracy before feature selection: {accuracy_before:.2f}')

Accuracy before feature selection: 0.92


In [9]:
# Number of top-ranked features to keep; change this single value to try a different subset size.
N_TOP_FEATURES = 10

# Extract feature importances from the fitted forest.
# Column names are read from X_train (the frame `rf` was fit on) so that each
# name is guaranteed to line up with its importance value.
importances = rf.feature_importances_
feature_names = X_train.columns

# Build the (Feature, Importance) table and rank it, most important first.
# Done in one expression so re-running the cell always starts from a fresh table.
feature_importance_df = (
    pd.DataFrame({'Feature': feature_names, 'Importance': importances})
    .sort_values(by='Importance', ascending=False)
)
print(feature_importance_df)

# Select the top N features. `.head` is explicitly positional, which avoids any
# label-vs-position ambiguity of slicing a Series that carries an integer index.
top_features = feature_importance_df['Feature'].head(N_TOP_FEATURES).to_numpy()
X_train_selected = X_train[top_features]
X_test_selected = X_test[top_features]

       Feature  Importance
10  feature_10    0.166347
18  feature_18    0.129780
9    feature_9    0.127592
15  feature_15    0.116865
4    feature_4    0.113428
12  feature_12    0.059363
1    feature_1    0.051482
14  feature_14    0.020885
3    feature_3    0.020203
11  feature_11    0.019620
2    feature_2    0.019236
17  feature_17    0.018607
5    feature_5    0.018271
6    feature_6    0.018121
7    feature_7    0.017843
8    feature_8    0.017514
0    feature_0    0.017097
16  feature_16    0.016739
13  feature_13    0.015980
19  feature_19    0.015027


In [12]:
# Inspect the array of selected feature names.
top_features

array(['feature_10', 'feature_18', 'feature_9', 'feature_15', 'feature_4',
       'feature_12', 'feature_1', 'feature_14', 'feature_3', 'feature_11'],
      dtype=object)

In [13]:
# Preview the training rows restricted to the columns listed in top_features.
X_train[top_features]

Unnamed: 0,feature_10,feature_18,feature_9,feature_15,feature_4,feature_12,feature_1,feature_14,feature_3,feature_11
541,-0.739524,1.031743,0.173312,-1.855734,-1.477693,-2.683307,-0.652959,0.041563,-1.988969,-1.603116
440,1.377042,-0.283979,3.528384,-0.553405,1.144083,3.335930,0.274861,-0.154330,0.661236,0.534377
482,2.606370,2.060033,-1.543824,-2.044703,1.176270,0.146618,2.259127,0.123062,-1.249804,-1.519634
422,1.278315,1.294314,0.293030,-0.988450,1.542430,1.020542,0.805313,0.488095,-1.390894,-1.268144
778,1.127383,-0.285146,2.943044,-0.872933,-0.455173,1.930541,-0.172349,2.197484,0.290275,-0.431749
...,...,...,...,...,...,...,...,...,...,...
106,1.513959,1.049826,-1.457607,-1.273247,0.816399,-0.249132,1.932263,0.570282,-1.919676,0.845043
270,-1.655109,3.540631,-4.341477,-2.350081,-2.924345,-7.298063,-1.891326,-1.331604,0.617407,1.213015
860,1.616697,-3.828748,4.781310,-1.541705,-0.564641,3.190292,3.638629,-0.582887,-1.290523,0.085957
435,3.258781,6.239423,-0.629061,0.573923,2.358042,3.435767,-4.466020,2.541641,-0.002571,0.156313


In [10]:
# Refit an identically configured forest, this time on the reduced column set.
# `fit` returns the estimator itself, so construction and training can be chained.
rf_selected = RandomForestClassifier(random_state=42, n_estimators=100).fit(
    X_train_selected, y_train
)

# Mean accuracy on the held-out split, restricted to the same selected columns
accuracy_after = rf_selected.score(X_test_selected, y_test)
print(f'Accuracy after feature selection: {accuracy_after:.2f}')

Accuracy after feature selection: 0.93


In [1]:
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

# Requires the train/test split cell to have been run first: this cell reads
# X_train and y_train, and fails with NameError on a fresh kernel otherwise.

# Embed the training features into 2-D with t-SNE (seeded for repeatability).
tsne = TSNE(n_components=2, random_state=42)
X_embedded = tsne.fit_transform(X_train)

# Colour each point by y_train, the label vector that train_test_split produced
# together with X_train, so labels and embedded rows correspond one-to-one.
fig, ax = plt.subplots()
points = ax.scatter(X_embedded[:, 0], X_embedded[:, 1], c=y_train, cmap='coolwarm', alpha=0.6)
ax.set_title("Latent Space Visualization")
# Bind the colorbar to the scatter artist explicitly instead of relying on pyplot's "current" mappable.
fig.colorbar(points, ax=ax, label="Label")
plt.show()

NameError: name 'X_train' is not defined