In [2]:
# Import necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import plotly.express as px

# Load the dataset
data = pd.read_csv('data-final.csv')

# Display the first few rows of the dataset
print(data.head())

# Exploratory Data Analysis (EDA)
# Check for missing values
print(data.isnull().sum())

# Summary statistics
print(data.describe())

# Visualize the distribution of personality traits
sns.countplot(x='personality_type', data=data)
plt.title('Distribution of Personality Types')
plt.show()

# Data Preprocessing
# Convert categorical variables to numerical if necessary
data = pd.get_dummies(data, drop_first=True)

# Define features and target variable
X = data.drop('personality_type', axis=1)
y = data['personality_type']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Train a Random Forest Classifier
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Accuracy Score:", accuracy_score(y_test, y_pred))

# Feature Importance
importances = model.feature_importances_
indices = np.argsort(importances)[::-1]
features = X.columns

# Plot feature importances
plt.figure(figsize=(10, 6))
plt.title("Feature Importances")
plt.bar(range(X.shape[1]), importances[indices], align='center')
plt.xticks(range(X.shape[1]), features[indices], rotation=90)
plt.show()

# Additional Visualizations
# Bar chart of feature importances
feature_importances_df = pd.DataFrame({
    'Feature': features,
    'Importance': importances
}).sort_values(by='Importance', ascending=False)

plt.figure(figsize=(12, 8))
sns.barplot(x='Importance', y='Feature', data=feature_importances_df)
plt.title('Feature Importances')
plt.show()

# Pie chart of target variable distribution
personality_counts = data['personality_type'].value_counts()
fig = px.pie(values=personality_counts, names=personality_counts.index, title='Distribution of Personality Types')
fig.show()


  EXT1\tEXT2\tEXT3\tEXT4\tEXT5\tEXT6\tEXT7\tEXT8\tEXT9\tEXT10\tEST1\tEST2\tEST3\tEST4\tEST5\tEST6\tEST7\tEST8\tEST9\tEST10\tAGR1\tAGR2\tAGR3\tAGR4\tAGR5\tAGR6\tAGR7\tAGR8\tAGR9\tAGR10\tCSN1\tCSN2\tCSN3\tCSN4\tCSN5\tCSN6\tCSN7\tCSN8\tCSN9\tCSN10\tOPN1\tOPN2\tOPN3\tOPN4\tOPN5\tOPN6\tOPN7\tOPN8\tOPN9\tOPN10\tEXT1_E\tEXT2_E\tEXT3_E\tEXT4_E\tEXT5_E\tEXT6_E\tEXT7_E\tEXT8_E\tEXT9_E\tEXT10_E\tEST1_E\tEST2_E\tEST3_E\tEST4_E\tEST5_E\tEST6_E\tEST7_E\tEST8_E\tEST9_E\tEST10_E\tAGR1_E\tAGR2_E\tAGR3_E\tAGR4_E\tAGR5_E\tAGR6_E\tAGR7_E\tAGR8_E\tAGR9_E\tAGR10_E\tCSN1_E\tCSN2_E\tCSN3_E\tCSN4_E\tCSN5_E\tCSN6_E\tCSN7_E\tCSN8_E\tCSN9_E\tCSN10_E\tOPN1_E\tOPN2_E\tOPN3_E\tOPN4_E\tOPN5_E\tOPN6_E\tOPN7_E\tOPN8_E\tOPN9_E\tOPN10_E\tdateload\tscreenw\tscreenh\tintroelapse\ttestelapse\tendelapse\tIPC\tcountry\tlat_appx_lots_of_err\tlong_appx_lots_of_err
0  4\t1\t5\t2\t5\t1\t5\t2\t4\t1\t1\t4\t4\t2\t2\t2...                                                                                                                  

ValueError: Could not interpret value `personality_type` for `x`. An entry with this name does not appear in `data`.