<a href="https://colab.research.google.com/github/rojadasappa/ml_introvert-extrovert-prediction/blob/main/Personality_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Load the raw dataset.
df = pd.read_csv("/content/personality_datasert.csv")  # change filename

# Infer each column's expected type ("numeric" or "string") from its first
# non-missing entry. Rows whose values disagree with the inferred type are
# treated as anomalies further down.
expected_types = {}
for col in df.columns:
    non_missing = df[col].dropna()
    if non_missing.empty:
        # Without this guard `.iloc[0]` fails with an opaque IndexError.
        raise ValueError(
            f"Column {col!r} has no non-missing values; cannot infer its type."
        )
    first_valid = non_missing.iloc[0]
    if isinstance(first_valid, (int, float)):
        expected_types[col] = "numeric"
        continue
    # Not a native int/float (e.g. a string): treat the column as numeric only
    # if the value can be parsed as a float. Catch just the conversion errors
    # so that unrelated failures (and Ctrl-C) are not silently swallowed.
    try:
        float(first_valid)
    except (TypeError, ValueError, OverflowError):
        expected_types[col] = "string"
    else:
        expected_types[col] = "numeric"

# Function to check if a value matches expected type
def matches_type(val, expected_type):
    """Return True when `val` is present and conforms to `expected_type`.

    Args:
        val: A single cell value.
        expected_type: Either "numeric" or "string".

    Returns:
        False for missing values (as reported by `pd.isna`) and for any
        other `expected_type` label. For "numeric", True when `float(val)`
        succeeds. For "string", True when `val` is a `str`.
    """
    if pd.isna(val):
        return False
    if expected_type == "numeric":
        # Only conversion failures mean "not numeric"; anything else
        # (including KeyboardInterrupt) should propagate.
        try:
            float(val)
        except (TypeError, ValueError, OverflowError):
            return False
        return True
    if expected_type == "string":
        return isinstance(val, str)
    return False

# Filter out rows with anomalies: a row is kept only when every cell matches
# the expected type of its column.
mask = df.apply(lambda row: all(matches_type(row[col], expected_types[col]) for col in df.columns), axis=1)
df_cleaned = df[mask].copy()

# Convert numeric columns to float
for col, t in expected_types.items():
    if t == "numeric":
        df_cleaned[col] = df_cleaned[col].astype(float)

# Label encode string columns. Each column gets its own encoder, kept in
# `column_encoders`, so the integer codes can be mapped back to the original
# labels later (e.g. column_encoders[col].classes_ or .inverse_transform).
# Re-fitting one shared encoder would overwrite the mapping on every column.
column_encoders = {}
for col, t in expected_types.items():
    if t == "string":
        le = LabelEncoder()
        df_cleaned[col] = le.fit_transform(df_cleaned[col])
        column_encoders[col] = le

# Save cleaned dataset
df_cleaned.to_csv("cleaned_dataset.csv", index=False)

print(f"✅ Cleaning complete. {len(df) - len(df_cleaned)} rows removed due to anomalies.")

✅ Cleaning complete. 0 rows removed due to anomalies.


In [None]:
import pandas as pd

# Load the dataset produced by the first cleaning pass. That file is written
# with a path relative to the working directory, so read it the same way;
# an absolute "/content/..." path only matches when the working directory
# happens to be /content.
df = pd.read_csv("cleaned_dataset.csv")  # change filename

# Columns expected to be integers
expected_int_cols = [
    "Time_spent_Alone",
    "Stage_fear",
    "Social_event_attendance",
    "Going_outside",
    "Drained_after_socializing",
    "Friends_circle_size",
    "Post_frequency",
    "Personality"
]

# Function to check if a value is strictly an integer
def is_integer_value(val):
    """Return True when `val` converts to a float with no fractional part.

    Args:
        val: A single cell value (number, numeric string, ...).

    Returns:
        True if `float(val)` succeeds and the result is a whole number.
        False if the conversion fails or the result has a fractional part
        (NaN and infinities are not whole numbers).
    """
    try:
        # Must be numeric and have no decimal part
        return float(val).is_integer()
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures mean "not an integer"; other exceptions
        # (including KeyboardInterrupt) are allowed to propagate.
        return False

# Row-level predicate used to build the filter mask below.
def _all_expected_cols_integer(row):
    """Return True when every column in `expected_int_cols` holds a whole number."""
    for col in expected_int_cols:
        if not is_integer_value(row[col]):
            return False
    return True

# Retain only the rows that pass the predicate
mask = df.apply(_all_expected_cols_integer, axis=1)
df_cleaned = df[mask].copy()

# Cast the validated columns to int
df_cleaned[expected_int_cols] = df_cleaned[expected_int_cols].astype(int)

# Write the result of this second cleaning pass
df_cleaned.to_csv("cleaned_dataset2.csv", index=False)

print(f"✅ Cleaning complete. {len(df) - len(df_cleaned)} rows removed due to anomalies.")

✅ Cleaning complete. 257 rows removed due to anomalies.


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder

# Load the dataset produced by the second cleaning pass. That file is written
# with a path relative to the working directory, so read it the same way;
# an absolute "/content/..." path only matches when the working directory
# happens to be /content.
df = pd.read_csv("cleaned_dataset2.csv")  # Change filename if needed

# Target variable (change to your target column)
target_col = "Personality"

# Separate features and target
X = df.drop(columns=[target_col])
y = df[target_col]

# Encode any categorical (object-dtype) feature columns, keeping one fitted
# encoder per column so the codes can be mapped back to labels.
label_encoders = {}
for col in X.columns:
    if X[col].dtype == object:
        le = LabelEncoder()
        X[col] = le.fit_transform(X[col])
        label_encoders[col] = le

# Hold out 20% of the rows for evaluation; the seed is fixed so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Build the random forest and fit it on the training portion
model = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Predict on the held-out rows
y_pred = model.predict(X_test)

# Report accuracy, then the per-class metrics
test_accuracy = accuracy_score(y_test, y_pred)
print("✅ Model Accuracy:", test_accuracy)
test_report = classification_report(y_test, y_pred)
print("\nClassification Report:\n", test_report)

# Persist the fitted model to disk
import joblib
joblib.dump(model, "personality_classifier.joblib")
print("💾 Model saved as 'personality_classifier.joblib'")

✅ Model Accuracy: 0.9035916824196597

Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.91      0.91       267
           1       0.91      0.89      0.90       262

    accuracy                           0.90       529
   macro avg       0.90      0.90      0.90       529
weighted avg       0.90      0.90      0.90       529

💾 Model saved as 'personality_classifier.joblib'


In [None]:
import joblib
import pandas as pd

# Load the scikit-learn model
# NOTE: joblib.load unpickles the file, which can execute arbitrary code.
# Only load model files you created yourself or otherwise trust.
model = joblib.load("personality_classifier.joblib")  # ✅ Correct way

# Feature names to collect, in the order they are passed to the model
fields = [
    "Time_spent_Alone",
    "Stage_fear",
    "Social_event_attendance",
    "Going_outside",
    "Drained_after_socializing",
    "Friends_circle_size",
    "Post_frequency"
]


def _prompt_int(field):
    """Prompt for `field` until the user enters text that parses as an int."""
    while True:
        raw = input(f"Enter value for {field}: ")  # All expected ints
        try:
            return int(raw)
        except ValueError:
            # Re-prompt instead of aborting the cell on a typo.
            print(f"'{raw}' is not a valid integer. Please try again.")


# Collect user input
user_data = {}
for field in fields:
    user_data[field] = _prompt_int(field)

# Prepare DataFrame (a single row; columns follow the insertion order of `fields`)
X_new = pd.DataFrame([user_data])

# Get prediction
prediction = model.predict(X_new)[0]
print(f"Predicted Personality: {prediction}")

Enter value for Time_spent_Alone: 6
Enter value for Stage_fear: 1
Enter value for Social_event_attendance: 0
Enter value for Going_outside: 1
Enter value for Drained_after_socializing: 1
Enter value for Friends_circle_size: 10
Enter value for Post_frequency: 2
Predicted Personality: 1
