<a href="https://colab.research.google.com/github/singhShiven/Health-Connect/blob/main/personality_predictor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install third-party packages into the environment of the running kernel.
# %pip (unlike !pip) always targets the interpreter that backs this notebook.
%pip install seaborn kagglehub kaggle




In [None]:
import warnings
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler, PolynomialFeatures
warnings.filterwarnings('ignore')

In [None]:
import kagglehub

# Fetch the newest published version of the dataset; `path` is the local
# directory that kagglehub reports for the downloaded files.
path = kagglehub.dataset_download(
    "rakeshkapilavai/extrovert-vs-introvert-behavior-data"
)

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/rakeshkapilavai/extrovert-vs-introvert-behavior-data?dataset_version_number=1...


100%|██████████| 15.0k/15.0k [00:00<00:00, 2.76MB/s]

Extracting files...
Path to dataset files: /root/.cache/kagglehub/datasets/rakeshkapilavai/extrovert-vs-introvert-behavior-data/versions/1





In [None]:
# Read the CSV from the directory kagglehub returned in `path`, so the notebook
# works wherever the download landed instead of only under /kaggle/input.
data = pd.read_csv(Path(path) / "personality_dataset.csv")

# Preview the first rows only; head(-5) would render every row except the last five.
data.head()

Unnamed: 0,Time_spent_Alone,Stage_fear,Social_event_attendance,Going_outside,Drained_after_socializing,Friends_circle_size,Post_frequency,Personality
0,4.0,No,4.0,6.0,No,13.0,5.0,Extrovert
1,9.0,Yes,0.0,0.0,Yes,0.0,3.0,Introvert
2,9.0,Yes,1.0,2.0,Yes,5.0,2.0,Introvert
3,0.0,No,6.0,7.0,No,14.0,8.0,Extrovert
4,3.0,No,9.0,4.0,No,8.0,5.0,Extrovert


In [None]:
# Fail fast if the CSV lacks any column the rest of the notebook relies on,
# and name the absent columns so the problem can be fixed without guesswork.
required_columns = ['Time_spent_Alone', 'Stage_fear', 'Social_event_attendance', 'Going_outside',
                    'Drained_after_socializing', 'Friends_circle_size', 'Post_frequency', 'Personality']
missing_columns = [col for col in required_columns if col not in data.columns]
if missing_columns:
    raise ValueError(f"Missing required columns in dataset: {missing_columns}")



In [None]:
# Column groups referenced by the imputation, encoding and capping steps below.
numeric_columns = [
    'Time_spent_Alone',
    'Social_event_attendance',
    'Going_outside',
    'Friends_circle_size',
    'Post_frequency',
]
categorical_columns = [
    'Stage_fear',
    'Drained_after_socializing',
]
# Label column that the classifier is trained to predict.
target_column = 'Personality'

In [None]:
# Verify categorical values
for col in categorical_columns:
    print(f"\nUnique values in {col}:")
    print(data[col].value_counts(dropna=False))


Unique values in Stage_fear:
Stage_fear
No     1417
Yes    1410
NaN      73
Name: count, dtype: int64

Unique values in Drained_after_socializing:
Drained_after_socializing
No     1441
Yes    1407
NaN      52
Name: count, dtype: int64


In [None]:
# Display basic information
print("Dataset Shape:", data.shape)
print("\nDataset Info:")
print(data.info())
print("\nMissing Values:")
print(data.isnull().sum())

Dataset Shape: (2900, 8)

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2900 entries, 0 to 2899
Data columns (total 8 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Time_spent_Alone           2837 non-null   float64
 1   Stage_fear                 2827 non-null   object 
 2   Social_event_attendance    2838 non-null   float64
 3   Going_outside              2834 non-null   float64
 4   Drained_after_socializing  2848 non-null   object 
 5   Friends_circle_size        2823 non-null   float64
 6   Post_frequency             2835 non-null   float64
 7   Personality                2900 non-null   object 
dtypes: float64(5), object(3)
memory usage: 181.4+ KB
None

Missing Values:
Time_spent_Alone             63
Stage_fear                   73
Social_event_attendance      62
Going_outside                66
Drained_after_socializing    52
Friends_circle_size          77
Post_frequency               65
Personality                   0
dtype: int64

In [None]:
# Encode target variable.
# LabelEncoder numbers the classes in sorted order, so Extrovert=0, Introvert=1.
le = LabelEncoder()
data[target_column] = le.fit_transform(data[target_column])

# Handle missing values: median for numeric columns, mode for categorical ones.
# NOTE(review): the imputers, the capping quantiles and the bin edges below are
# all computed on the full dataset before the train/test split, so test rows
# influence preprocessing. Consider fitting them on the training split only.
numeric_imputer = SimpleImputer(strategy='median')
data[numeric_columns] = numeric_imputer.fit_transform(data[numeric_columns])
categorical_imputer = SimpleImputer(strategy='most_frequent')
data[categorical_columns] = categorical_imputer.fit_transform(data[categorical_columns])

# Encode categorical features (drop_first keeps a single indicator per column)
data = pd.get_dummies(data, columns=categorical_columns, drop_first=True)

# Cap outliers with IQR fences. The upper fence and the clip must run inside
# the loop so that every numeric column is capped, not just the last one.
for col in numeric_columns:
    Q1 = data[col].quantile(0.25)
    Q3 = data[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    # Less aggressive upper bound for extrovert-related features
    if col in ['Social_event_attendance', 'Friends_circle_size', 'Post_frequency']:
        upper_bound = Q3 + 2.5 * IQR
    else:
        upper_bound = Q3 + 1.5 * IQR
    data[col] = data[col].clip(lower=lower_bound, upper=upper_bound)

# Compute tercile bin edges for Time_spent_Alone (retbins=True returns them as
# the second element); they are reused when binning the splits later on.
time_alone_bins = pd.qcut(data['Time_spent_Alone'], q=3, retbins=True)[1]

# Stratified train-test split so both splits keep the class balance
X = data.drop(columns=[target_column])
y = data[target_column]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

In [None]:
def _add_engineered_features(features):
    """Return ``features`` limited to the base ``X`` columns plus derived ones.

    The same transformation is applied to the training and the test split so
    the two cannot drift apart. Restarting from ``X.columns`` also means that
    re-running the cell does not stack derived columns on top of each other.

    Parameters
    ----------
    features : pandas.DataFrame
        A split of ``X`` (it may already carry derived columns from an
        earlier run of this cell).

    Returns
    -------
    pandas.DataFrame
        The base columns followed by the interaction features and the
        ``Time_spent_Alone_Binned_*`` indicator columns.
    """
    engineered = pd.DataFrame(features, columns=X.columns)

    # Interaction features (the +1 avoids division by zero for non-attendees)
    engineered['Alone_to_Social_Ratio'] = (
        engineered['Time_spent_Alone'] / (engineered['Social_event_attendance'] + 1)
    )
    engineered['Social_Comfort_Index'] = (
        engineered['Friends_circle_size']
        + engineered['Post_frequency']
        - engineered['Stage_fear_Yes']
    ) / 3
    engineered['Social_Overload'] = (
        engineered['Drained_after_socializing_Yes'] * engineered['Social_event_attendance']
    )

    # Binned feature, using the bin edges computed before the split
    engineered['Time_spent_Alone_Binned'] = pd.cut(
        engineered['Time_spent_Alone'],
        bins=time_alone_bins,
        labels=['Low', 'Medium', 'High'],
        include_lowest=True,
    )
    return pd.get_dummies(engineered, columns=['Time_spent_Alone_Binned'], drop_first=True)


X_train = _add_engineered_features(X_train)
X_test = _add_engineered_features(X_test)

# Polynomial features: pairwise products only, fitted on the training split
poly_base_columns = ['Time_spent_Alone', 'Social_event_attendance', 'Friends_circle_size']
poly = PolynomialFeatures(degree=2, include_bias=False, interaction_only=True)
poly_features_train = poly.fit_transform(X_train[poly_base_columns])
poly_features_test = poly.transform(X_test[poly_base_columns])
poly_feature_names = poly.get_feature_names_out(poly_base_columns)
X_train[poly_feature_names] = poly_features_train
X_test[poly_feature_names] = poly_features_test

# Scale features; the scaler is fitted on the training split only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Train RandomForest model
from sklearn.linear_model import LogisticRegression
l_model = LogisticRegression(max_iter=1000,solver='lbfgs',C= 0.0006951927961775605)
l_model.fit(X_train_scaled, y_train)


In [None]:
from sklearn.metrics import accuracy_score

# Score the fitted classifier on the held-out test split
y_pred = l_model.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
print("Model Accuracy:", accuracy)

Model Accuracy: 0.9172413793103448


In [None]:
def predict_personality(input_dict):
    """Predict 'Extrovert' or 'Introvert' for a single respondent.

    Runs the respondent's answers through the same steps that were applied to
    the training data (imputation, indicator encoding, outlier capping,
    interaction / binned / polynomial features, scaling) using the fitted
    module-level objects, then queries ``l_model``.

    Parameters
    ----------
    input_dict : dict
        Maps the names in ``numeric_columns`` and ``categorical_columns`` to
        the respondent's answers. Keys that are absent are treated as missing
        values and filled by the fitted imputers.

    Returns
    -------
    str
        ``'Extrovert'`` or ``'Introvert'``.
    """
    # Reindex so that absent keys become NaN columns instead of a KeyError.
    df_input = pd.DataFrame([input_dict]).reindex(columns=numeric_columns + categorical_columns)

    # Handle missing values
    df_input[numeric_columns] = numeric_imputer.transform(df_input[numeric_columns])
    df_input[categorical_columns] = categorical_imputer.transform(
        df_input[categorical_columns].astype(object)
    )

    # Encode the categorical answers explicitly. get_dummies(drop_first=True)
    # on a single row sees only one category per column and drops it, which
    # would leave every *_Yes indicator at 0 whatever the answer was.
    for col in categorical_columns:
        df_input[f'{col}_Yes'] = (df_input[col] == 'Yes').astype(int)
    df_input = df_input.drop(columns=categorical_columns)

    # Outlier capping with the same IQR fences as the training data
    for col in numeric_columns:
        Q1 = X[col].quantile(0.25)
        Q3 = X[col].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 2.5 * IQR if col in ['Social_event_attendance', 'Friends_circle_size', 'Post_frequency'] else Q3 + 1.5 * IQR
        df_input[col] = df_input[col].clip(lower=lower_bound, upper=upper_bound)

    # Interaction features
    df_input['Alone_to_Social_Ratio'] = df_input['Time_spent_Alone'] / (df_input['Social_event_attendance'] + 1)
    df_input['Social_Comfort_Index'] = (df_input['Friends_circle_size'] + df_input['Post_frequency'] - df_input['Stage_fear_Yes']) / 3
    df_input['Social_Overload'] = df_input['Drained_after_socializing_Yes'] * df_input['Social_event_attendance']

    # Binned feature. Keep the value inside the outermost bin edges first:
    # pd.cut returns NaN outside them, which would zero both indicator columns
    # and silently read as the dropped 'Low' bin.
    time_alone_for_binning = df_input['Time_spent_Alone'].clip(
        lower=time_alone_bins[0], upper=time_alone_bins[-1]
    )
    df_input['Time_spent_Alone_Binned'] = pd.cut(
        time_alone_for_binning,
        bins=time_alone_bins,
        labels=['Low', 'Medium', 'High'],
        include_lowest=True,
    )
    # The binned column is categorical with all three labels, so get_dummies
    # emits the Medium/High indicators even for a single row.
    df_input = pd.get_dummies(df_input, columns=['Time_spent_Alone_Binned'], drop_first=True)

    # Polynomial features
    poly_input = poly.transform(df_input[['Time_spent_Alone', 'Social_event_attendance', 'Friends_circle_size']])
    df_input[poly_feature_names] = poly_input

    # Match the training columns and their order; any column still absent is
    # filled with 0.
    df_input = df_input.reindex(columns=X_train.columns, fill_value=0)

    # Scale
    input_scaled = scaler.transform(df_input)

    # Predict. LabelEncoder assigned codes in sorted class order, so
    # 0 is 'Extrovert' and 1 is 'Introvert'.
    prediction = l_model.predict(input_scaled)[0]
    return 'Extrovert' if prediction == 0 else 'Introvert'


In [None]:
# Example usage
user_input = {
    'Time_spent_Alone': 19,
    'Stage_fear': 'Yes',
    'Social_event_attendance': 5,
    'Going_outside': 3,
    'Drained_after_socializing': 'Yes',
    'Friends_circle_size':4,
    'Post_frequency': 2
}

In [None]:
# Classify the sample respondent and report the predicted label
result = predict_personality(user_input)
print(f"Predicted Personality: {result}")

Predicted Personality: Introvert


In [None]:
# %pip (unlike !pip) installs into the environment of the running kernel.
%pip install joblib



In [None]:
import joblib

# Persist the fitted model together with the preprocessing objects, one
# joblib file per object, in the current working directory.
# NOTE(review): predict_personality also reads X_train.columns and quantiles
# of X, neither of which is saved here - confirm how inference code outside
# this notebook is expected to obtain them.
artifacts = {
    'personality_model.joblib': l_model,
    'scaler.joblib': scaler,
    'numeric_imputer.joblib': numeric_imputer,
    'categorical_imputer.joblib': categorical_imputer,
    'polynomial_features.joblib': poly,
    'time_alone_bins.joblib': time_alone_bins,
}
for filename, fitted_object in artifacts.items():
    joblib.dump(fitted_object, filename)

print("Model and related objects saved successfully.")

Model and related objects saved successfully.


In [1]:
# %pip (unlike !pip) installs into the environment of the running kernel.
%pip install pyforest

Collecting pyforest
  Downloading pyforest-1.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyforest
  Building wheel for pyforest (setup.py) ... [?25l[?25hdone
  Created wheel for pyforest: filename=pyforest-1.1.2-py2.py3-none-any.whl size=15902 sha256=ddb4f5ae76e2ff0ce8913ec1b28e522d604710a695bb78b583abb34b22f36047
  Stored in directory: /root/.cache/pip/wheels/50/94/94/c8224da834a4db55f67c86feb7c742111ff051d5f3e17127d8
Successfully built pyforest
Installing collected packages: pyforest
Successfully installed pyforest-1.1.2
