In [None]:
import kagglehub

# Download the latest version of the dataset. The returned value is kept in
# `path` and is later joined with the CSV file name to load the data.
path = kagglehub.dataset_download("rakeshkapilavai/extrovert-vs-introvert-behavior-data")

print("Path to dataset files:", path)

In [None]:
import os
import pandas as pd
import numpy as np

# Name of the CSV file expected inside the downloaded dataset directory.
csv_file = "personality_dataset.csv"
data_file_path = os.path.join(path, csv_file)

try:
    df = pd.read_csv(data_file_path)
except FileNotFoundError:
    # Report using the names that actually exist in this cell, then re-raise
    # so later cells do not run against an undefined `df`.
    print(f"Error: {csv_file} not found in {path}. Please check the actual file name and path.")
    raise
else:
    print("Data Loaded Successfuly!")
    # Call head() so the first rows are printed rather than the bound method.
    print(df.head())

In [None]:
# Column dtypes and non-null counts of the raw frame.
df.info()

In [None]:
# Summary statistics of the raw frame.
df.describe()

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
# Histograms of the frame's columns (50 bins each) to eyeball their distributions.
df.hist(bins=50,figsize=(20,15))
plt.show()

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

# Single stratified 80/20 split on the target column so both sets keep the
# class proportions of `Personality`; the seed makes the split repeatable.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# n_splits=1, so the generator yields exactly one (train, test) pair.
train_index, test_index = next(split.split(df, df['Personality']))

# split() yields positional row indices, so select with .iloc; .loc would
# treat them as index labels and only works while df keeps its default
# 0..n-1 index.
strat_train_set = df.iloc[train_index]
strat_test_set = df.iloc[test_index]

In [None]:
# Share of each Personality class in the training set.
strat_train_set['Personality'].value_counts()/len(strat_train_set)

In [None]:
# Names of the object/category columns; later cells reuse this list.
non_numeric = list(df.select_dtypes(include=['object', 'category']).columns)

# Pairwise correlations between the remaining columns.
df_numeric = df.drop(non_numeric, axis=1)
corr_matrix = df_numeric.corr()
corr_matrix

In [None]:
# Targets for both splits (copied so they are independent of the split frames).
behaviour_labels = strat_train_set["Personality"].copy()
behaviour_labels_test = strat_test_set["Personality"].copy()

# Feature frames: the split frames with the target column removed.
behaviour = strat_train_set.drop(columns="Personality")
behaviour_test = strat_test_set.drop(columns="Personality")

print(non_numeric)

In [None]:
# Non-numeric feature columns: everything in non_numeric except the target.
remove = [name for name in non_numeric if name != 'Personality']

# Numeric views: the feature frames minus the non-numeric columns.
behaviour_num = behaviour.drop(columns=remove)
behaviour_num_test = behaviour_test.drop(columns=remove)

# Categorical views: the two categorical columns, selected by name.
behaviour_cat = behaviour[["Stage_fear","Drained_after_socializing"]]
behaviour_cat_test = behaviour_test[["Stage_fear","Drained_after_socializing"]]


In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder, StandardScaler

# Numeric columns: fill gaps with the column median, then standardise.
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy="median")),
    ('std_scaler', StandardScaler()),
])

# Categorical columns: fill gaps with the most frequent value, then map
# each category to an ordinal code.
cat_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ('encoder', OrdinalEncoder()),
])

# Route each column group through its own pipeline: numeric block first,
# categorical block second.
full_pipeline = ColumnTransformer(transformers=[
    ('num', num_pipeline, behaviour_num.columns),
    ('cat', cat_pipeline, behaviour_cat.columns),
])

# Fit on the training features only; the test features are transformed with
# what was learned from the training data.
behaviour_prepared = full_pipeline.fit_transform(behaviour)
behaviour_prepared_test = full_pipeline.transform(behaviour_test)

In [None]:
import pandas as pd

# Column labels in the same order the transformers were listed:
# numeric features first, then categorical.
num_features = list(behaviour_num.columns)
cat_features = list(behaviour_cat.columns)
all_features = num_features + cat_features

behaviour_prepared_df = pd.DataFrame(behaviour_prepared, columns=all_features)

# Sanity checks: shape, remaining nulls per column, non-null count of one column.
print(behaviour_prepared_df.shape)
print(behaviour_prepared_df.isnull().sum())
print(behaviour_prepared_df["Social_event_attendance"].count())



In [None]:
from sklearn.linear_model import LogisticRegression
# Logistic regression with default settings, trained on the prepared training features.
log_reg=LogisticRegression()
log_reg.fit(behaviour_prepared,behaviour_labels)

In [None]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validated accuracy of the logistic regression.
# NOTE(review): behaviour_prepared was imputed/scaled on the whole training
# set before this call, so the folds share preprocessing statistics.
scores = cross_val_score(
    estimator=log_reg,
    X=behaviour_prepared,
    y=behaviour_labels,
    cv=10,
    scoring='accuracy',
)
print("Accuracy scores from each fold:", scores)
print("Mean accuracy:", scores.mean())
print("Standard deviation:", scores.std())

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Random forest with depth capped at 5, 'sqrt' feature sampling, balanced class weights and a fixed seed.
forest_clas = RandomForestClassifier(max_depth=5,max_features='sqrt',random_state=42,class_weight='balanced')
forest_clas.fit(behaviour_prepared, behaviour_labels)

In [None]:
# 10-fold cross-validated accuracy of the random forest.
# NOTE(review): the features were preprocessed on the whole training set
# before this call, so the folds share preprocessing statistics.
forest_clas_scores = cross_val_score(
    estimator=forest_clas,
    X=behaviour_prepared,
    y=behaviour_labels,
    cv=10,
    scoring='accuracy',
)
print("Accuracy scores from each fold:", forest_clas_scores)
print("Mean accuracy:", forest_clas_scores.mean())
print("Standard deviation:", forest_clas_scores.std())

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Single decision tree with depth capped at 5, 'sqrt' feature sampling and a fixed seed.
des_tree_clas=DecisionTreeClassifier(max_depth=5,max_features='sqrt',random_state=42)
des_tree_clas.fit(behaviour_prepared,behaviour_labels)

In [None]:
# 10-fold cross-validated accuracy of the decision tree.
# NOTE(review): the features were preprocessed on the whole training set
# before this call, so the folds share preprocessing statistics.
d_scores = cross_val_score(
    estimator=des_tree_clas,
    X=behaviour_prepared,
    y=behaviour_labels,
    cv=10,
    scoring='accuracy',
)
print("Accuracy scores from each fold:", d_scores)
print("Mean accuracy:", d_scores.mean())
print("Standard deviation:", d_scores.std())

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient-boosted trees with default hyper-parameters. The seed is pinned,
# like the other tree models in this notebook, so a re-run reproduces the fit.
gr_clas = GradientBoostingClassifier(random_state=42)
gr_clas.fit(behaviour_prepared, behaviour_labels)

In [None]:
# 10-fold cross-validated accuracy of the gradient boosting model.
# NOTE(review): the features were preprocessed on the whole training set
# before this call, so the folds share preprocessing statistics.
gr_scores = cross_val_score(
    estimator=gr_clas,
    X=behaviour_prepared,
    y=behaviour_labels,
    cv=10,
    scoring='accuracy',
)
print("Accuracy scores from each fold:", gr_scores)
print("Mean accuracy:", gr_scores.mean())
print("Standard deviation:", gr_scores.std())

In [None]:
from sklearn.metrics import accuracy_score, classification_report

# Random-forest predictions on the same data the model was fitted on.
behaviour_predictions_train = forest_clas.predict(behaviour_prepared)

# Arguments are passed by keyword so truth and prediction cannot be mixed up.
print(
    "Accuracy:",
    accuracy_score(y_true=behaviour_labels, y_pred=behaviour_predictions_train),
)
print(
    classification_report(y_true=behaviour_labels, y_pred=behaviour_predictions_train)
)

In [None]:
# Random-forest predictions on the held-out test features.
behaviour_predictions_test = forest_clas.predict(behaviour_prepared_test)

# Arguments are passed by keyword so truth and prediction cannot be mixed up.
print(
    "Accuracy:",
    accuracy_score(y_true=behaviour_labels_test, y_pred=behaviour_predictions_test),
)
print(
    classification_report(y_true=behaviour_labels_test, y_pred=behaviour_predictions_test)
)

In [None]:
def predict_personality(
    Time_spent_Alone, Social_event_attendance, Going_outside,
    Friends_circle_size, Post_frequency, Stage_fear, Drained_after_socializing,model
):
    """Predict the personality label for a single person.

    The seven feature values are packed into a one-row DataFrame whose
    columns carry the same names as the parameters, passed through the
    module-level ``full_pipeline``, and handed to ``model``.

    Parameters
    ----------
    Time_spent_Alone, Social_event_attendance, Going_outside,
    Friends_circle_size, Post_frequency, Stage_fear, Drained_after_socializing
        Values for the correspondingly named feature columns.
    model
        Object exposing ``predict``; it receives the transformed row.

    Returns
    -------
    The first element of ``model.predict(...)``.
    """
    import pandas as pd

    # One single-element list per column gives a one-row frame.
    row = {
        "Time_spent_Alone": [Time_spent_Alone],
        "Social_event_attendance": [Social_event_attendance],
        "Going_outside": [Going_outside],
        "Friends_circle_size": [Friends_circle_size],
        "Post_frequency": [Post_frequency],
        "Stage_fear": [Stage_fear],
        "Drained_after_socializing": [Drained_after_socializing],
    }
    features = full_pipeline.transform(pd.DataFrame(row))
    return model.predict(features)[0]


In [None]:
# Interactive demo for predict_personality with forest_clas. It is wrapped in
# a string literal, so none of it runs; the cell only evaluates to the string.
'''Time_spent_Alone=int(input("Enter Time Spent Alone"))
Social_event_attendance=int(input("Enter the number of hours you attend any social event"))
Going_outside=int(input("How many hours do you go outside"))
Friends_circle_size=int(input("Whats your Friend Circle size"))
Post_frequency=int(input("Social media post frequency"))
Stage_fear=input("Presence of stage fright (Yes/No)")
Drained_after_socializing=input("Feeling drained after socializing (Yes/No).")
print(predict_personality(
    Time_spent_Alone, Social_event_attendance, Going_outside,
    Friends_circle_size, Post_frequency, Stage_fear, Drained_after_socializing,forest_clas
))'''

In [None]:
# Class counts among training rows whose Post_frequency equals 0.
print(
    strat_train_set.loc[strat_train_set['Post_frequency'] == 0, 'Personality'].value_counts()
)


In [None]:
import matplotlib.pyplot as plt

# Labels in the order the columns were fed to the model: numeric, then categorical.
feature_names = list(behaviour_num.columns) + list(behaviour_cat.columns)

# Ascending sort, so the most important feature ends up as the top bar.
importances = gr_clas.feature_importances_
sorted_idx = importances.argsort()
bar_positions = range(len(sorted_idx))

plt.figure(figsize=(10, 5))
plt.barh(bar_positions, importances[sorted_idx], align='center')
plt.yticks(bar_positions, [feature_names[idx] for idx in sorted_idx])
plt.xlabel("Feature Importance")
plt.title("Gradient Boosting - Feature Importances")
plt.tight_layout()
plt.show()


In [None]:
import seaborn as sns
# Row counts per Post_frequency value, split by Personality. Uses the full frame `df`, not only the training split.
sns.countplot(x='Post_frequency', hue='Personality', data=df)

In [None]:
# Feature frames without the Post_frequency column.
behaviour_no_post = behaviour.drop(columns='Post_frequency')
behaviour_no_post_test = behaviour_test.drop(columns='Post_frequency')

# Column groups for the reduced feature set.
behaviour_no_post_num = behaviour_num.drop(columns='Post_frequency')
behaviour_no_post_cat = behaviour_no_post[["Stage_fear","Drained_after_socializing"]]

In [None]:
# Reuses num_pipeline and cat_pipeline, wired to the reduced column set.
full_pipeline_no_post = ColumnTransformer(transformers=[
    ('num', num_pipeline, behaviour_no_post_num.columns),
    ('cat', cat_pipeline, behaviour_no_post_cat.columns),
])

# Fit on the training features, then apply the fitted transformer to the test features.
behaviour_no_post_prepared = full_pipeline_no_post.fit_transform(behaviour_no_post)
behaviour_no_post_prepared_test = full_pipeline_no_post.transform(behaviour_no_post_test)

In [None]:
# Random forest with the same hyper-parameters as forest_clas, trained on the features without Post_frequency.
forest_clas_new = RandomForestClassifier(max_depth=5,max_features='sqrt',random_state=42,class_weight='balanced')
forest_clas_new.fit(behaviour_no_post_prepared, behaviour_labels)

In [None]:
# 10-fold cross-validated accuracy of the forest trained without Post_frequency.
# NOTE(review): the features were preprocessed on the whole training set
# before this call, so the folds share preprocessing statistics.
forest_clas_new_scores = cross_val_score(
    estimator=forest_clas_new,
    X=behaviour_no_post_prepared,
    y=behaviour_labels,
    cv=10,
    scoring='accuracy',
)
print("Accuracy scores from each fold:", forest_clas_new_scores)
print("Mean accuracy:", forest_clas_new_scores.mean())
print("Standard deviation:", forest_clas_new_scores.std())

In [None]:
from sklearn.metrics import accuracy_score, classification_report

# Predictions of the no-Post_frequency forest on the data it was fitted on.
behaviour_predictions_no_post_train = forest_clas_new.predict(behaviour_no_post_prepared)

# Arguments are passed by keyword so truth and prediction cannot be mixed up.
print(
    "Accuracy:",
    accuracy_score(y_true=behaviour_labels, y_pred=behaviour_predictions_no_post_train),
)
print(
    classification_report(y_true=behaviour_labels, y_pred=behaviour_predictions_no_post_train)
)

In [None]:
# Predictions of the no-Post_frequency forest on the held-out test features.
behaviour_predictions_no_post_test = forest_clas_new.predict(behaviour_no_post_prepared_test)

# Arguments are passed by keyword so truth and prediction cannot be mixed up.
print(
    "Accuracy:",
    accuracy_score(y_true=behaviour_labels_test, y_pred=behaviour_predictions_no_post_test),
)
print(
    classification_report(y_true=behaviour_labels_test, y_pred=behaviour_predictions_no_post_test)
)

In [None]:
def predict_personality_new(
    Time_spent_Alone, Social_event_attendance, Going_outside,
    Friends_circle_size, Stage_fear, Drained_after_socializing,model
):
    """Predict the personality label for a single person, without Post_frequency.

    The six feature values are packed into a one-row DataFrame whose columns
    carry the same names as the parameters, passed through the module-level
    ``full_pipeline_no_post``, and handed to ``model``.

    Parameters
    ----------
    Time_spent_Alone, Social_event_attendance, Going_outside,
    Friends_circle_size, Stage_fear, Drained_after_socializing
        Values for the correspondingly named feature columns.
    model
        Object exposing ``predict``; it receives the transformed row.

    Returns
    -------
    The first element of ``model.predict(...)``.
    """
    import pandas as pd

    # One single-element list per column gives a one-row frame.
    row = {
        "Time_spent_Alone": [Time_spent_Alone],
        "Social_event_attendance": [Social_event_attendance],
        "Going_outside": [Going_outside],
        "Friends_circle_size": [Friends_circle_size],
        "Stage_fear": [Stage_fear],
        "Drained_after_socializing": [Drained_after_socializing],
    }
    features = full_pipeline_no_post.transform(pd.DataFrame(row))
    return model.predict(features)[0]


In [None]:
# Collect the four numeric feature values interactively.
Time_spent_Alone = int(input("Enter Time Spent Alone"))
Social_event_attendance = int(input("Enter the number of hours you attend any social event"))
Going_outside = int(input("How many hours do you go outside"))
Friends_circle_size = int(input("Whats your Friend Circle size"))

# The ordinal encoder in the pipeline rejects category strings it was not
# fitted on, so tidy the answers to the "Yes"/"No" spelling the prompts ask
# for (e.g. "yes", " YES " -> "Yes").
# NOTE(review): assumes the dataset stores these columns as "Yes"/"No" — confirm.
Stage_fear = input("Presence of stage fright (Yes/No)").strip().capitalize()
Drained_after_socializing = input("Feeling drained after socializing (Yes/No).").strip().capitalize()

print(predict_personality_new(
    Time_spent_Alone, Social_event_attendance, Going_outside,
    Friends_circle_size, Stage_fear, Drained_after_socializing, forest_clas_new
))