# TITLE: Personality Predictor Model 
## GOAL: Predict whether a person is an extrovert or an introvert


In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder


## step 1: load the dataset

In [9]:

# Read the raw personality survey CSV into a DataFrame.
df = pd.read_csv(filepath_or_buffer='personality_datasert.csv')
# Bare last expression so the notebook renders the loaded frame.
df

Unnamed: 0,Time_spent_Alone,Stage_fear,Social_event_attendance,Going_outside,Drained_after_socializing,Friends_circle_size,Post_frequency,Personality
0,4.0,No,4.0,6.0,No,13.0,5.0,Extrovert
1,9.0,Yes,0.0,0.0,Yes,0.0,3.0,Introvert
2,9.0,Yes,1.0,2.0,Yes,5.0,2.0,Introvert
3,0.0,No,6.0,7.0,No,14.0,8.0,Extrovert
4,3.0,No,9.0,4.0,No,8.0,5.0,Extrovert
...,...,...,...,...,...,...,...,...
2895,3.0,No,7.0,6.0,No,6.0,6.0,Extrovert
2896,3.0,No,8.0,3.0,No,14.0,9.0,Extrovert
2897,4.0,Yes,1.0,1.0,Yes,4.0,0.0,Introvert
2898,11.0,Yes,1.0,3.0,Yes,2.0,0.0,Introvert


### information about the data

In [10]:
# Print a summary of the frame: column names, non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2900 entries, 0 to 2899
Data columns (total 8 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Time_spent_Alone           2900 non-null   float64
 1   Stage_fear                 2900 non-null   object 
 2   Social_event_attendance    2900 non-null   float64
 3   Going_outside              2900 non-null   float64
 4   Drained_after_socializing  2900 non-null   object 
 5   Friends_circle_size        2900 non-null   float64
 6   Post_frequency             2900 non-null   float64
 7   Personality                2900 non-null   object 
dtypes: float64(5), object(3)
memory usage: 181.4+ KB


## step 2: copy the original dataframe for preprocessing

In [11]:
# Work on an independent (deep) copy so the raw `df` stays untouched by preprocessing.
df_processed = df.copy(deep=True)
# Bare last expression so the notebook renders the copied frame.
df_processed

Unnamed: 0,Time_spent_Alone,Stage_fear,Social_event_attendance,Going_outside,Drained_after_socializing,Friends_circle_size,Post_frequency,Personality
0,4.0,No,4.0,6.0,No,13.0,5.0,Extrovert
1,9.0,Yes,0.0,0.0,Yes,0.0,3.0,Introvert
2,9.0,Yes,1.0,2.0,Yes,5.0,2.0,Introvert
3,0.0,No,6.0,7.0,No,14.0,8.0,Extrovert
4,3.0,No,9.0,4.0,No,8.0,5.0,Extrovert
...,...,...,...,...,...,...,...,...
2895,3.0,No,7.0,6.0,No,6.0,6.0,Extrovert
2896,3.0,No,8.0,3.0,No,14.0,9.0,Extrovert
2897,4.0,Yes,1.0,1.0,Yes,4.0,0.0,Introvert
2898,11.0,Yes,1.0,3.0,Yes,2.0,0.0,Introvert


## step 3: convert the categorical Yes/No columns to 1/0

In [12]:
# Encode the Yes/No answer columns as 1/0.
#
# The values are read from the untouched `df` instead of from `df_processed`,
# so this cell is idempotent: mapping the already-encoded 1/0 values a second
# time would turn every entry into NaN, because 1 and 0 are not keys of the mapping.
binary_column = ['Stage_fear','Drained_after_socializing']
yes_no_codes = {'Yes': 1, 'No': 0}
for col in binary_column:
    # Series.map() silently yields NaN for any value missing from the mapping,
    # so reject unexpected labels up front (missing values are passed through).
    unexpected = set(df[col].dropna().unique()) - set(yes_no_codes)
    if unexpected:
        raise ValueError(
            f"Unexpected values in column {col!r}: {sorted(map(repr, unexpected))}"
        )
    df_processed[col] = df[col].map(yes_no_codes)

## step 4: encode the target column 'Personality'
### 'Introvert' = 1, 'Extrovert' = 0 (LabelEncoder assigns the codes in alphabetical order)

In [13]:
# Encode the target labels as integers.
# LabelEncoder numbers the classes in sorted order, which for this dataset
# gives 'Extrovert' -> 0 and 'Introvert' -> 1.
label_encoder = LabelEncoder()
# Fit on the raw labels in `df` rather than on `df_processed`: re-running this
# cell would otherwise refit the encoder on the already-encoded 0/1 values and
# `label_encoder.classes_` would lose the original string labels.
df_processed['Personality'] = label_encoder.fit_transform(df['Personality'])

print(df_processed.head())

   Time_spent_Alone  Stage_fear  ...  Post_frequency  Personality
0               4.0           0  ...             5.0            0
1               9.0           1  ...             3.0            1
2               9.0           1  ...             2.0            1
3               0.0           0  ...             8.0            0
4               3.0           0  ...             5.0            0

[5 rows x 8 columns]


# TRAIN THE MODEL 

In [14]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix


In [16]:
# Features are every column except the encoded target.
X = df_processed.drop(columns='Personality')
y = df_processed['Personality']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# fit() returns the estimator itself, so construction and training can be chained.
model = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Predict on the held-out rows and score the predictions against the true labels.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
matrix = confusion_matrix(y_test, y_pred)

print("ACCURACY: ", accuracy)
print("\n CLASSIFICATION REPORT: \n", report)
print("\n CONFUSION MATRIX: \n", matrix)

ACCURACY:  0.9241379310344827

 CLASSIFICATION REPORT: 
               precision    recall  f1-score   support

           0       0.94      0.92      0.93       302
           1       0.91      0.93      0.92       278

    accuracy                           0.92       580
   macro avg       0.92      0.92      0.92       580
weighted avg       0.92      0.92      0.92       580


 CONFUSION MATRIX: 
 [[277  25]
 [ 19 259]]


In [17]:
import joblib

# Persist the fitted classifier to disk. Only the model is saved here, not the
# label encoder. joblib.dump returns the list of written file names, which the
# notebook displays as the cell output.
joblib.dump(value=model, filename='personality_predictor.pkl')

['personality_predictor.pkl']