In [202]:
import os
from zipfile import ZipFile
import json
import numpy as np
import pandas as pd

In [203]:
kaggle_credentials = json.load(open("kaggle.json"))

In [204]:
os.environ["KAGGLE_USERNAME"] = kaggle_credentials["username"]
os.environ["KAGGLE_KEY"] = kaggle_credentials["key"]

In [205]:
!kaggle competitions download -c playground-series-s5e7

playground-series-s5e7.zip: Skipping, found more recently modified local copy (use --force to force download)


In [206]:
with ZipFile("playground-series-s5e7.zip", "r") as zipObj:
    zipObj.extractall()

In [207]:
data = pd.read_csv("train.csv")

In [208]:
data.head()

Unnamed: 0,id,Time_spent_Alone,Stage_fear,Social_event_attendance,Going_outside,Drained_after_socializing,Friends_circle_size,Post_frequency,Personality
0,0,0.0,No,6.0,4.0,No,15.0,5.0,Extrovert
1,1,1.0,No,7.0,3.0,No,10.0,8.0,Extrovert
2,2,6.0,Yes,1.0,0.0,,3.0,0.0,Introvert
3,3,3.0,No,7.0,3.0,No,11.0,5.0,Extrovert
4,4,1.0,No,4.0,4.0,No,13.0,,Extrovert


In [209]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18524 entries, 0 to 18523
Data columns (total 9 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   id                         18524 non-null  int64  
 1   Time_spent_Alone           17334 non-null  float64
 2   Stage_fear                 16631 non-null  object 
 3   Social_event_attendance    17344 non-null  float64
 4   Going_outside              17058 non-null  float64
 5   Drained_after_socializing  17375 non-null  object 
 6   Friends_circle_size        17470 non-null  float64
 7   Post_frequency             17260 non-null  float64
 8   Personality                18524 non-null  object 
dtypes: float64(5), int64(1), object(3)
memory usage: 1.3+ MB


In [210]:
data["Stage_fear"].value_counts()

Unnamed: 0_level_0,count
Stage_fear,Unnamed: 1_level_1
No,12609
Yes,4022


In [211]:
data["Drained_after_socializing"].value_counts()

Unnamed: 0_level_0,count
Drained_after_socializing,Unnamed: 1_level_1
No,13313
Yes,4062


In [212]:
data.isnull().sum()

Unnamed: 0,0
id,0
Time_spent_Alone,1190
Stage_fear,1893
Social_event_attendance,1180
Going_outside,1466
Drained_after_socializing,1149
Friends_circle_size,1054
Post_frequency,1264
Personality,0


#Data cleaning, handling missing values and preprocessing


In [213]:
#droping the id column
data.drop("id", axis=1, inplace=True)

In [214]:
#handling the missing value of numerical column
numerical_cols = [
    'Time_spent_Alone',
    'Social_event_attendance',
    'Going_outside',
    'Friends_circle_size',
    'Post_frequency']

#Using Median Imputation for missing values
for col in numerical_cols:
    data[col].fillna(data[col].median(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[col].fillna(data[col].median(), inplace=True)


In [215]:
#handling the missing value of categorical column
categorical_cols = [
    'Stage_fear',
    'Drained_after_socializing'
]

#Use Mode Imputation or "Unknown"
for col in categorical_cols:
    data[col].fillna(data[col].mode()[0], inplace=True)
    # or: df[col].fillna("Unknown", inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[col].fillna(data[col].mode()[0], inplace=True)


In [216]:
data.isnull().sum()

Unnamed: 0,0
Time_spent_Alone,0
Stage_fear,0
Social_event_attendance,0
Going_outside,0
Drained_after_socializing,0
Friends_circle_size,0
Post_frequency,0
Personality,0


In [217]:
#Encoding Categorical Features like yes and no
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
data['Stage_fear'] = le.fit_transform(data['Stage_fear'])
data['Drained_after_socializing'] = le.fit_transform(data['Drained_after_socializing'])

In [218]:
#encoding target column
data['Personality'] = data['Personality'].map({'Introvert': 0, 'Extrovert': 1})

In [219]:
data.head()

Unnamed: 0,Time_spent_Alone,Stage_fear,Social_event_attendance,Going_outside,Drained_after_socializing,Friends_circle_size,Post_frequency,Personality
0,0.0,0,6.0,4.0,0,15.0,5.0,1
1,1.0,0,7.0,3.0,0,10.0,8.0,1
2,6.0,1,1.0,0.0,0,3.0,0.0,0
3,3.0,0,7.0,3.0,0,11.0,5.0,1
4,1.0,0,4.0,4.0,0,13.0,5.0,1


#Train-Test Split Code

In [220]:
from sklearn.model_selection import train_test_split

# Step 1: Separate features and target
X = data.drop('Personality', axis=1)   # Features
y = data['Personality']                # Target

# Step 2: Train-test split (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Optional: Check the shape
print("Training set shape:", X_train.shape)
print("Test set shape:", X_test.shape)


Training set shape: (14819, 7)
Test set shape: (3705, 7)


#randomforest model

In [221]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Step 1: Initialize model
#rf = RandomForestClassifier(n_estimators=100, random_state=42)

# Step 2: Fit the model
#rf.fit(X_train, y_train)

# Step 3: Predict
#y_pred = rf.predict(X_test)

# Step 4: Evaluate
#accuracy = accuracy_score(y_test, y_pred)
#print("Random Forest Accuracy:", accuracy)**

#using xgboost model

In [222]:
from xgboost import XGBClassifier

#xgb = XGBClassifier(random_state=42, n_estimators=100)
#xgb.fit(X_train, y_train)
#y_pred = xgb.predict(X_test)
#accuracy = accuracy_score(y_test, y_pred)
#print("Xgboost Accuracy:", accuracy)

In [223]:
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression

ensemble = VotingClassifier(
    estimators=[
        ('rf', RandomForestClassifier(n_estimators=100)),
        ('xgb', XGBClassifier(n_estimators=100)),
        ('lr', LogisticRegression())
    ],
    voting='soft'
)

ensemble.fit(X_train, y_train)
y_pred = ensemble.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Xgboost Accuracy:", accuracy)

Xgboost Accuracy: 0.964642375168691


#Checking for test dataset

In [224]:
test_df = pd.read_csv("test.csv")
test_df.head()

Unnamed: 0,id,Time_spent_Alone,Stage_fear,Social_event_attendance,Going_outside,Drained_after_socializing,Friends_circle_size,Post_frequency
0,18524,3.0,No,7.0,4.0,No,6.0,
1,18525,,Yes,0.0,0.0,Yes,5.0,1.0
2,18526,3.0,No,5.0,6.0,No,15.0,9.0
3,18527,3.0,No,4.0,4.0,No,5.0,6.0
4,18528,9.0,Yes,1.0,2.0,Yes,1.0,1.0


In [225]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6175 entries, 0 to 6174
Data columns (total 8 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   id                         6175 non-null   int64  
 1   Time_spent_Alone           5750 non-null   float64
 2   Stage_fear                 5577 non-null   object 
 3   Social_event_attendance    5778 non-null   float64
 4   Going_outside              5709 non-null   float64
 5   Drained_after_socializing  5743 non-null   object 
 6   Friends_circle_size        5825 non-null   float64
 7   Post_frequency             5767 non-null   float64
dtypes: float64(5), int64(1), object(2)
memory usage: 386.1+ KB


In [226]:
test_df.isnull().sum()

Unnamed: 0,0
id,0
Time_spent_Alone,425
Stage_fear,598
Social_event_attendance,397
Going_outside,466
Drained_after_socializing,432
Friends_circle_size,350
Post_frequency,408


In [227]:
#Store 'id' for final output
submission_ids = test_df['id']

In [228]:
# Step 3: Preprocess test data (same as training)
# Drop 'id'
test_df = test_df.drop(columns=['id'])

# Fill missing numerical values
numerical_cols = [
    'Time_spent_Alone', 'Social_event_attendance',
    'Going_outside', 'Friends_circle_size', 'Post_frequency'
]
for col in numerical_cols:
    test_df[col] = test_df[col].fillna(test_df[col].median())

# Fill missing categorical values
categorical_cols = ['Stage_fear', 'Drained_after_socializing']
for col in categorical_cols:
    test_df[col] = test_df[col].fillna(test_df[col].mode()[0])

In [229]:
test_df['Stage_fear'] = le.fit_transform(test_df['Stage_fear'])
test_df['Drained_after_socializing'] = le.fit_transform(test_df['Drained_after_socializing'])

In [230]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6175 entries, 0 to 6174
Data columns (total 7 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Time_spent_Alone           6175 non-null   float64
 1   Stage_fear                 6175 non-null   int64  
 2   Social_event_attendance    6175 non-null   float64
 3   Going_outside              6175 non-null   float64
 4   Drained_after_socializing  6175 non-null   int64  
 5   Friends_circle_size        6175 non-null   float64
 6   Post_frequency             6175 non-null   float64
dtypes: float64(5), int64(2)
memory usage: 337.8 KB


In [231]:
# Step 3: Predict
y_pred = ensemble.predict(test_df)

In [232]:
submission = pd.DataFrame({
    'id': submission_ids,
    'Personality': ['Introvert' if label == 0 else 'Extrovert' for label in y_pred]
})

In [233]:
submission.head()

Unnamed: 0,id,Personality
0,18524,Extrovert
1,18525,Introvert
2,18526,Extrovert
3,18527,Extrovert
4,18528,Introvert


In [234]:
#Save to CSV
#submission.to_csv("submission.csv", index=False)
print("✅ Submission file saved as submission.csv")

✅ Submission file saved as submission.csv
