In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
# Read the competition's train and test splits, then preview the first rows of each.
train_path = '/kaggle/input/playground-series-s5e7/train.csv'
test_path = '/kaggle/input/playground-series-s5e7/test.csv'
df_train = pd.read_csv(train_path)
df_test = pd.read_csv(test_path)

for frame in (df_train, df_test):
    print(frame.head())

Initial Exploratory Data Analysis (EDA) on df_train to understand its structure and contents

In [None]:
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would emit an extra "None" line.
df_train.info()

In [None]:
# Preview the first five training rows as plain text.
print(df_train.head(n=5))

In [None]:
# Summary statistics of df_train; as the cell's last expression the result is
# rendered as the cell output.
df_train.describe()

In [None]:
# Count missing values per column (isna is the same check as isnull).
print(df_train.isna().sum())

In [None]:
# Names of the numeric columns to impute. Later cells only iterate over this
# to get column names, so a plain list is used instead of copying a slice of
# df_train (a copy would also go stale once df_train is imputed).
cols_nos = [
    'Time_spent_Alone',
    'Social_event_attendance',
    'Going_outside',
    'Friends_circle_size',
    'Post_frequency',
]

In [None]:
# Mean of the observed values only; missing entries are skipped (pandas default).
mean_time_spent_alone = df_train['Time_spent_Alone'].mean(skipna=True)

In [None]:
# Fill the gaps in Time_spent_Alone with the mean computed in the previous cell.
df_train['Time_spent_Alone'] = df_train['Time_spent_Alone'].fillna(mean_time_spent_alone)
# Series.info() prints its own summary and returns None, so it is not wrapped
# in print() (which would add a stray "None" line).
df_train['Time_spent_Alone'].info()

In [None]:
# info() prints the summary itself and returns None; calling it bare avoids an
# extra "None" line in the output.
df_train.info()

In [None]:
# Record each numeric column's training mean (kept for reuse on the test set),
# then fill that column's missing values with it. Each column's mean depends
# only on that column, so computing all means first gives the same values.
imputation_means = {col: df_train[col].mean() for col in cols_nos}
for col, col_mean in imputation_means.items():
    df_train[col] = df_train[col].fillna(col_mean)

In [None]:
# Show the stored training means, then the frame summary after imputation.
print("Calculated Means for Imputation:")
print(imputation_means)

print("\ndf_train.info() after numerical imputation:")
# info() prints its own report and returns None, so it is called without
# print() to avoid a trailing "None" line.
df_train.info()

In [None]:
# Names of the categorical columns to impute. Only the names are iterated
# later, so a list is enough; no copy of the DataFrame is needed.
cols_cat = ['Stage_fear', 'Drained_after_socializing']

In [None]:
# Most frequent value(s) of Stage_fear; mode() returns a Series.
df_train['Stage_fear'].mode()

In [None]:
# Record the first mode of each categorical column (kept for reuse on the test
# set), then fill that column's missing values with it.
imputation_modes = {col: df_train[col].mode()[0] for col in cols_cat}
for col, col_mode in imputation_modes.items():
    df_train[col] = df_train[col].fillna(col_mode)

In [None]:
# Show the stored training modes, then the frame summary after imputation.
print("Calculated Modes for Imputation:")
print(imputation_modes)

print("\ndf_train.info() after categorical imputation:")
# info() prints its own report and returns None, so it is called without
# print() to avoid a trailing "None" line.
df_train.info()

In [None]:
cols_to_clean_string = ['Stage_fear', 'Drained_after_socializing']

# Cast to text, trim surrounding whitespace and lower-case, so every spelling
# variant of a value collapses to one form.
for col in cols_to_clean_string:
    as_text = df_train[col].astype(str)
    df_train[col] = as_text.str.strip().str.lower()

# Report the distinct values and their frequencies after cleaning.
print("--- After String Hygiene ---")
for col in cols_to_clean_string:
    cleaned = df_train[col]
    print(f"\nColumn: {col}")
    print(cleaned.unique())
    print(cleaned.value_counts())

In [None]:
mapping = {'no': 0, 'yes': 1}
# Encode with Series.map so the result is an integer column on every pandas
# version. DataFrame.replace on object columns only produces integers through
# a silent downcast, which pandas 2.2 deprecates.
# Any value missing from `mapping` becomes NaN.
for col in cols_to_clean_string:
    # Skip columns that are already numeric so re-running this cell does not
    # turn the encoded 0/1 values into NaN.
    if not pd.api.types.is_numeric_dtype(df_train[col]):
        df_train[col] = df_train[col].map(mapping)
print(df_train[cols_to_clean_string].head())

In [None]:
# info() reports to stdout and returns None; no print() wrapper, so no stray
# "None" line follows the summary.
df_train.info()

In [None]:
# Distinct values currently present in the target column.
df_train['Personality'].unique()

In [None]:
# Encode the target as 1 (extrovert) / 0 (introvert).
# The .str methods return a new Series, so the cleaned labels have to be kept
# in a variable; the mapping keys are lower-case to match the cleaned text.
mapping = {'extrovert': 1, 'introvert': 0}
# Skip the work when the column is already numeric, so re-running the cell
# leaves the encoded labels untouched.
if not pd.api.types.is_numeric_dtype(df_train['Personality']):
    personality_clean = df_train['Personality'].astype(str).str.strip().str.lower()
    personality_encoded = personality_clean.map(mapping)
    # Validate before assigning so a bad label never overwrites the raw column.
    if personality_encoded.isna().any():
        unexpected = sorted(set(personality_clean[personality_encoded.isna()]))
        raise ValueError(f"Unexpected Personality label(s): {unexpected}")
    df_train['Personality'] = personality_encoded

In [None]:
# Only the last expression of a cell is echoed, so the unique values are
# printed explicitly; otherwise they would never be shown.
print(df_train['Personality'].unique())
# info() prints its own report and returns None, so it is called bare.
df_train.info()

In [None]:
# Column labels of df_train, shown as the cell output.
df_train.columns

In [None]:
# Features are every column except the row identifier and the target;
# the target is the Personality column.
non_feature_cols = ['id', 'Personality']
X = df_train.drop(columns=non_feature_cols)
y = df_train['Personality']

In [None]:
# Show which columns ended up in the feature matrix.
feature_columns = X.columns
print(feature_columns)

In [None]:
from sklearn.linear_model import LogisticRegression


In [None]:
# Only random_state is set; no other constructor argument is passed.
model_params = {'random_state': 16}
model = LogisticRegression(**model_params)

In [None]:
# Fit on the training features and target built above.
model.fit(X=X, y=y)

In [None]:
# Only a cell's final expression is echoed, so each result is printed
# explicitly; a bare head() in the middle of the cell would be discarded.
print(df_test.head())

# info() prints its own report (and returns None), so it needs no print().
df_test.info()

print(df_test.isnull().sum())

In [None]:
# Names of the numeric columns to impute in df_test. The next cell only
# iterates over the names, so a list is used instead of a DataFrame copy.
cols_nos = [
    'Time_spent_Alone',
    'Social_event_attendance',
    'Going_outside',
    'Friends_circle_size',
    'Post_frequency',
]

In [None]:
# Fill test-set gaps with the means stored from the training data, so no
# statistic is computed from the test set.
for col in cols_nos:
    df_test[col] = df_test[col].fillna(imputation_means[col])

In [None]:
# info() writes the summary itself and returns None; calling it bare avoids
# printing an extra "None".
df_test.info()

In [None]:
# Names of the categorical columns to impute in df_test. Only the names are
# needed, so this is a plain list and not a slice of df_train.
cols_cat = ['Stage_fear', 'Drained_after_socializing']

In [None]:
# Fill test-set gaps with the modes stored from the training data.
for col in cols_cat:
    train_mode = imputation_modes[col]
    df_test[col] = df_test[col].fillna(value=train_mode)

In [None]:
# info() prints its own report and returns None, so it is not wrapped in
# print() (which would add a "None" line).
df_test.info()

In [None]:
cols_to_clean_string = ['Stage_fear', 'Drained_after_socializing']

# Apply to df_test the same text normalisation used on the training columns:
# cast to text, trim whitespace, lower-case.
for col in cols_to_clean_string:
    as_text = df_test[col].astype(str)
    df_test[col] = as_text.str.strip().str.lower()

In [None]:
# Report the distinct values and their frequencies in the cleaned test columns.
print("--- After String Hygiene ---")
for col in cols_to_clean_string:
    cleaned = df_test[col]
    print(f"\nColumn: {col}")
    print(cleaned.unique())
    print(cleaned.value_counts())

In [None]:
# Preview the cleaned text columns before they are encoded.
df_test.loc[:, cols_to_clean_string].head()

In [None]:
mapping = {'no': 0, 'yes': 1}
# Encode with Series.map so the result is an integer column on every pandas
# version. DataFrame.replace on object columns only produces integers through
# a silent downcast, which pandas 2.2 deprecates.
# Any value missing from `mapping` becomes NaN.
for col in cols_to_clean_string:
    # Skip columns that are already numeric so re-running this cell does not
    # turn the encoded 0/1 values into NaN.
    if not pd.api.types.is_numeric_dtype(df_test[col]):
        df_test[col] = df_test[col].map(mapping)
print(df_test[cols_to_clean_string].head())

In [None]:
# Preview the same columns again after encoding.
df_test.loc[:, cols_to_clean_string].head()

In [None]:
# info() emits the summary itself and returns None; no print() wrapper, so no
# stray "None" line.
df_test.info()

In [None]:
# Select exactly the training feature columns, in the training order.
X_test = df_test.loc[:, X.columns]

In [None]:
# Predict numeric class labels for the test features.
test_predictions_numeric = model.predict(X_test)

# Announce success and show the first five predictions on the following line.
first_five = test_predictions_numeric[:5]
print("Predictions made successfully! Here are the first 5 predictions (numeric):", first_five, sep="\n")

In [None]:
# Print the numeric predictions returned by model.predict.
print(test_predictions_numeric)

In [None]:
# Lookup from the numeric class code back to its text label.
reverse_map = {
    0: 'Introvert',
    1: 'Extrovert',
}

In [None]:
# Turn numeric predictions back into text labels. The Series takes df_test's
# index so each label stays matched to its own row (and id) when it is later
# combined with df_test columns, whatever index df_test carries.
s = pd.Series(test_predictions_numeric, index=df_test.index).map(reverse_map)

In [None]:
# Print the predictions after mapping them through reverse_map.
print(s)

In [None]:
# Assemble the submission table: the test ids next to the predicted labels.
submission_columns = {'id': df_test['id'], 'Personality': s}
submission_df = pd.DataFrame(submission_columns)

In [None]:
# Preview the first five submission rows as plain text.
print(submission_df.head(n=5))

In [None]:
# Write the submission file; index=False keeps the row index out of the CSV.
submission_df.to_csv(path_or_buf='submission.csv', index=False)