In [1]:
from unicodedata import category
import numpy as np
import pandas as pd
import ipywidgets
from ydata_profiling import ProfileReport
import seaborn as sns
import xgboost as xgb
from category_encoders import TargetEncoder
from sklearn import datasets, linear_model, metrics, model_selection, svm
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score, cross_val_predict
from catboost import CatBoostClassifier
from ydata_profiling.model.dataframe import preprocess

# Data overview

## Data load

In [2]:
# Labelled training rows and the unlabelled rows to predict, both read from
# the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='train.csv')
df_test = pd.read_csv(filepath_or_buffer='test.csv')

## Data snapshot

In [3]:
# Column dtypes and non-null counts of the training frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18524 entries, 0 to 18523
Data columns (total 9 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   id                         18524 non-null  int64  
 1   Time_spent_Alone           17334 non-null  float64
 2   Stage_fear                 16631 non-null  object 
 3   Social_event_attendance    17344 non-null  float64
 4   Going_outside              17058 non-null  float64
 5   Drained_after_socializing  17374 non-null  object 
 6   Friends_circle_size        17470 non-null  float64
 7   Post_frequency             17260 non-null  float64
 8   Personality                18524 non-null  object 
dtypes: float64(5), int64(1), object(3)
memory usage: 1.3+ MB


In [4]:
# Preview of the first rows of the training frame.
df.head()

Unnamed: 0,id,Time_spent_Alone,Stage_fear,Social_event_attendance,Going_outside,Drained_after_socializing,Friends_circle_size,Post_frequency,Personality
0,0,0.0,No,6.0,4.0,No,15.0,5.0,Extrovert
1,1,1.0,No,7.0,3.0,No,10.0,8.0,Extrovert
2,2,6.0,Yes,1.0,0.0,,3.0,0.0,Introvert
3,3,3.0,No,7.0,3.0,No,11.0,5.0,Extrovert
4,4,1.0,No,4.0,4.0,No,13.0,,Extrovert


In [5]:
# Summary statistics of the training frame's numeric columns.
df.describe()

Unnamed: 0,id,Time_spent_Alone,Social_event_attendance,Going_outside,Friends_circle_size,Post_frequency
count,18524.0,17334.0,17344.0,17058.0,17470.0,17260.0
mean,9261.5,3.137764,5.265106,4.044319,7.996737,4.982097
std,5347.562529,3.003786,2.753359,2.06258,4.223484,2.879139
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,4630.75,1.0,3.0,3.0,5.0,3.0
50%,9261.5,2.0,5.0,4.0,8.0,5.0
75%,13892.25,4.0,8.0,6.0,12.0,7.0
max,18523.0,11.0,10.0,7.0,15.0,10.0


## Data report

In [6]:
# Optional and currently disabled: uncomment both lines to build a
# ProfileReport of the training frame and write it to report.html.
#profile = ProfileReport(df, title="Report", explorative=True)
#profile.to_file("report.html")

## Train and Test

In [7]:
# Features are every column between the leading id and the trailing target.
X = df.iloc[:, 1:-1]
y = df['Personality']

# Seeded 80/20 hold-out split; none of the other cells visible in this
# notebook reference these four names.
(
    X_train_random,
    X_test_random,
    y_train_random,
    y_test_random,
) = train_test_split(X, y, test_size=0.2, random_state=42)

# The model is fitted on all labelled rows; X_test is the test file without
# its first column.
X_train, y_train = X, y
X_test = df_test.iloc[:, 1:]
X_test.head()



Unnamed: 0,Time_spent_Alone,Stage_fear,Social_event_attendance,Going_outside,Drained_after_socializing,Friends_circle_size,Post_frequency
0,3.0,No,7.0,4.0,No,6.0,
1,,Yes,0.0,0.0,Yes,5.0,1.0
2,3.0,No,5.0,6.0,No,15.0,9.0
3,3.0,No,4.0,4.0,No,5.0,6.0
4,9.0,Yes,1.0,2.0,Yes,1.0,1.0


# Data Preprocessing

## Pipelines for preprocessing


In [8]:
# Categorical columns: impute with the most frequent value, then one-hot encode.
cat_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder()),
])

# Numeric columns: impute with the mean, then standardise.
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# One-hot encoding pipeline, presumably meant for the target (going by its
# name); no other cell visible in this notebook references it.
y_pipeline = Pipeline(steps=[('encoder', OneHotEncoder())])


## Feature types

In [9]:
# Columns handled by the categorical preprocessing pipeline.
cat_features = [
    'Drained_after_socializing',
    'Stage_fear',
]

# Columns handled by the numeric preprocessing pipeline.
num_features = [
    'Time_spent_Alone',
    'Social_event_attendance',
    'Going_outside',
    'Friends_circle_size',
    'Post_frequency',
]

## Preprocessor

In [10]:
# Send each feature group through its own preprocessing pipeline.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', cat_pipeline, cat_features),
        ('num', num_pipeline, num_features),
    ]
)

# Pipeline

In [11]:
# Full model: preprocessing followed by a linear-kernel SVC with balanced
# class weights.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', svm.SVC(kernel='linear', class_weight='balanced')),
])


## GridSearch

In [None]:
# Candidate values of C for the pipeline's 'classifier' step.
param_grid = {'classifier__C': [0.01, 0.1, 1, 10]}

# 5-fold cross-validated search scored on accuracy (n_jobs=-1).
grid = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)
grid.fit(X_train, y_train)


# Metrics

In [198]:
# 5-fold cross-validated predictions of the best estimator found by the grid
# search, compared against the true labels of the full training frame.
crv = cross_val_predict(estimator=grid.best_estimator_, X=X, y=y, cv=5)
print(classification_report(y_true=y, y_pred=crv))

              precision    recall  f1-score   support

   Extrovert       0.98      0.98      0.98     13699
   Introvert       0.94      0.93      0.94      4825

    accuracy                           0.97     18524
   macro avg       0.96      0.96      0.96     18524
weighted avg       0.97      0.97      0.97     18524



# Submission

In [199]:
# Predict the test rows and write the two-column submission file.
#
# The identifier is taken from df_test's first column (the column excluded
# from X_test; presumably the id, mirroring train.csv - confirm), so every
# prediction stays paired with the row it was computed from. This avoids
# assigning predictions positionally onto sample_submission.csv, which only
# works when that file has the same length and row order as test.csv.
test_predictions = grid.best_estimator_.predict(X_test)
df_submission = pd.DataFrame({
    'id': df_test.iloc[:, 0].to_numpy(),
    'Personality': test_predictions,
})
df_submission.to_csv('submission.csv', index=False)