<a href="https://colab.research.google.com/github/inspire007/KaggleCompetitions/blob/main/Predict_the_Introverts_from_the_Extroverts/Predict_the_Introverts_from_the_Extroverts.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [28]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import make_column_selector
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

# Load the training and test sets from the working directory.
data = pd.read_csv('train.csv')
data_test = pd.read_csv('test.csv')

# Features: drop the first column of both frames (presumably the `id` column
# that is read back by name when the submission is written -- confirm) and,
# for the training frame, the last column, which holds the target labels.
X = data.iloc[:, 1:-1]
X_test = data_test.iloc[:, 1:]

# Encode the string target as integers; `le` is kept so predictions can be
# mapped back to the original labels later.
le = LabelEncoder()
y = le.fit_transform(data.iloc[:, -1].values)

# Column groups are selected by dtype at fit time.
num_cols = make_column_selector(dtype_include='number')
cat_cols = make_column_selector(dtype_include='object')

# Numeric columns: mean-impute missing values, then standardise.
num_pipe = Pipeline([
    ('num', SimpleImputer(strategy='mean')),
    ('std', StandardScaler()),
])

# Categorical columns: mode-impute, then one-hot encode.
# handle_unknown='ignore' makes a category that appears only in test.csv
# encode as all zeros instead of raising during `ct.transform(X_test)`.
cat_pipe = Pipeline([
    ('cat', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

# Any column that is neither numeric nor object dtype is passed through as-is.
ct = ColumnTransformer([
    ('num_c', num_pipe, num_cols),
    ('cat_c', cat_pipe, cat_cols),
], remainder='passthrough')

# NOTE: the transformer is fit on the *entire* training set, so the imputation
# and scaling statistics are shared across the folds of any cross-validation
# run on `X` afterwards (mild preprocessing leakage in the CV scores).
X = ct.fit_transform(X)
X_test = ct.transform(X_test)

# Candidate classifiers to compare; apart from the explicit RBF kernel on the
# SVC, every estimator is constructed with its default arguments.
models = dict(
    SVC=SVC(kernel='rbf'),
    randomF=RandomForestClassifier(),
    tree=DecisionTreeClassifier(),
    gnb=GaussianNB(),
    logR=LogisticRegression(),
    xgb=XGBClassifier(),
)

# Running best: highest mean CV score seen so far and the estimator that
# produced it (`False` until some candidate scores above 0).
max_acc = 0
selected_model = False

# Score each candidate with 10-fold cross-validation, report the mean and
# standard deviation across folds, and keep the one with the strictly highest
# mean (so the earliest candidate wins a tie).
for label in models:
    candidate = models[label]
    fold_scores = cross_val_score(candidate, X, y, cv=10)
    avg, spread = fold_scores.mean(), fold_scores.std()
    print(f'Model {label}: Mean: {avg}, Std: {spread}\n')
    if avg > max_acc:
        max_acc, selected_model = avg, candidate

print(f'Model selected based on accuracy: {selected_model}\n')

# Hyperparameter tuning.
# Only the regularisation strength C of LogisticRegression is searched, over
# np.arange(0.25, 1, 0.25). (A disabled SVC grid over kernel / C / gamma used
# to be kept here as commented-out code.)
c_grid = list(np.arange(0.25, 1, 0.25))
params = [{'C': c_grid}]

# 3-fold grid search scored on accuracy, using all available cores.
gcv = GridSearchCV(
    LogisticRegression(),
    params,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
).fit(X, y)

best_acc, best_params = gcv.best_score_, gcv.best_params_

print(f'Best accuracy based on GridSearch: {best_acc}\n')
print(f'Best params: {best_params}\n')

# Train the final model on the full training set using the hyperparameters
# chosen by the grid search (`best_params`), rather than a value copied by
# hand from an earlier run, so the two can never drift apart.
model = LogisticRegression(**best_params)
model.fit(X, y)

# Predict on the test set and map the integer classes back to the original
# string labels.
y_predict = le.inverse_transform(model.predict(X_test))
pid = data_test['id']

# Write the submission as `id,Personality` rows. DataFrame.to_csv handles the
# mixed integer/string columns directly, without depending on the dtype that
# stacking them into a single NumPy array would produce.
pd.DataFrame({'id': pid, 'Personality': y_predict}).to_csv('output.csv', index=False)

Model SVC: Mean: 0.9685271039083199, Std: 0.0037350018442881042

Model randomF: Mean: 0.966583813068295, Std: 0.003626276795830906

Model tree: Mean: 0.9339229828694113, Std: 0.004703715102801197

Model gnb: Mean: 0.9685270456291182, Std: 0.0036486232524324414

Model logR: Mean: 0.9686351244086119, Std: 0.0037919606873287374

Model xgb: Mean: 0.9677171395635356, Std: 0.003857341767865289

Model selected based on accuracy: LogisticRegression()

Best accuracy based on GridSearch: 0.9687433829646505

Best params: {'C': np.float64(0.75)}

