### Feature: Physical Health and Fitness + Demographics + Behavior

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline


In [2]:
import os

# Kaggle input locations of the competition's training and test tables
train_path = '/kaggle/input/child-mind-institute-problematic-internet-use/train.csv'
test_path = '/kaggle/input/child-mind-institute-problematic-internet-use/test.csv'

# Fail immediately with a clear message: every later cell needs both frames,
# so carrying on after a missing file would only surface as a NameError below.
missing_paths = [path for path in (train_path, test_path) if not os.path.exists(path)]
if missing_paths:
    raise FileNotFoundError(f"Input file(s) not found: {missing_paths}")

train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)
    


## 3. Data Preprocessing

### Process PCIAT_Total score

In [3]:
# Columns that exist only in the training table
train_cols = set(train_df.columns)
test_cols = set(test_df.columns)
columns_not_in_test = sorted(train_cols.difference(test_cols))

# Of those, everything except the total, the season and the target is kept
# as an individual questionnaire column
columns_to_exclude = ['PCIAT-PCIAT_Total', 'PCIAT-Season', 'sii']
question_columns = list(
    filter(lambda name: name not in columns_to_exclude, columns_not_in_test)
)

question_columns

['PCIAT-PCIAT_01',
 'PCIAT-PCIAT_02',
 'PCIAT-PCIAT_03',
 'PCIAT-PCIAT_04',
 'PCIAT-PCIAT_05',
 'PCIAT-PCIAT_06',
 'PCIAT-PCIAT_07',
 'PCIAT-PCIAT_08',
 'PCIAT-PCIAT_09',
 'PCIAT-PCIAT_10',
 'PCIAT-PCIAT_11',
 'PCIAT-PCIAT_12',
 'PCIAT-PCIAT_13',
 'PCIAT-PCIAT_14',
 'PCIAT-PCIAT_15',
 'PCIAT-PCIAT_16',
 'PCIAT-PCIAT_17',
 'PCIAT-PCIAT_18',
 'PCIAT-PCIAT_19',
 'PCIAT-PCIAT_20']

#### Hàm tính toán lại điểm sii theo PCIAT-Total

In [10]:
def recalculate_sii(row, question_cols=None):
    """Recompute the severity label of one row from its PCIAT total.

    The label is returned only when it would stay the same even if every
    unanswered question had been given the highest score (5 points each);
    otherwise the row is treated as undecidable.

    Parameters
    ----------
    row : pandas.Series
        One record containing 'PCIAT-PCIAT_Total' and the question columns.
    question_cols : list of str, optional
        Names of the individual question columns. Defaults to the
        module-level ``question_columns`` list.

    Returns
    -------
    int or float
        0, 1, 2 or 3 when the label is certain, ``np.nan`` when the total is
        missing or unanswered questions could move the row to a higher band.
    """
    total = row['PCIAT-PCIAT_Total']
    if pd.isna(total):
        return np.nan

    cols = question_columns if question_cols is None else question_cols
    # Highest total the row could reach if every missing answer scored 5
    max_possible = total + row[cols].isna().sum() * 5

    # (lowest total, highest total, label) for the three bounded bands
    bands = ((float('-inf'), 30, 0), (31, 49, 1), (50, 79, 2))
    for low, high, severity in bands:
        if low <= total <= high:
            # Certain only if the worst case still falls inside the same band
            return severity if max_possible <= high else np.nan

    # The top band has no upper limit, so missing answers cannot change it
    if total >= 80:
        return 3
    return np.nan

# Recompute the label row by row; rows for which recalculate_sii returns NaN
# (missing total or undecidable band) end up with a missing 'recalc_sii'.
train_df['recalc_sii'] = train_df.apply(recalculate_sii, axis=1)

In [11]:
# Labelled rows whose original 'sii' differs from the recomputed value
mismatch_rows = train_df.loc[
    train_df['sii'].notna() & train_df['recalc_sii'].ne(train_df['sii'])
]

# Show their answers, highlighting every missing cell in pink
mismatch_rows[[*question_columns, 'recalc_sii']].style.map(
    lambda cell: '' if not pd.isna(cell) else 'background-color: #FFC0CB'
)

Unnamed: 0,PCIAT-PCIAT_01,PCIAT-PCIAT_02,PCIAT-PCIAT_03,PCIAT-PCIAT_04,PCIAT-PCIAT_05,PCIAT-PCIAT_06,PCIAT-PCIAT_07,PCIAT-PCIAT_08,PCIAT-PCIAT_09,PCIAT-PCIAT_10,PCIAT-PCIAT_11,PCIAT-PCIAT_12,PCIAT-PCIAT_13,PCIAT-PCIAT_14,PCIAT-PCIAT_15,PCIAT-PCIAT_16,PCIAT-PCIAT_17,PCIAT-PCIAT_18,PCIAT-PCIAT_19,PCIAT-PCIAT_20,recalc_sii


In [12]:
# Replace the label with the recomputed one, then remove the rows where the
# two disagreed
train_df = (
    train_df
    .assign(sii=train_df['recalc_sii'])
    .drop(index=mismatch_rows.index)
)

train_df[[*columns_not_in_test, 'recalc_sii']]

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,PCIAT-PCIAT_01,PCIAT-PCIAT_02,PCIAT-PCIAT_03,PCIAT-PCIAT_04,PCIAT-PCIAT_05,PCIAT-PCIAT_06,PCIAT-PCIAT_07,PCIAT-PCIAT_08,PCIAT-PCIAT_09,PCIAT-PCIAT_10,...,PCIAT-PCIAT_15,PCIAT-PCIAT_16,PCIAT-PCIAT_17,PCIAT-PCIAT_18,PCIAT-PCIAT_19,PCIAT-PCIAT_20,PCIAT-PCIAT_Total,PCIAT-Season,sii,recalc_sii
0,5.0,4.0,4.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0,...,4.0,4.0,4.0,4.0,2.0,4.0,55.0,Fall,2.0,2.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Fall,0.0,0.0
2,5.0,2.0,2.0,1.0,2.0,1.0,1.0,2.0,1.0,1.0,...,1.0,0.0,2.0,2.0,1.0,1.0,28.0,Fall,0.0,0.0
3,4.0,2.0,4.0,0.0,5.0,1.0,0.0,3.0,2.0,2.0,...,0.0,3.0,4.0,3.0,4.0,1.0,44.0,Summer,1.0,1.0
4,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3955,3.0,3.0,3.0,2.0,3.0,2.0,2.0,2.0,2.0,1.0,...,1.0,0.0,2.0,1.0,1.0,0.0,32.0,Winter,1.0,1.0
3956,,,,,,,,,,,...,,,,,,,,,,
3957,5.0,5.0,3.0,0.0,5.0,1.0,0.0,2.0,0.0,2.0,...,0.0,0.0,1.0,1.0,0.0,1.0,31.0,Winter,1.0,1.0
3958,2.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,...,2.0,1.0,1.0,1.0,1.0,1.0,19.0,Spring,0.0,0.0


In [13]:
# Rows that currently have no severity label
na_total_rows = train_df.loc[train_df['sii'].isna()]
na_total_rows

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,id,Basic_Demos-Enroll_Season,Basic_Demos-Age,Basic_Demos-Sex,CGAS-Season,CGAS-CGAS_Score,Physical-Season,Physical-BMI,Physical-Height,Physical-Weight,...,PCIAT-PCIAT_19,PCIAT-PCIAT_20,PCIAT-PCIAT_Total,SDS-Season,SDS-SDS_Total_Raw,SDS-SDS_Total_T,PreInt_EduHx-Season,PreInt_EduHx-computerinternet_hoursday,sii,recalc_sii
4,0016bb22,Spring,18,1,Summer,,,,,,...,,,,,,,,,,
7,0068a485,Fall,10,1,,,Fall,16.861286,59.25,84.2,...,,,,,,,Fall,2.0,,
8,0069fbed,Summer,15,0,,,Spring,,,,...,,,,,,,Summer,2.0,,
9,0083e397,Summer,19,1,Summer,,,,,,...,,,,,,,,,,
10,0087dd65,Spring,11,1,,,,,,,...,,,,,,,Spring,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3943,fe7c87e2,Spring,13,0,Summer,,Summer,,,,...,,,,,,,,,,
3944,fe7f68a7,Spring,15,1,,,Spring,22.457960,62.00,122.8,...,,,,Spring,49.0,68.0,Spring,2.0,,
3950,ff0ab367,Spring,9,0,,,Spring,20.200490,52.50,79.2,...,,,,,,,Spring,0.0,,
3956,ffa9794a,Winter,10,0,,,Spring,18.764678,53.50,76.4,...,,,,,,,Winter,0.0,,


In [14]:
# Keep only the rows that have a PCIAT total
train_df = train_df[train_df['PCIAT-PCIAT_Total'].notna()]
train_df

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,id,Basic_Demos-Enroll_Season,Basic_Demos-Age,Basic_Demos-Sex,CGAS-Season,CGAS-CGAS_Score,Physical-Season,Physical-BMI,Physical-Height,Physical-Weight,...,PCIAT-PCIAT_19,PCIAT-PCIAT_20,PCIAT-PCIAT_Total,SDS-Season,SDS-SDS_Total_Raw,SDS-SDS_Total_T,PreInt_EduHx-Season,PreInt_EduHx-computerinternet_hoursday,sii,recalc_sii
0,00008ff9,Fall,5,0,Winter,51.0,Fall,16.877316,46.0,50.8,...,2.0,4.0,55.0,,,,Fall,3.0,2.0,2.0
1,000fd460,Summer,9,0,,,Fall,14.035590,48.0,46.0,...,0.0,0.0,0.0,Fall,46.0,64.0,Summer,0.0,0.0,0.0
2,00105258,Summer,10,1,Fall,71.0,Fall,16.648696,56.5,75.6,...,1.0,1.0,28.0,Fall,38.0,54.0,Summer,2.0,0.0,0.0
3,00115b9f,Winter,9,0,Fall,71.0,Summer,18.292347,56.0,81.6,...,4.0,1.0,44.0,Summer,31.0,45.0,Winter,0.0,1.0,1.0
5,001f3379,Spring,13,1,Winter,50.0,Summer,22.279952,59.5,112.2,...,2.0,1.0,34.0,Summer,40.0,56.0,Spring,0.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3953,ff6c2bb8,Fall,8,0,,,Fall,17.139810,52.5,67.2,...,2.0,1.0,22.0,Fall,41.0,58.0,Fall,2.0,0.0,0.0
3954,ff759544,Summer,7,1,,,Summer,13.927006,48.5,46.6,...,3.0,0.0,33.0,Summer,48.0,67.0,Summer,0.0,1.0,1.0
3955,ff8a2de4,Fall,13,0,Spring,60.0,Fall,16.362460,59.5,82.4,...,1.0,0.0,32.0,Winter,35.0,50.0,Fall,1.0,1.0,1.0
3957,ffcd4dbd,Fall,11,0,Spring,68.0,Winter,21.441500,60.0,109.8,...,0.0,1.0,31.0,Winter,56.0,77.0,Fall,0.0,1.0,1.0


In [15]:
# Fill unanswered questions with the most frequent answer of that question
for column in question_columns:
    answers = train_df[column]
    if not answers.isna().any():
        continue
    train_df[column] = answers.fillna(answers.mode()[0])

train_df[[*columns_not_in_test, 'recalc_sii']]

Unnamed: 0,PCIAT-PCIAT_01,PCIAT-PCIAT_02,PCIAT-PCIAT_03,PCIAT-PCIAT_04,PCIAT-PCIAT_05,PCIAT-PCIAT_06,PCIAT-PCIAT_07,PCIAT-PCIAT_08,PCIAT-PCIAT_09,PCIAT-PCIAT_10,...,PCIAT-PCIAT_15,PCIAT-PCIAT_16,PCIAT-PCIAT_17,PCIAT-PCIAT_18,PCIAT-PCIAT_19,PCIAT-PCIAT_20,PCIAT-PCIAT_Total,PCIAT-Season,sii,recalc_sii
0,5.0,4.0,4.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0,...,4.0,4.0,4.0,4.0,2.0,4.0,55.0,Fall,2.0,2.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Fall,0.0,0.0
2,5.0,2.0,2.0,1.0,2.0,1.0,1.0,2.0,1.0,1.0,...,1.0,0.0,2.0,2.0,1.0,1.0,28.0,Fall,0.0,0.0
3,4.0,2.0,4.0,0.0,5.0,1.0,0.0,3.0,2.0,2.0,...,0.0,3.0,4.0,3.0,4.0,1.0,44.0,Summer,1.0,1.0
5,3.0,3.0,3.0,0.0,2.0,1.0,0.0,2.0,2.0,1.0,...,2.0,1.0,3.0,1.0,2.0,1.0,34.0,Summer,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3953,3.0,3.0,3.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,...,0.0,3.0,0.0,2.0,2.0,1.0,22.0,Fall,0.0,0.0
3954,1.0,3.0,3.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,...,0.0,5.0,3.0,3.0,3.0,0.0,33.0,Summer,1.0,1.0
3955,3.0,3.0,3.0,2.0,3.0,2.0,2.0,2.0,2.0,1.0,...,1.0,0.0,2.0,1.0,1.0,0.0,32.0,Winter,1.0,1.0
3957,5.0,5.0,3.0,0.0,5.0,1.0,0.0,2.0,0.0,2.0,...,0.0,0.0,1.0,1.0,0.0,1.0,31.0,Winter,1.0,1.0


In [None]:
# The helper column has served its purpose ('sii' now holds the same values)
train_df = train_df.drop(columns='recalc_sii')

train_df[columns_not_in_test]

In [19]:
def convert_season_to_numeric(df, season_columns):
    """Encode season names as numbers, modifying ``df`` in place.

    'Spring', 'Summer', 'Fall' and 'Winter' become 0, 1, 2 and 3. Any other
    value in a text column, including a missing one, becomes NaN. Columns
    that are already numeric are left untouched, so calling the function
    again on an encoded frame does not wipe the codes.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the season columns; it is modified in place.
    season_columns : iterable of str
        Column names to encode. Names absent from ``df`` are skipped.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    # Ordinal code assigned to each season name
    season_mapping = {
        'Spring': 0,
        'Summer': 1,
        'Fall': 2,
        'Winter': 3
    }

    for col in season_columns:
        if col not in df.columns:
            continue

        # Show the distinct values before mapping
        print(f"Giá trị trước khi ánh xạ trong cột {col}:")
        print(df[col].unique())

        # Mapping a column that already holds numeric codes would look up the
        # numbers in a string-keyed dict and turn every value into NaN.
        if not pd.api.types.is_numeric_dtype(df[col]):
            df[col] = df[col].map(season_mapping)

        # Show the distinct values after mapping
        print(f"Giá trị sau khi ánh xạ trong cột {col}:")
        print(df[col].unique())

    return df

In [20]:
# Season columns shared by the training and test tables
season_columns = [
    'Basic_Demos-Enroll_Season',
    'CGAS-Season',
    'Physical-Season',
    'Fitness_Endurance-Season',
    'FGC-Season',
    'BIA-Season',
    'PAQ_A-Season',
    'PAQ_C-Season',
    'SDS-Season',
    'PreInt_EduHx-Season',
]

# Encode the test set, then preview the result
test_df = convert_season_to_numeric(test_df, season_columns)
print("Test DataFrame sau khi chuyển đổi:")
print(test_df[season_columns].head())

# Encode the training set, then preview the result
train_df = convert_season_to_numeric(train_df, season_columns)
print("Train DataFrame sau khi chuyển đổi:")
print(train_df[season_columns].head())

Giá trị trước khi ánh xạ trong cột Basic_Demos-Enroll_Season:
['Fall' 'Summer' 'Winter' 'Spring']
Giá trị sau khi ánh xạ trong cột Basic_Demos-Enroll_Season:
[2 1 3 0]
Giá trị trước khi ánh xạ trong cột CGAS-Season:
['Winter' nan 'Fall' 'Summer' 'Spring']
Giá trị sau khi ánh xạ trong cột CGAS-Season:
[ 3. nan  2.  1.  0.]
Giá trị trước khi ánh xạ trong cột Physical-Season:
['Fall' 'Summer' nan 'Spring' 'Winter']
Giá trị sau khi ánh xạ trong cột Physical-Season:
[ 2.  1. nan  0.  3.]
Giá trị trước khi ánh xạ trong cột Fitness_Endurance-Season:
[nan 'Fall' 'Summer' 'Spring']
Giá trị sau khi ánh xạ trong cột Fitness_Endurance-Season:
[nan  2.  1.  0.]
Giá trị trước khi ánh xạ trong cột FGC-Season:
['Fall' 'Summer' nan 'Spring' 'Winter']
Giá trị sau khi ánh xạ trong cột FGC-Season:
[ 2.  1. nan  0.  3.]
Giá trị trước khi ánh xạ trong cột BIA-Season:
['Fall' 'Winter' nan 'Summer']
Giá trị sau khi ánh xạ trong cột BIA-Season:
[ 2.  3. nan  1.]
Giá trị trước khi ánh xạ trong cột PAQ_A-Season:

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


In [21]:
# Predictors used by the models (every column here also exists in the test table)
feature_cols = [
    'Basic_Demos-Enroll_Season', 'Basic_Demos-Age', 'Basic_Demos-Sex', 'CGAS-Season', 
    'CGAS-CGAS_Score', 'Physical-Season', 'Physical-BMI', 'Physical-Height', 
    'Physical-Weight', 'Physical-Waist_Circumference', 'Physical-Diastolic_BP', 
    'Physical-HeartRate', 'Physical-Systolic_BP', 'Fitness_Endurance-Season', 
    'Fitness_Endurance-Max_Stage', 'Fitness_Endurance-Time_Mins', 'Fitness_Endurance-Time_Sec', 
    'FGC-Season', 'FGC-FGC_CU', 'FGC-FGC_CU_Zone', 'FGC-FGC_GSND', 'FGC-FGC_GSND_Zone', 
    'FGC-FGC_GSD', 'FGC-FGC_GSD_Zone', 'FGC-FGC_PU', 'FGC-FGC_PU_Zone', 'FGC-FGC_SRL', 
    'FGC-FGC_SRL_Zone', 'FGC-FGC_SRR', 'FGC-FGC_SRR_Zone', 'FGC-FGC_TL', 'FGC-FGC_TL_Zone', 
    'BIA-Season', 'BIA-BIA_Activity_Level_num', 'BIA-BIA_BMC', 'BIA-BIA_BMI', 'BIA-BIA_BMR', 
    'BIA-BIA_DEE', 'BIA-BIA_ECW', 'BIA-BIA_FFM', 'BIA-BIA_FFMI', 'BIA-BIA_FMI', 'BIA-BIA_Fat', 
    'BIA-BIA_Frame_num', 'BIA-BIA_ICW', 'BIA-BIA_LDM', 'BIA-BIA_LST', 'BIA-BIA_SMM', 'BIA-BIA_TBW', 
    'PAQ_A-Season', 'PAQ_A-PAQ_A_Total', 'PAQ_C-Season', 'PAQ_C-PAQ_C_Total', 'SDS-Season', 
    'SDS-SDS_Total_Raw', 'SDS-SDS_Total_T', 'PreInt_EduHx-Season', 
    'PreInt_EduHx-computerinternet_hoursday'
]

# Drop the rows whose target is missing
train_df = train_df.dropna(subset=['sii'])
X_raw = train_df[feature_cols]
y = train_df['sii']

# Preprocessing: fill gaps with the most frequent value, then standardize
num_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('scaler', StandardScaler())
])

preprocessor = ColumnTransformer(transformers=[
    ('num', num_transformer, feature_cols)
])

# NOTE(review): the preprocessor is fitted on every labelled row before any
# split, so the cross-validation folds below share imputation and scaling
# statistics.
# Keep the original row labels: without `index=` X would get a fresh
# 0..n-1 index while y keeps the gapped labels of train_df, leaving the two
# aligned only by position.
X = pd.DataFrame(
    preprocessor.fit_transform(X_raw),
    columns=feature_cols,
    index=X_raw.index,
)


In [23]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the labelled rows for validation. A fixed seed makes the
# split reproducible across runs, and stratifying keeps the proportion of
# each severity class the same in both parts.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [24]:
from sklearn.svm import LinearSVC, SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score


# Seed passed to every estimator below that is given a `random_state`
seed = 2023

# Baseline candidates to compare
models = [
    LinearSVC(random_state=seed, max_iter=10000),
    SVC(random_state=seed),
    KNeighborsClassifier(p=2, metric='minkowski'),
    LogisticRegression(max_iter=1000, solver='liblinear'),
    # Tree-based models that differ only in their class
    *(
        tree_model(random_state=seed)
        for tree_model in (
            DecisionTreeClassifier,
            RandomForestClassifier,
            ExtraTreesClassifier,
            AdaBoostClassifier,
        )
    ),
    XGBClassifier(random_state=seed, eval_metric='logloss'),
]

# Function to generate baseline results
def generate_baseline_results(models, X, y, metrics='accuracy', cv=5, plot_results=False):
    """Cross-validate each model and summarize its scores.

    Parameters
    ----------
    models : iterable
        Estimators to evaluate; each is identified by its class name.
    X, y
        Features and target passed to ``cross_val_score``.
    metrics : str or callable, default 'accuracy'
        Value forwarded as ``scoring`` to ``cross_val_score``.
    cv : int, default 5
        Number of stratified, shuffled folds.
    plot_results : bool, default False
        When True, draw a box plot of the per-fold scores of every model.

    Returns
    -------
    pandas.DataFrame
        Indexed by model name, with columns 'Mean' and 'Standard Deviation',
        sorted by 'Mean' in descending order.
    """
    # The same shuffled stratified folds are reused for every model
    kfold = StratifiedKFold(n_splits=cv, shuffle=True, random_state=42)

    # Label used for the score column and the plot title
    metric_label = metrics if isinstance(metrics, str) else getattr(metrics, '__name__', 'score')
    score_col = f'{metric_label}_score'

    # One (model name, fold index, score) record per model and fold
    entries = []
    for model in models:
        model_name = model.__class__.__name__
        print(f"Training: {model_name}")
        scores = cross_val_score(model, X, y, scoring=metrics, cv=kfold)
        entries.extend((model_name, fold_idx, score) for fold_idx, score in enumerate(scores))

    cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_id', score_col])

    # Optional box plot; the title reflects the metric and fold count in use
    if plot_results:
        sns.boxplot(x='model_name', y=score_col, data=cv_df, color='lightblue', showmeans=True)
        plt.title(f"Boxplot of baseline Model {metric_label.capitalize()} using {cv}-fold cross-validation")
        plt.xticks(rotation=45)
        plt.show()

    # Mean and standard deviation per model, best mean first
    baseline_results = (
        cv_df.groupby('model_name')[score_col]
        .agg(['mean', 'std'])
        .rename(columns={'mean': 'Mean', 'std': 'Standard Deviation'})
        .sort_values(by='Mean', ascending=False)
    )

    return baseline_results

# Cross-validate every baseline model (accuracy, 5 folds, no plot)
cv_results = generate_baseline_results(
    models,
    X,
    y,
    metrics='accuracy',
    cv=5,
    plot_results=False,
)

# Print the full summary table
print(cv_results)


Training: LinearSVC




Training: SVC
Training: KNeighborsClassifier
Training: LogisticRegression
Training: DecisionTreeClassifier
Training: RandomForestClassifier
Training: ExtraTreesClassifier
Training: AdaBoostClassifier
Training: XGBClassifier
                            Mean  Standard Deviation
model_name                                          
LogisticRegression      0.607943            0.006522
SVC                     0.607576            0.011691
LinearSVC               0.606469            0.016104
RandomForestClassifier  0.598016            0.005074
ExtraTreesClassifier    0.596174            0.007932
XGBClassifier           0.578524            0.016169
KNeighborsClassifier    0.567495            0.011687
AdaBoostClassifier      0.547584            0.068440
DecisionTreeClassifier  0.499455            0.019129


## 8. Evaluate the Model

In [26]:
# Apply the preprocessing fitted on the training data to the test features
X_test = pd.DataFrame(
    preprocessor.transform(test_df[feature_cols]),
    columns=feature_cols,
    index=test_df.index,
)

best_model = LogisticRegression(solver='liblinear', max_iter=1000)

# Score the chosen model on the hold-out split, which was otherwise unused
best_model.fit(X_train, y_train)
print(f"Validation accuracy: {best_model.score(X_val, y_val):.4f}")

# Refit on every labelled row so the final model is not limited to the
# training part of the split
best_model.fit(X, y)

# 'sii' is stored as float because the column once contained NaN; write
# integer class labels to the submission instead of 0.0 / 1.0 / ...
y_test_pred = best_model.predict(X_test).astype(int)

# Create a submission DataFrame
submission = pd.DataFrame({
    'id': test_df['id'],
    'sii': y_test_pred
})

# Save the submission DataFrame to a CSV file
submission.to_csv('submission.csv', index=False)

print("Submission file created successfully.")

Submission file created successfully.
