# In [1]:
# Data Loading and Preprocessing
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load dataset
df = pd.read_csv(r'G:\smoking_driking_dataset_Ver01.csv')

# Remove rows that contain missing values
df = df.dropna()

# Separate features and target variable
X = df.drop('DRK_YN', axis=1)
y = df['DRK_YN']

# Define categorical and numerical columns.
# Numeric and boolean columns are scaled; every other column is one-hot
# encoded. Selecting by dtype family (instead of comparing against 'object')
# keeps text columns categorical even when pandas stores them with a
# dedicated string dtype rather than 'object'.
numerical_cols = X.select_dtypes(include=['number', 'bool']).columns.tolist()
_numerical_set = set(numerical_cols)
categorical_cols = [col for col in X.columns if col not in _numerical_set]

# Create preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        # handle_unknown='ignore': a category that appears only in the
        # held-out split is encoded as all zeros instead of raising at
        # predict time.
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
    ])

# Append classifier to preprocessing pipeline
clf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    # n_jobs=-1 lets the forest fit and predict on all available CPU cores.
    ('classifier', RandomForestClassifier(random_state=42, n_jobs=-1)),
])

# Split data into training and testing sets (80% / 20%, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

# Fit the model
clf.fit(X_train, y_train)

# Evaluation Metrics
def evaluate(y_true, y_pred, pos_label=None):
    """Compute accuracy, precision, recall and F1 for binary predictions.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.
        pos_label: Label treated as the positive class by precision, recall
            and F1. When ``None`` (the default) it is inferred: ``1`` if that
            label occurs (scikit-learn's own default), otherwise the greater
            of the two labels present (e.g. ``'Y'`` for ``'N'``/``'Y'``).

    Returns:
        Tuple ``(accuracy, precision, recall, f1)``.
    """
    if pos_label is None:
        # The scikit-learn metrics default to pos_label=1 and raise
        # ValueError for binary targets encoded any other way, so pick a
        # valid positive label when exactly two non-1 labels are present.
        # In every other situation keep scikit-learn's default of 1.
        labels = set(pd.Series(y_true).unique()) | set(pd.Series(y_pred).unique())
        pos_label = max(labels) if len(labels) == 2 and 1 not in labels else 1

    acc = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, pos_label=pos_label)
    recall = recall_score(y_true, y_pred, pos_label=pos_label)
    f1 = f1_score(y_true, y_pred, pos_label=pos_label)
    return acc, precision, recall, f1

# Performance Evaluation: score the fitted pipeline on both splits so the
# gap between training and test metrics is visible.
y_pred_train = clf.predict(X_train)
y_pred_test = clf.predict(X_test)
acc_train, precision_train, recall_train, f1_train = evaluate(y_train, y_pred_train)
acc_test, precision_test, recall_test, f1_test = evaluate(y_test, y_pred_test)

print("Performance on training set:")
print(f"Accuracy: {acc_train:.2f}, Precision: {precision_train:.2f}, Recall: {recall_train:.2f}, F1-score: {f1_train:.2f}")
print("Performance on test set:")
print(f"Accuracy: {acc_test:.2f}, Precision: {precision_test:.2f}, Recall: {recall_test:.2f}, F1-score: {f1_test:.2f}")

# Feature Importance Analysis
importances = clf.named_steps['classifier'].feature_importances_

# The preprocessor emits the scaled numerical columns first, followed by the
# one-hot columns, so the names are assembled in that same order.
encoded_cols = []
if categorical_cols:
    # get_feature_names_out replaces get_feature_names, which was removed
    # from OneHotEncoder in scikit-learn 1.2. The encoder is only queried
    # when there are categorical columns to name.
    encoder = clf.named_steps['preprocessor'].named_transformers_['cat']
    encoded_cols = list(encoder.get_feature_names_out(categorical_cols))
feature_names = numerical_cols + encoded_cols

feature_importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
print("\nFeature Importance:")
print(feature_importance_df.sort_values(by='Importance', ascending=False))


# Output: KeyboardInterrupt (cell execution was interrupted)