In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler


In [2]:
# Read the Framingham CSV file into a DataFrame
df = pd.read_csv(filepath_or_buffer='/content/framingham.csv')

In [3]:
# Keep only fully populated rows: a row with any missing value is discarded
df = df.dropna(axis=0, how='any')

# Separate the label column (y) from the remaining predictor columns (X)
y = df['TenYearCHD']
X = df.drop(columns=['TenYearCHD'])

In [4]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Learn the scaling statistics from the training rows only,
# then apply that same transformation to both the training and the test rows
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [5]:
# Candidate values for each Random Forest hyperparameter in the search grid
param_grid = dict(
    n_estimators=[50, 100, 200],      # number of trees in the forest
    max_depth=[5, 10, 15],            # upper bound on the depth of each tree
    min_samples_split=[5, 10, 15],    # minimum samples a node needs before it may split
    min_samples_leaf=[1, 2, 4],       # minimum samples required at a leaf node
)

# Base Random Forest classifier with a fixed seed
random_forest = RandomForestClassifier(random_state=42)


In [6]:
# 5-fold cross-validated grid search over param_grid, scored by accuracy;
# n_jobs=-1 asks scikit-learn to use all available processors.
# fit() returns the search object itself, so construction and fitting are chained.
grid_search = GridSearchCV(
    random_forest,
    param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
).fit(X_train_scaled, y_train)

# Winning hyperparameter combination and its mean cross-validated score
best_params, best_score = grid_search.best_params_, grid_search.best_score_

print("Best Parameters:", best_params)
print("Best Accuracy Score:", best_score)

Best Parameters: {'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 5, 'n_estimators': 100}
Best Accuracy Score: 0.8526997462151046


In [8]:
# Take the best estimator found by the grid search and label the held-out test rows
best_random_forest = grid_search.best_estimator_
y_pred = best_random_forest.predict(X_test_scaled)

# Share of test rows whose predicted label equals the true label
test_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Testing Accuracy:", test_accuracy)

Testing Accuracy: 0.8346994535519126


In [9]:
 # Training-set accuracy of the tuned Random Forest.
# No second fit is needed: GridSearchCV (refit=True is its default) has already
# trained best_estimator_ on the complete training set, so calling fit() again
# with the same data and the same seed would only repeat that work.
y_train_pred = best_random_forest.predict(X_train_scaled)

# Share of training rows predicted correctly, for comparison with the test accuracy
train_accuracy = accuracy_score(y_train, y_train_pred)

print("Training Accuracy:", train_accuracy)

Training Accuracy: 0.8735475051264525


In [10]:
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

# Candidate values for each XGBoost hyperparameter in the search grid
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    learning_rate=[0.01, 0.1, 0.2],
    min_child_weight=[1, 2, 4],
    subsample=[0.5, 0.7, 1.0],
    colsample_bytree=[0.5, 0.7, 1.0],
)

# Base XGBoost classifier with a fixed seed
xgb_classifier = XGBClassifier(random_state=42)

# 5-fold cross-validated grid search over param_grid, scored by accuracy;
# n_jobs=-1 asks scikit-learn to use all available processors.
# fit() returns the search object itself, so construction and fitting are chained.
grid_search = GridSearchCV(
    xgb_classifier,
    param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
).fit(X_train_scaled, y_train)

# Winning hyperparameter combination and its mean cross-validated score
best_params, best_score = grid_search.best_params_, grid_search.best_score_

print("Best Parameters:", best_params)
print("Best Accuracy Score:", best_score)

# Take the best estimator found by the grid search and label the held-out test rows
best_xgb_classifier = grid_search.best_estimator_
y_pred = best_xgb_classifier.predict(X_test_scaled)

# Share of test rows whose predicted label equals the true label
test_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Testing Accuracy:", test_accuracy)

# Training-set accuracy of the tuned XGBoost classifier.
# No second fit is needed: GridSearchCV (refit=True is its default) has already
# trained best_estimator_ on the complete training set, so calling fit() again
# on the same data would only repeat that training run.
y_train_pred = best_xgb_classifier.predict(X_train_scaled)

# Share of training rows predicted correctly, for comparison with the test accuracy
train_accuracy = accuracy_score(y_train, y_train_pred)

print("Training Accuracy:", train_accuracy)


Best Parameters: {'colsample_bytree': 1.0, 'learning_rate': 0.01, 'max_depth': 5, 'min_child_weight': 4, 'n_estimators': 200, 'subsample': 0.7}
Best Accuracy Score: 0.8544073976838481
Testing Accuracy: 0.8360655737704918
Training Accuracy: 0.8615857826384142
