# In [None]:
import numpy as np
import pandas as pd
from time import time
import sklearn.preprocessing
import sklearn
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.decomposition import PCA
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV



# Load the training features, the test features and the training labels
# (in that order) from CSV files in the current working directory.
x_train, x_test, y_train = (
    pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv", "labels.csv")
)

# Optional subsampling of the training data, currently disabled:
#x_train = x_train.sample(frac=0.5, random_state=42)
#y_train = y_train.loc[x_train.index]

# 'Age_Group' is removed from both feature tables before any encoding.
x_train, x_test = (
    frame.drop(columns=['Age_Group']) for frame in (x_train, x_test)
)


def feature_encoding(X):
    """Return a copy of ``X`` with every non-numeric column label-encoded.

    Columns whose dtype is not numeric (as selected by
    ``select_dtypes(exclude=['number'])``) are replaced by the integer codes
    produced by a freshly fitted ``LabelEncoder``. Numeric columns are left
    untouched.

    The input frame is not modified; the encoded data is written to a copy.

    Note: a new encoder is fitted per column on every call, so the codes
    produced by two separate calls (for example one on training data and one
    on test data) are not guaranteed to agree for the same category.

    Args:
        X: pandas DataFrame to encode.

    Returns:
        A new DataFrame with the same columns and index as ``X``.
    """
    # Work on a copy so the caller's DataFrame is never mutated as a side
    # effect (and so slices of another frame do not trigger chained-assignment
    # warnings).
    encoded = X.copy()

    for column in encoded.select_dtypes(exclude=['number']).columns:
        encoded[column] = LabelEncoder().fit_transform(encoded[column])

    return encoded

def normalize_features(X_train, X_test):
    """Min-max scale both frames using statistics learned from ``X_train``.

    The ``MinMaxScaler`` is fitted on the training frame only and then applied
    to both frames, so the test frame is transformed with the training
    minima/maxima.

    Args:
        X_train: DataFrame used to fit the scaler.
        X_test: DataFrame transformed with the fitted scaler.

    Returns:
        Tuple ``(X_train_scaled, X_test_scaled)`` of DataFrames that keep the
        column labels and index of their respective inputs.
    """
    min_max = MinMaxScaler().fit(X_train)

    def _scaled_frame(source):
        # Re-wrap the ndarray returned by the scaler so labels are preserved.
        return pd.DataFrame(
            min_max.transform(source),
            columns=source.columns,
            index=source.index,
        )

    return _scaled_frame(X_train), _scaled_frame(X_test)

# Encode train and test in a single pass. feature_encoding fits a fresh
# LabelEncoder per column on whatever frame it receives, so encoding the two
# frames separately could map the same category to different integers in
# train and test. Stacking them first guarantees one shared mapping; the
# frames are then split back by row position.
n_train_rows = len(x_train)
combined_encoded = feature_encoding(pd.concat([x_train, x_test], axis=0))
x_train = combined_encoded.iloc[:n_train_rows].copy()
x_test = combined_encoded.iloc[n_train_rows:].copy()

print(x_test.head())

# Scale with statistics learned from the training frame only.
x_train_scaled, x_test_scaled = normalize_features(x_train, x_test)

# Reduce the scaled features to 15 principal components. The PCA is fitted on
# the training data here; the same fitted object is reused later for the test
# data.
pca = PCA(n_components=15)

x_train_scaled = pca.fit_transform(x_train_scaled, y=None)
x_train_scaled = pd.DataFrame(
    x_train_scaled,
    columns=[f'PC{i+1}' for i in range(pca.n_components_)],
    index=x_train.index,
)

print(y_train.shape)
print(y_train.head())

# Hyper-parameter grid for the random forest search.
param_grid_random_forest = {
    'n_estimators': [20, 30],
    'max_depth': [None, 20, 10, 5],
    'bootstrap': [True],
}

def perform_grid_search(model, X_train, Y_train, params):
    """Run an accuracy-scored grid search with 5-fold stratified CV.

    The folds are shuffled with a fixed seed (``random_state=42``). The best
    parameter set and its mean cross-validated score are printed.

    Args:
        model: estimator handed to ``GridSearchCV``.
        X_train: training features.
        Y_train: training labels (also used to stratify the folds).
        params: parameter grid passed as ``param_grid``.

    Returns:
        Tuple ``(grid_search, best_param, best_score)``: the fitted
        ``GridSearchCV`` object, its ``best_params_`` and its ``best_score_``.
    """
    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    searcher = GridSearchCV(model, param_grid=params, scoring='accuracy', cv=folds)
    searcher.fit(X_train, Y_train)

    winning_params, winning_score = searcher.best_params_, searcher.best_score_
    print("Best parameters are:", winning_params)
    print("Best score is:", winning_score)

    return searcher, winning_params, winning_score
# The string literal below is a disabled invocation of the grid search; as a
# bare string expression it has no effect when the script runs.
"""
cls = RandomForestClassifier(random_state=42)

grid_search_rf, best_param_rf, best_score_rf = perform_grid_search(cls, x_train_scaled, y_train['Diabetes_binary'], params=param_grid_random_forest)
"""
# Disabled code, kept as a bare string literal (a no-op expression statement).
# It is a manual 5-fold stratified cross-validation loop that fits `cls` on
# each training fold and records the validation accuracy per fold.
# NOTE(review): `cls` is only assigned inside the disabled string block that
# precedes this one, so this loop cannot be re-enabled on its own.
'''strat_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

fold_accuracies = []

for train_index, test_index in strat_kfold.split(x_train_scaled, y_train['Diabetes_binary']):
    
    X_train_fold, X_val_fold = x_train_scaled.iloc[train_index], x_train_scaled.iloc[test_index]
    y_train_fold, y_val_fold = y_train['Diabetes_binary'].iloc[train_index], y_train['Diabetes_binary'].iloc[test_index]
    
  
    cls.fit(X_train_fold, y_train_fold)
    
    
    y_pred = cls.predict(X_val_fold)
    
    fold_accuracy = accuracy_score(y_val_fold, y_pred)
    fold_accuracies.append(fold_accuracy)

print(f"Cross-Validation Accuracies: {fold_accuracies}")'''
# Final model with hand-picked hyper-parameters (the grid search is not run),
# trained on the PCA-projected training features.
cls_final = RandomForestClassifier(
    n_estimators=30,
    max_depth=10,
    bootstrap=True,
    random_state=42,
)
cls_final.fit(x_train_scaled, y_train['Diabetes_binary'])

# Project the scaled test features with the PCA fitted on the training data
# and restore the test frame's row index.
test_components = pca.transform(x_test_scaled)
x_test_scaled = pd.DataFrame(
    test_components,
    columns=[f'PC{i+1}' for i in range(pca.n_components_)],
    index=x_test.index,
)

y_test_pred = cls_final.predict(x_test_scaled)
print(y_test_pred.shape)

# Key the predictions by the test file's 'Unnamed: 0' column and expose it
# under the index name 'index' in the exported CSV.
# NOTE(review): 'Unnamed: 0' is still present in x_test at this point, so it
# was also scaled and fed into the PCA as a feature; presumably it is a row
# id. Confirm that this is intended.
y_test_pred = pd.DataFrame(
    y_test_pred,
    columns=['Diabetes_binary'],
    index=x_test['Unnamed: 0'],
).rename_axis('index')

y_test_pred.to_csv("test_predictions.csv", index=True)



# Disabled code, kept as a bare string literal (a no-op expression statement).
# It computes a training F1 score and confusion matrix and writes an
# alternative submission file.
# NOTE(review): it reads `y_train_pred`, which is not assigned anywhere in the
# visible part of this file, so it cannot be re-enabled as-is.
'''
train_f1 = f1_score(y_train["Diabetes_binary"], y_train_pred[:,1])

confusion_matrix_train = confusion_matrix(y_train["Diabetes_binary"], y_train_pred[:,1])

#print(f"Training score for the model is {train_f1} and the confusion matrix is {confusion_matrix_train}")

df_output=pd.DataFrame(y_test_pred, columns=['index',"Diabetes_binary"])

df_output.to_csv('./submission.csv', index=False)'''

