# In [7]:
from ast import mod
import numpy as np
import pandas as pd
from time import time
import sklearn.preprocessing
import sklearn
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.decomposition import PCA
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import matplotlib.pyplot as plt
from tqdm import tqdm
from sklearn.model_selection import ParameterGrid


# Read the feature tables and the training labels from the working directory.
x_train = pd.read_csv("train.csv")
x_test = pd.read_csv("test.csv")
y_train = pd.read_csv("labels.csv")


# Disabled: train on a reproducible 50% subsample (labels realigned by index).
#x_train = x_train.sample(frac=0.5, random_state=42)
#y_train = y_train.loc[x_train.index]


# Remove the 'Age_Group' column from both feature tables.
x_train = x_train.drop('Age_Group', axis=1)
x_test = x_test.drop('Age_Group', axis=1)


def feature_encoding(X):
    """Return a copy of ``X`` with every non-numeric column label-encoded.

    Columns whose dtype is not selected by ``'number'`` are each replaced by
    the integer codes produced by a freshly fitted ``LabelEncoder``. Numeric
    columns are passed through unchanged.

    The input frame is left untouched: encoding is applied to a copy, so the
    caller's DataFrame keeps its original values.

    Note:
        A new encoder is fitted per column on every call, so the codes
        produced by two separate calls (e.g. one for train, one for test) are
        not guaranteed to agree for the same category.

    Args:
        X: pandas DataFrame to encode.

    Returns:
        A new DataFrame with the same columns and index as ``X``.
    """
    # Work on a copy so the caller's frame is not modified as a side effect.
    encoded = X.copy()

    non_numerical_columns_names = encoded.select_dtypes(exclude=['number']).columns

    for column in non_numerical_columns_names:
        encoded[column] = LabelEncoder().fit_transform(encoded[column])

    return encoded

def normalize_features(X_train, X_test):
    """Min-max scale two frames using statistics fitted on ``X_train`` only.

    Args:
        X_train: DataFrame the scaler is fitted on (and then transformed).
        X_test: DataFrame transformed with the scaler fitted on ``X_train``.

    Returns:
        ``(X_train_scaled, X_test_scaled)`` as DataFrames that keep the
        column labels and index of their respective inputs.
    """
    scaler = MinMaxScaler().fit(X_train)

    def _as_frame(values, like):
        # Re-attach the labels that the scaler's ndarray output drops.
        return pd.DataFrame(values, columns=like.columns, index=like.index)

    return (
        _as_frame(scaler.transform(X_train), X_train),
        _as_frame(scaler.transform(X_test), X_test),
    )


# Encode train and test in a single pass. feature_encoding fits a new encoder
# per column on every call, so encoding the two frames separately can assign
# different integer codes to the same category. Stacking them first guarantees
# one shared mapping; the outer index level lets us split them back apart with
# their original row index intact.
combined = pd.concat([x_train, x_test], keys=["train", "test"])
combined = feature_encoding(combined)
x_train, x_test = combined.loc["train"], combined.loc["test"]

# Scaling statistics are fitted on the training frame only.
x_train_scaled, x_test_scaled = normalize_features(x_train, x_test)


# Reduce the scaled features to 15 principal components. The projection is
# fitted on the training frame and then applied unchanged to the test frame.
pca = PCA(n_components=15)

train_components = pca.fit_transform(x_train_scaled)
test_components = pca.transform(x_test_scaled)

# Column labels PC1..PCk, shared by both output frames.
pc_columns = [f'PC{k}' for k in range(1, pca.n_components_ + 1)]

x_train_scaled = pd.DataFrame(train_components, columns=pc_columns, index=x_train.index)
x_test_scaled = pd.DataFrame(test_components, columns=pc_columns, index=x_test.index)



# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Features as float32, labels (the 'Diabetes_binary' column) as int64, all
# placed on the selected device.
X_train = torch.tensor(x_train_scaled.to_numpy(), dtype=torch.float32, device=device)
Y_train = torch.tensor(y_train["Diabetes_binary"].to_numpy(), dtype=torch.long, device=device)
X_test = torch.tensor(x_test_scaled.to_numpy(), dtype=torch.float32, device=device)


# Shuffled mini-batches of 32 (feature, label) pairs for training.
train_dataset = TensorDataset(X_train, Y_train)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

class MLP(nn.Module):
    """Fully connected classifier: (Linear -> ReLU) per hidden width, then a linear head.

    Args:
        input_size: Number of input features.
        hidden_size: Sequence of hidden-layer widths, in order. May be empty,
            in which case the network is a single linear layer.
        output_size: Number of output logits. When omitted, it falls back to
            the number of distinct values in the module-level
            ``y_train["Diabetes_binary"]`` column.
    """

    def __init__(self, input_size, hidden_size, output_size=None):
        super().__init__()

        if output_size is None:
            # Count classes on the label column only. Taking np.unique over the
            # whole labels frame would also count the values of any other
            # column it contains (NOTE(review): e.g. a row-id column, if
            # labels.csv has one) and oversize the output layer.
            output_size = len(np.unique(y_train["Diabetes_binary"]))

        layers = []
        in_features = input_size
        for width in hidden_size:
            layers.append(nn.Linear(in_features, width))
            layers.append(nn.ReLU())
            in_features = width

        # Use the running width rather than hidden_size[-1] so that an empty
        # hidden_size yields a plain linear model instead of an IndexError.
        layers.append(nn.Linear(in_features, output_size))
        self.layers = nn.Sequential(*layers)

    def forward(self, x):
        """Return the raw (unnormalized) class logits for the batch ``x``."""
        return self.layers(x)



# Hyperparameters for this run. Only 'hidden_size', 'epochs' and
# 'learning_rate' are read by the visible code; 'batch_size' and 'optimizer'
# are recorded here but the loader and optimizer are configured directly.
param = dict(
    hidden_size=[128, 64, 32],
    epochs=25,
    learning_rate=0.001,
    batch_size=32,
    optimizer='adam',
)


# Build the network on the selected device; input width = number of feature columns.
model = MLP(X_train.shape[1], param['hidden_size'])
model = model.to(device)

# Cross-entropy loss over the class logits, optimized with Adam.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=param['learning_rate'])


# --- Training -------------------------------------------------------------
for epoch in range(param['epochs']):

    model.train()
    total_loss = 0.0
    with tqdm(train_loader, unit="batch") as tepoch:
        tepoch.set_description(f"Epoch {epoch+1}/{param['epochs']}")
        # Count batches ourselves: tqdm refreshes its `.n` attribute lazily
        # while iterating, so dividing by it gives a skewed running mean.
        for batch_idx, (x, y) in enumerate(tepoch, start=1):
            optimizer.zero_grad()
            output = model(x)
            loss = criterion(output, y)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            tepoch.set_postfix(loss=total_loss / batch_idx)


# --- Inference ------------------------------------------------------------
# Predict once, after training has finished. Only the final model's
# predictions are exported, so scoring the test set every epoch was wasted
# work (and left `pred` undefined when epochs == 0). Rows are scored in
# chunks instead of one at a time; the chunk size only bounds memory use.
model.eval()
pred = []
with torch.no_grad():
    for chunk in X_test.split(4096):
        predicted = model(chunk).argmax(dim=1)
        pred.extend(predicted.cpu().numpy())

# Index the predictions by the test file's 'Unnamed: 0' column.
# NOTE(review): x_test still carries 'Unnamed: 0' here, so that column also
# went through scaling/PCA as a feature. Presumably it is a row id; confirm
# whether it should be excluded from the model inputs.
y_test_pred = pd.DataFrame(pred, columns=['Diabetes_binary'], index=x_test['Unnamed: 0'])

y_test_pred.index.name = 'index'

y_test_pred.to_csv("test_predictions.csv", index=True)









# Captured notebook output:
# Epoch 1/2: 100%|██████████| 6342/6342 [01:47<00:00, 58.82batch/s, loss=0.421]
# Epoch 2/2: 100%|██████████| 6342/6342 [01:49<00:00, 57.89batch/s, loss=0.326]
