In [None]:
import pickle
from functools import partial

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn import svm
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.feature_selection import SelectKBest, chi2, f_classif, mutual_info_classif
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
%matplotlib inline

In [None]:
# Feed-forward network used as the binary classifier
class Model(nn.Module):
    """Fully connected network with two hidden layers and one sigmoid output.

    Args:
        in_features: Number of input features per sample.
        h1: Width of the first hidden layer.
        h2: Width of the second hidden layer.
    """

    def __init__(self, in_features=5, h1=30, h2=25):
        super().__init__()
        # The attribute names double as state_dict keys, so they are part of the
        # saved-weights format and must not be renamed.
        self.fc1 = nn.Linear(in_features=in_features, out_features=h1)
        self.fc2 = nn.Linear(in_features=h1, out_features=h2)
        self.out = nn.Linear(in_features=h2, out_features=1)  # single output unit

    def forward(self, x):
        """Return one value in (0, 1) per sample, shaped ``(..., 1)``."""
        hidden = torch.relu(self.fc1(x))
        hidden = torch.relu(self.fc2(hidden))
        # Sigmoid squashes the single logit into a probability-like score.
        return self.out(hidden).sigmoid()

In [None]:
# Load the diabetes dataset into a DataFrame.
# The path is relative, so "diabetes.csv" must sit in the kernel's working directory.
data = pd.read_csv("diabetes.csv")

In [None]:
# Preview the first rows to check the column names and value formats.
data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [None]:
# Dimensions of the dataset as a (rows, columns) tuple
data.shape

(768, 9)

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for every column.
# NOTE(review): several columns report a minimum of 0 (e.g. Glucose, BloodPressure,
# BMI) — presumably placeholders for missing measurements; confirm before modelling.
data.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [None]:
# Mean of every feature within each Outcome class, for a quick per-class comparison.
data.groupby('Outcome').mean()

Unnamed: 0_level_0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
Outcome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,3.298,109.98,68.184,19.664,68.792,30.3042,0.429734,31.19
1,4.865672,141.257463,70.824627,22.164179,100.335821,35.142537,0.5505,37.067164


**Feature Selection**

In [None]:
# Linear relationship between each column and the target.
# Pairwise correlations between all columns of `data` (pandas' default method).
correlation_matrix = data.corr()

# Absolute correlation with `Outcome`, strongest first. The target itself heads
# the listing because it is one of the columns of `data`.
correlation_with_target = (
    correlation_matrix['Outcome']
    .abs()
    .sort_values(ascending=False)
)

print("Correlation of features with the target variable (absolute values):")
print(correlation_with_target)


Correlation of features with the target variable (absolute values):
Outcome                     1.000000
Glucose                     0.466581
BMI                         0.292695
Age                         0.238356
Pregnancies                 0.221898
DiabetesPedigreeFunction    0.173844
Insulin                     0.130548
SkinThickness               0.074752
BloodPressure               0.065068
Name: Outcome, dtype: float64


In [None]:
# Tree-based feature importance: fit a random forest and a gradient boosting
# model on the training split and compare the importance each assigns to the
# features.
X = data.drop('Outcome', axis=1)
y = data['Outcome']

# Hold out 20% of the rows; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

gb_model = GradientBoostingClassifier(n_estimators=100, random_state=42)
gb_model.fit(X_train, y_train)

rf_feature_importances = rf_model.feature_importances_
gb_feature_importances = gb_model.feature_importances_

# One row per feature, ordered by the random forest score first and the
# gradient boosting score as tie-breaker.
feature_importances = pd.DataFrame({
    'Feature': X_train.columns,
    'RandomForest Importance': rf_feature_importances,
    'GradientBoosting Importance': gb_feature_importances,
}).sort_values(by=['RandomForest Importance', 'GradientBoosting Importance'], ascending=False)

print("Feature Importance Scores:")
print(feature_importances)

# Keep the five highest-ranked features (adjust the slice to keep more or fewer).
top_features = feature_importances['Feature'][:5]

# Training and test matrices restricted to the selected features. No model is
# re-fitted on them in this cell.
X_train_selected = X_train[top_features]
X_test_selected = X_test[top_features]


Feature Importance Scores:
                    Feature  RandomForest Importance  \
1                   Glucose                 0.258864   
5                       BMI                 0.169984   
7                       Age                 0.140931   
6  DiabetesPedigreeFunction                 0.123768   
2             BloodPressure                 0.088134   
0               Pregnancies                 0.076551   
4                   Insulin                 0.076122   
3             SkinThickness                 0.065646   

   GradientBoosting Importance  
1                     0.389621  
5                     0.185406  
7                     0.156107  
6                     0.112469  
2                     0.049059  
0                     0.029574  
4                     0.053652  
3                     0.024113  


In [None]:
# Univariate feature selection: three scoring functions each keep their top-k
# features, and a random forest trained on the selected columns is scored on the
# held-out split to compare them.
X = data.drop('Outcome', axis=1)
y = data['Outcome']

# Hold out 20% of the rows; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Chi-square scores. scikit-learn's chi2 requires non-negative feature values.
k_chi2 = 5  # number of features to keep
selector_chi2 = SelectKBest(score_func=chi2, k=k_chi2)
X_train_chi2 = selector_chi2.fit_transform(X_train, y_train)
X_test_chi2 = selector_chi2.transform(X_test)

# ANOVA F-test scores.
k_f_test = 5  # number of features to keep
selector_f_test = SelectKBest(score_func=f_classif, k=k_f_test)
X_train_f_test = selector_f_test.fit_transform(X_train, y_train)
X_test_f_test = selector_f_test.transform(X_test)

# Mutual information scores. The estimator is randomised, so its seed is pinned;
# without it the scores (and possibly the selected columns) change between runs.
k_mutual_info = 5  # number of features to keep
selector_mutual_info = SelectKBest(
    score_func=partial(mutual_info_classif, random_state=42),
    k=k_mutual_info,
)
X_train_mutual_info = selector_mutual_info.fit_transform(X_train, y_train)
X_test_mutual_info = selector_mutual_info.transform(X_test)

# A single classifier object is re-fitted for each feature subset, so after this
# cell it holds the fit for the mutual-information features.
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)


def _fit_and_score(features_train, features_test):
    """Refit `rf_classifier` on one feature subset; return (test predictions, test accuracy)."""
    rf_classifier.fit(features_train, y_train)
    predictions = rf_classifier.predict(features_test)
    return predictions, accuracy_score(y_test, predictions)


y_pred_chi2, accuracy_chi2 = _fit_and_score(X_train_chi2, X_test_chi2)
print("Accuracy using chi-square test:", accuracy_chi2)

y_pred_f_test, accuracy_f_test = _fit_and_score(X_train_f_test, X_test_f_test)
print("Accuracy using ANOVA F-test:", accuracy_f_test)

y_pred_mutual_info, accuracy_mutual_info = _fit_and_score(X_train_mutual_info, X_test_mutual_info)
print("Accuracy using mutual information:", accuracy_mutual_info)


Accuracy using chi-square test: 0.7402597402597403
Accuracy using ANOVA F-test: 0.7857142857142857
Accuracy using mutual information: 0.7402597402597403


In [None]:
# Pull the per-feature scores (and p-values, where the test provides them) out
# of the three fitted selectors.
scores_chi2, p_values_chi2 = selector_chi2.scores_, selector_chi2.pvalues_
scores_f_test, p_values_f_test = selector_f_test.scores_, selector_f_test.pvalues_
scores_mutual_info = selector_mutual_info.scores_

# One table per scoring function, labelled with the feature names and ordered
# from the highest score down.
feature_scores_chi2 = pd.DataFrame({
    'Feature': X.columns,
    'Chi-square Score': scores_chi2,
    'Chi-square p-value': p_values_chi2,
}).sort_values(by='Chi-square Score', ascending=False)

feature_scores_f_test = pd.DataFrame({
    'Feature': X.columns,
    'F-test Score': scores_f_test,
    'F-test p-value': p_values_f_test,
}).sort_values(by='F-test Score', ascending=False)

feature_scores_mutual_info = pd.DataFrame({
    'Feature': X.columns,
    'Mutual Information Score': scores_mutual_info,
}).sort_values(by='Mutual Information Score', ascending=False)

# Show the leading rows of each ranking.
for heading, table in (
    ("Top features selected by chi-square test:", feature_scores_chi2),
    ("\nTop features selected by ANOVA F-test:", feature_scores_f_test),
    ("\nTop features selected by mutual information:", feature_scores_mutual_info),
):
    print(heading)
    print(table.head())


Top features selected by chi-square test:
       Feature  Chi-square Score  Chi-square p-value
4      Insulin       1197.140821       2.550816e-262
1      Glucose       1082.927430       1.696433e-237
7          Age        194.164018        3.921626e-44
5          BMI        107.766125        3.024417e-25
0  Pregnancies         77.452968        1.359208e-18

Top features selected by ANOVA F-test:
                    Feature  F-test Score  F-test p-value
1                   Glucose    160.769030    7.069402e-33
5                       BMI     63.378414    8.319708e-15
7                       Age     52.326951    1.410539e-12
0               Pregnancies     27.550001    2.113983e-07
6  DiabetesPedigreeFunction     14.977704    1.204708e-04

Top features selected by mutual information:
       Feature  Mutual Information Score
1      Glucose                  0.126491
4      Insulin                  0.061123
0  Pregnancies                  0.060843
7          Age                  0.054579
5

In [None]:
# Min-max scale every column of `data` into the [0, 1] range.
# NOTE(review): the scaler is fitted on the whole frame — the `Outcome` label
# included — and before any train/test split; confirm this is intended before
# the scaled values are used for modelling.
sc = MinMaxScaler(feature_range=(0, 1))
dataset_scaled = sc.fit_transform(data)

In [None]:
# The scaler output carries no labels; restore the original column names and row
# index so the scaled frame can be addressed by column name.
dataset_scaled = pd.DataFrame(dataset_scaled, columns=data.columns, index=data.index)

In [None]:
# `data_scaled` was read here without ever being assigned (the scaling cells
# write to `dataset_scaled`), so this cell raised NameError on a fresh kernel.
# The recorded outputs further down show raw, unscaled values, so the name is
# bound to a copy of the raw frame to reproduce them.
# NOTE(review): if min-max scaled features were intended instead, build this
# frame from the scaler output and scale every prediction input the same way.
data_scaled = data.copy()

# Feature matrix: every column except the three dropped features and the label.
X = data_scaled.drop(columns=['Pregnancies', 'DiabetesPedigreeFunction', 'SkinThickness', 'Outcome'])

# Target vector: the label column.
Y = data_scaled['Outcome']

In [None]:
# Preview the frame that X and Y were derived from.
data_scaled.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [None]:
# Class balance of the target: labels equal to 1 count as diabetic, every other
# label as non-diabetic.
a = int((Y == 1).sum())
b = len(Y) - a

print('diabetic:', a, 'non-diabetic:', b)



diabetic: 268 non-diabetic: 500


In [None]:

# Replacing zero values with NaN in the listed measurement columns.
# `dataset_new` was not assigned anywhere before this cell, so it starts from a
# copy of the raw frame; `data` itself stays untouched.
# NOTE(review): confirm that a copy of `data` is the intended starting point.
zero_as_missing_columns = ["Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI"]
dataset_new = data.copy()
# np.nan is the supported spelling; the np.NaN alias was removed in NumPy 2.0.
dataset_new[zero_as_missing_columns] = dataset_new[zero_as_missing_columns].replace(0, np.nan)

In [None]:
# Inspect the selected feature matrix and the target vector.
# pandas abbreviates the printout, so only the leading and trailing rows appear.
print(X)
print(Y)

     Glucose  BloodPressure  Insulin   BMI  Age
0        148             72        0  33.6   50
1         85             66        0  26.6   31
2        183             64        0  23.3   32
3         89             66       94  28.1   21
4        137             40      168  43.1   33
..       ...            ...      ...   ...  ...
763      101             76      180  32.9   63
764      122             70        0  36.8   27
765      121             72      112  26.2   30
766      126             60        0  30.1   47
767       93             70        0  30.4   23

[768 rows x 5 columns]
0      1
1      0
2      1
3      0
4      1
      ..
763    0
764    0
765    0
766    1
767    0
Name: Outcome, Length: 768, dtype: int64


In [None]:
# Train/test split: hold out 20% of the rows for testing. The fixed seed keeps
# the split reproducible between runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=41,
)

In [None]:
# Shape of the training feature matrix as (rows, features)
X_train.shape

(614, 5)

In [None]:
# Compare the shapes of the full, training and test feature matrices to confirm the split.
print(X.shape, X_train.shape, X_test.shape)

(768, 5) (614, 5) (154, 5)


In [None]:
# Support vector classifier with a linear kernel; no other hyperparameter is set here.
classifier = svm.SVC(kernel='linear')

In [None]:
# Training the support vector machine classifier
classifier.fit(X_train, Y_train)

# Accuracy on the training data. accuracy_score takes (y_true, y_pred) in that
# order; accuracy itself is symmetric, but keeping the documented order means
# the call stays correct if it is ever swapped for an asymmetric metric.
X_train_prediction = classifier.predict(X_train)
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)

In [None]:
# Report the accuracy measured on the training split.
print('Accuracy score of the training data : ', training_data_accuracy)

Accuracy score of the training data :  0.7801302931596091


In [None]:
# Accuracy on the held-out test data. accuracy_score takes (y_true, y_pred) in
# that order.
X_test_prediction = classifier.predict(X_test)
test_data_accuracy = accuracy_score(Y_test, X_test_prediction)
print('Accuracy score of the test data : ', test_data_accuracy)

Accuracy score of the test data :  0.7727272727272727


In [None]:
 #making predictive system
#input_data = (5,166,72,19,175,25.8,0.587,51)
input_data =  (1, 85, 66, 29, 0)

# changing the input_data to numpy array
input_data_as_numpy_array = np.asarray(input_data)

# reshape the array as we are predicting for one instance
input_data_reshaped = input_data_as_numpy_array.reshape(1,-1)

prediction = classifier.predict(input_data_reshaped)
print(prediction)

if (prediction[0] == 0):
  print('The person is not diabetic')
else:
  print('The person is diabetic')

[0]
The person is not diabetic




In [None]:
# Convert the train/test splits into float32 tensors for PyTorch.
# Underlying NumPy arrays of the pandas objects:
X_train_values = X_train.values
X_test_values = X_test.values
Y_train_values = Y_train.values
Y_test_values = Y_test.values

# torch.tensor with an explicit dtype replaces the legacy torch.FloatTensor
# constructor; the resulting values and dtype are the same.
X_train_tensor = torch.tensor(X_train_values, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test_values, dtype=torch.float32)
# Targets become a single column, shape (N, 1), to line up with the model's
# one-unit output.
Y_train_tensor = torch.tensor(Y_train_values, dtype=torch.float32).view(-1, 1)
Y_test_tensor = torch.tensor(Y_test_values, dtype=torch.float32).view(-1, 1)


In [None]:
# Initialize model
torch.manual_seed(42)  # fix the RNG so the initial weights are reproducible
# Size the input layer from the data rather than relying on Model's default
# width, so the network stays in sync if the selected feature set changes.
model = Model(in_features=X_train_tensor.shape[1])

In [None]:
# Loss function
# The model's forward pass ends with a sigmoid, so its output is already a value
# in (0, 1) as this loss expects.
criterion = nn.BCELoss()  # Binary Cross Entropy Loss for binary classification

In [None]:
# Optimizer
# Adam over all of the model's parameters with a learning rate of 0.01.
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

In [None]:
# Train
# Full-batch training: each epoch is one forward/backward pass over the whole
# training set.
epochs = 100
losses = []  # loss value recorded at every epoch
for i in range(epochs):
    # Call the module itself rather than .forward() so registered hooks also run.
    Y_pred = model(X_train_tensor)
    # BCELoss requires prediction and target to have the same shape. The model
    # emits (N, 1); squeezing that to (N,) against an (N, 1) target is rejected,
    # so the target is arranged to match the prediction instead.
    loss = criterion(Y_pred, Y_train_tensor.reshape(Y_pred.shape))

    losses.append(loss.item())
    if i % 10 == 0:
        print(f'Epoch : {i} and loss : {loss.item()}')

    # Clear old gradients, backpropagate, then update the weights.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

Epoch : 0 and loss : 3.620408296585083
Epoch : 10 and loss : 0.9600316286087036
Epoch : 20 and loss : 0.6468938589096069
Epoch : 30 and loss : 0.606372058391571
Epoch : 40 and loss : 0.6000492572784424
Epoch : 50 and loss : 0.5932106971740723
Epoch : 60 and loss : 0.5867440700531006
Epoch : 70 and loss : 0.5822041034698486
Epoch : 80 and loss : 0.5786566734313965
Epoch : 90 and loss : 0.5751731395721436


In [None]:
# Test
# Evaluate on the held-out split without recording gradients.
with torch.no_grad():
    Y_eval = model(X_test_tensor)
    # Target arranged as a single column, the same layout as Y_eval
    Y_test_tensor_reshaped = Y_test_tensor.reshape(-1, 1)
    # Binary cross-entropy on the test split
    loss = criterion(Y_eval, Y_test_tensor_reshaped)

In [None]:
# Count correct predictions
# Both sides are flattened to 1-D before comparing. If one tensor is (N, 1) and
# the other (N,), `==` broadcasts to an (N, N) matrix and the sum vastly
# overcounts — the symptom is a "correct" count larger than the test set.
predicted_labels = (Y_eval > 0.5).reshape(-1)
true_labels = Y_test_tensor.reshape(-1).bool()
correct_predictions = (predicted_labels == true_labels).sum().item()

In [None]:
# Show the test-set size next to the number of correct predictions.
# NOTE(review): a correct count above the test-set length is impossible for an
# element-wise comparison; if it appears, the comparison that produced
# `correct_predictions` broadcast tensors of mismatched shapes.
print("Length of Y_test_tensor:", len(Y_test_tensor))
print("Number of correct predictions:", correct_predictions)


Length of Y_test_tensor: 154
Number of correct predictions: 13046


In [None]:
# Summarise test-set performance as "<correct> out of <total test instances>"
print(f'Number of correct predictions: {correct_predictions} out of {len(Y_test_tensor)}')

Number of correct predictions: 13046 out of 154


In [None]:
# Placeholder example object; it is not what gets written to disk below.
new = {'example_key': 'example_value'}

# Serialise the whole trained model object. The context manager closes the file
# handle even if pickling fails.
# NOTE: only unpickle this file from a trusted source — loading a pickle can
# execute arbitrary code.
with open('prediction_model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

In [None]:
# One sample for the network. It must contain exactly the five training
# features, in order: Glucose, BloodPressure, Insulin, BMI, Age. Feeding all
# eight raw dataset columns does not fit the model's five-unit input layer.
input_values = [183, 64, 0, 23.3, 32]
# The extra list level makes this a batch of one: shape (1, 5).
input_tensor = torch.tensor([input_values], dtype=torch.float32)



In [None]:
# Inference only, so gradient tracking is switched off.
with torch.no_grad():
    output = model(input_tensor)

# The single model output, expressed as a percentage.
diabetes_chance_percent = output.item() * 100
print("chances of diabetes:", diabetes_chance_percent)

In [None]:
# Persist only the model's state_dict (its learned parameters), not the model
# object itself, to 'model_weights.pth'.
torch.save(model.state_dict(), 'model_weights.pth')
