# BOYANA RAHUL
# 21mid0103

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

##Data import

In [None]:
import pandas as pd

# Read the diabetes CSV into the DataFrame used by the rest of the notebook.
csv_path = "/content/diabetes.csv"
df = pd.read_csv(csv_path)

##Understanding the data

In [None]:
# Preview the first few rows to confirm the file loaded as expected.
df.head()

In [None]:
# (rows, columns) of the raw dataset.
df.shape

In [None]:
# Column labels of the raw dataset.
df.columns

In [None]:
# Per-column dtype and non-null count summary.
df.info()

In [None]:
# Summary statistics of the numeric columns.
df.describe()

In [None]:
# Displays the Glucose column cast to float64. The result is not assigned,
# so df itself is left unchanged by this cell.
df['Glucose'].astype("float64")

##Preprocessing

In [None]:
# Cast the listed columns to float in a single astype call.
columns_to_change = [
    'Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness',
    'Insulin', 'Age', 'Outcome',
]
df = df.astype(dict.fromkeys(columns_to_change, float))

In [None]:
# Re-check the column dtypes after the float conversion.
df.info()

In [None]:
# Number of missing (NaN) entries per column.
df.isna().sum()

##Visualisation

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# All columns of the dataset; this list is reused by the outlier-removal step.
columns = [
    'Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
    'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome',
]

# One box plot per column, each rendered as its own figure.
for col_name in columns:
    sns.boxplot(y=df[col_name])
    plt.show()

##Handling outliers


In [None]:
def remove_outliers(df, columns):
    """Drop rows that fall outside the 1.5 * IQR fences of the given columns.

    Columns are handled one after another, so the quartiles for each column
    are computed on the rows that survived the filters of the earlier ones.

    Args:
        df: DataFrame to filter. It is not modified.
        columns: Iterable of column labels to filter on, in order.

    Returns:
        The filtered DataFrame. Fence values themselves are kept (the range
        check is inclusive on both ends).
    """
    filtered = df
    for name in columns:
        values = filtered[name]
        q_low = values.quantile(0.25)
        q_high = values.quantile(0.75)
        spread = q_high - q_low
        # Tukey fences: 1.5 interquartile ranges beyond each quartile.
        fence_low = q_low - 1.5 * spread
        fence_high = q_high + 1.5 * spread
        filtered = filtered[values.between(fence_low, fence_high)]
    return filtered
# Remove outliers from the feature columns only. 'Outcome' is the 0/1 target
# label: running the IQR filter on it could discard an entire class whenever
# that class makes up less than a quarter of the rows (both quartiles then
# coincide and the fences collapse onto the majority value).
feature_columns = [name for name in columns if name != 'Outcome']
df_cleaned = remove_outliers(df, feature_columns)

In [None]:
# (rows, columns) remaining after outlier removal.
df_cleaned.shape

In [None]:
# Preview the first few rows of the filtered data.
df_cleaned.head()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Same column list as for the raw-data plots, so the figures are comparable.
columns = [
    'Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
    'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome',
]

# One box plot per column of the outlier-filtered data.
for col_name in columns:
    sns.boxplot(y=df_cleaned[col_name])
    plt.show()

##Correlation

In [None]:
# Correlation of every column with 'Outcome', strongest positive first.
corr_matrix = df_cleaned.corr()
corr_matrix['Outcome'].sort_values(ascending=False)

In [None]:
# Number of rows per 'Outcome' value after outlier removal.
df_cleaned['Outcome'].value_counts()

In [None]:
# Target is the 'Outcome' column; the features are all remaining columns
# except 'SkinThickness'.
Y = df_cleaned['Outcome']
X = df_cleaned.drop(columns=['Outcome', 'SkinThickness'])

##Normalization

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Rescale every feature column with min-max scaling.
# NOTE(review): the scaler is fitted on all rows of X, i.e. before any
# train/test split is made - confirm that is intended.
scaler = MinMaxScaler()
X_scaled = scaler.fit(X).transform(X)

##Dataset Split

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Split the unscaled features first, then fit the scaler on the training rows
# only. Fitting MinMaxScaler on the whole dataset before splitting lets the
# test rows' min/max influence the training features (data leakage) and makes
# the test scores optimistic.
# The same random_state and test_size give the same row partition as splitting
# the pre-scaled array would.
X_train_raw, X_test_raw, ytrain, ytest = train_test_split(
    X, Y, random_state=32, test_size=0.2)

scaler = MinMaxScaler().fit(X_train_raw)
xtrain = scaler.transform(X_train_raw)
# Test rows are transformed with the training min/max, so individual values
# may fall slightly outside [0, 1].
xtest = scaler.transform(X_test_raw)

In [None]:
# Print the dimensions of each split, in the order
# train features, test features, train labels, test labels.
for split in (xtrain, xtest, ytrain, ytest):
    print(split.shape)

##Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Baseline random forest with tree depth capped at 10 and a fixed seed.
model = RandomForestClassifier(max_depth=10, random_state=1).fit(xtrain, ytrain)

# Accuracy on the rows the forest was fitted on.
pred_train = model.predict(xtrain)
train_score = accuracy_score(ytrain, pred_train)
print('train_accuracy_score', train_score)

# Accuracy on the held-out rows.
pred_val = model.predict(xtest)
val_score = accuracy_score(ytest, pred_val)
print('val_accuracy_score', val_score)

##Hyperparameter tuning (Random Forest)

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

# Search space: number of trees and maximum depth, drawn as integers from the
# given ranges (scipy's randint excludes the upper bound).
param_dist = {'n_estimators': randint(1,1000),
              'max_depth': randint(1,100)}

# Seed both the forest and the sampler so the candidates drawn and the
# resulting scores are reproducible, in line with the seeded models used
# elsewhere in this notebook.
rf = RandomForestClassifier(random_state=1)

# Use random search (5 candidates, 3-fold CV) to find the best hyperparameters
rand_search = RandomizedSearchCV(rf,param_distributions = param_dist,n_iter=5,
                                 cv=3, random_state=1)
rand_search.fit(xtrain, ytrain)
# Create a variable for the best model (refitted on all of xtrain by the search)
best_rf = rand_search.best_estimator_

# Print the best hyperparameters
print('Best hyperparameters:',  rand_search.best_params_)

In [None]:
# Score the tuned random forest on both splits.
pred_train = best_rf.predict(xtrain)
pred_val = best_rf.predict(xtest)

train_score = accuracy_score(ytrain, pred_train)
print('train_accuracy_score', train_score)

val_score = accuracy_score(ytest, pred_val)
print('val_accuracy_score', val_score)

In [None]:
# Classification report on the test split for the random forest.
# When the cells are run top to bottom, pred_val was last assigned from
# best_rf, so this describes the tuned model rather than the baseline.
from sklearn.metrics import classification_report
print(classification_report(ytest,pred_val))

##Decision tree


In [None]:

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of a shallow, regularised tree on the training split:
# print the per-fold scores followed by their mean.
dt = DecisionTreeClassifier(random_state=1, max_depth=3, min_samples_leaf=10)
scores = cross_val_score(estimator=dt, X=xtrain, y=ytrain, cv=5)
print(scores)
print(scores.mean())


In [None]:
# Fit a decision tree with default settings (fixed seed) on the training
# split and predict labels for both splits.
dt = DecisionTreeClassifier(random_state=1).fit(xtrain, ytrain)

y_pred = dt.predict(xtest)
y_pred_train = dt.predict(xtrain)


In [None]:
# Accuracy of the decision tree on each split (accuracy does not depend on
# which argument is treated as the reference).
train_acc = accuracy_score(ytrain, y_pred_train)
test_acc = accuracy_score(ytest, y_pred)
print('Accuracy of Decision Tree-Train: ', train_acc)
print('Accuracy of Decision Tree-Test: ', test_acc)

In [None]:
from sklearn.metrics import classification_report

# Classification report on the test split for the decision tree, before any
# hyperparameter tuning.
dt_report = classification_report(ytest, y_pred)
print(dt_report)

##Decision tree after hyperparameter tuning

In [None]:
# Base estimator for the search: a decision tree with a fixed seed.
dt = DecisionTreeClassifier(random_state=1)

# Integer ranges to sample from (scipy's randint excludes the upper bound).
# - min_samples_split starts at 2 because scikit-learn rejects an integer
#   value of 1 for this parameter.
# - min_samples_leaf is a distribution; a plain tuple (1, 300) would be read
#   as a list of two candidate values, so only 1 and 300 would ever be tried.
params = {'max_depth' : randint(1,300),
        'min_samples_split': randint(2,300),
        'min_samples_leaf': randint(1,300)}

# Search over the decision tree `dt` (not the random forest `rf`), so the
# reported parameters are tuned for the model that is built from them.
# random_state makes the sampled candidates reproducible.
rand_search1 = RandomizedSearchCV(dt,
                                 param_distributions = params,
                                 n_iter=3,
                                 cv=3,
                                 random_state=1)
rand_search1.fit(xtrain, ytrain)

# Print the best hyperparameters
print('Best hyperparameters:',  rand_search1.best_params_)

In [None]:
# Build a decision tree from the parameters selected by the random search
# and refit it on the training split.
best_params = rand_search1.best_params_
dt = DecisionTreeClassifier(random_state=1, **best_params)
dt.fit(xtrain, ytrain)

# Predicted labels plus the probability in the second class column of
# predict_proba, for each split.
y_pred_train, y_prob_train = dt.predict(xtrain), dt.predict_proba(xtrain)[:, 1]
y_pred, y_prob = dt.predict(xtest), dt.predict_proba(xtest)[:, 1]

In [None]:
# Accuracy of the refitted decision tree on each split (accuracy does not
# depend on which argument is treated as the reference).
train_acc = accuracy_score(ytrain, y_pred_train)
test_acc = accuracy_score(ytest, y_pred)
print('Accuracy of Decision Tree-Train: ', train_acc)
print('Accuracy of Decision Tree-Test: ', test_acc)

In [None]:
# Classification report on the test split after hyperparameter tuning.
tuned_dt_report = classification_report(ytest, y_pred)
print(tuned_dt_report)

##SVM

In [None]:
# Import svm model
from sklearn import svm

# Support vector classifier with a degree-2 polynomial kernel.
# Alternative configurations:
#   kernel='linear'
#   kernel='rbf', gamma=0.5
#   kernel='rbf', gamma=0.5, C=2
clf = svm.SVC(degree=2, kernel='poly')

# Fit on the training split, then predict labels for the test split.
clf.fit(xtrain, ytrain)
y_pred = clf.predict(xtest)

In [None]:
from sklearn.metrics import classification_report

# Test-split accuracy and classification report for the SVM before
# hyperparameter tuning.
svm_accuracy = accuracy_score(ytest, y_pred)
print("Accuracy:", svm_accuracy)
print(classification_report(ytest, y_pred))


In [None]:
# Performing CV to tune parameters for best SVM fit.
# The polynomial degree is limited to 1-5: with only three sampled candidates,
# a 1-99 range would almost always pick very high degrees, which make the
# polynomial kernel numerically ill-conditioned and are rarely useful.
paramssvm = {'C' : randint(1,100),
             'degree' : randint(1,6)}

# random_state makes the sampled candidates reproducible between runs.
rand_search2 = RandomizedSearchCV(clf,param_distributions = paramssvm,
                                 n_iter=3,
                                 cv=3,
                                 random_state=1)
rand_search2.fit(xtrain, ytrain)
# Print the best hyperparameters
print('Best hyperparameters:',  rand_search2.best_params_)

In [None]:
# Evaluate the best SVM found by the random search on the test split and
# report its accuracy as a percentage with two decimals.
final_model = rand_search2.best_estimator_
Y_pred = final_model.predict(xtest)
pol_accuracy = accuracy_score(ytest, Y_pred)

print('Accuracy (poly Kernel): ', format(pol_accuracy * 100, ".2f"))


##Neural network

In [None]:
from keras.models import Sequential
from keras.layers import Dense
model=Sequential()

# First dense layer (7 units). No explicit input shape is given, so the
# model is built when it first sees data.
model.add(Dense(7,activation="relu"))

# Hidden layer
model.add(Dense(64,activation="relu"))

# Output: a single sigmoid unit, i.e. one probability per row
model.add(Dense(1,activation='sigmoid'))

# binary_crossentropy is the loss that matches one sigmoid output with 0/1
# labels. categorical_crossentropy expects one-hot targets spread over several
# output units; with a single unit it evaluates to (near) zero for every row,
# so the network would receive no useful gradient.
model.compile(optimizer='adam',loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
# Train for 10 epochs in batches of 32, evaluating on the test split after
# each epoch, then print the layer summary.
model.fit(
    x=xtrain,
    y=ytrain.values,
    batch_size=32,
    epochs=10,
    validation_data=(xtest, ytest.values),
)
model.summary()

In [None]:
# Plot every series recorded in the Keras training history, one line each.
# NOTE(review): the title text contains a typo ("Traing"); left unchanged here.
loss_df = pd.DataFrame(data=model.history.history)
history_axes = loss_df.plot(figsize=(12, 6))
history_axes.set_title("Traing loss and validation loss")

In [None]:
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Error between the labels and the network's sigmoid outputs on the
# training split.
y_tpred = model.predict(xtrain)
tmse = mean_squared_error(ytrain, y_tpred)
trmse = np.sqrt(tmse)
print("Training Mean Squared Error:", tmse)
print("Training Root Mean Squared Error:", trmse)

# Same error measures, plus mean absolute error, on the test split.
y_pred = model.predict(xtest)
mse = mean_squared_error(ytest, y_pred)
mae = mean_absolute_error(ytest, y_pred)
rmse = np.sqrt(mse)

print("Mean Squared Error:", mse)
print("Root Mean Squared Error:", rmse)
print("Mean Absolute Error:", mae)
