In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Load the dataset (expects 'diabetes.csv' in the current working directory)
diabetes_df = pd.read_csv('diabetes.csv')

# Display the first few rows of the dataframe
print(diabetes_df.head())

# Print the shape of the dataframe
print("Shape of the dataframe:", diabetes_df.shape)

# Print the distribution of the target variable 'Outcome'
print("Value counts of 'Outcome':")
print(diabetes_df['Outcome'].value_counts())

# Display basic information about the dataframe.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- doing so appends a stray "None" line.
print("Info about the dataframe:")
diabetes_df.info()

# Display basic statistics about the dataset
print("Descriptive statistics of the dataframe:")
print(diabetes_df.describe())

# Show the mean of each feature grouped by the 'Outcome' variable
print("Mean of features grouped by 'Outcome':")
print(diabetes_df.groupby('Outcome').mean())

# Separate features (X) and target variable (y)
X = diabetes_df.drop('Outcome', axis=1)
y = diabetes_df['Outcome']

# Split the data into training and testing sets BEFORE any scaling, so the
# held-out rows cannot influence the preprocessing statistics.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=1
)

# Standardize the features.
# The scaler learns its mean/std from the training rows only; the test rows
# are transformed with those same statistics. Fitting on the full dataset
# would leak test-set information into training and inflate the test score.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Initialize and train the Logistic Regression model
lg = LogisticRegression()
lg.fit(X_train, y_train)

# Make predictions on the training and test set
train_y_pred = lg.predict(X_train)
test_y_pred = lg.predict(X_test)

# Print the accuracy of the model
print('Train set Accuracy:', accuracy_score(y_train, train_y_pred))
print('Test set Accuracy:', accuracy_score(y_test, test_y_pred))

# Make a prediction on new input data.
# Values are given in the same order as the feature columns of diabetes_df
# (every column except 'Outcome').
input_data = (1, 85, 66, 29, 0, 26.6, 0.351, 31)

# Build a one-row frame carrying the feature column names, so the sample has
# the same layout the scaler was fitted on.
feature_names = diabetes_df.columns.drop('Outcome')
input_df = pd.DataFrame([input_data], columns=feature_names)

# The model was trained on standardized features, so the raw sample must go
# through the same fitted scaler before prediction; feeding raw values to
# lg.predict would compare unscaled numbers against scaled coefficients.
scaled_input_data = scaler.transform(input_df)
prediction = lg.predict(scaled_input_data)

# Interpret the prediction (predict returns an array with one label per row)
if prediction[0] == 1:
    print('This person has diabetes.')
else:
    print('This person does not have diabetes.')


   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4                     2.288   33        1  
Shape of the dataframe: (768, 9)
Value counts of 'Outcome':
Outcome
0    500
1    268
Name: count, dtype: int64
Info about the dataframe:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  -