In [62]:
import pandas as pd
from scipy import stats
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score

In [63]:
# Read the diabetes dataset (path is relative to the working directory)
# into the DataFrame used by every later cell.
data = pd.read_csv(filepath_or_buffer='diabetes.csv')

In [64]:
# Univariate analysis: per-column count, mean, std, min, quartiles and max.
# The quartiles are written out explicitly; they are the same ones
# describe() reports by default.
data.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [65]:
# Spread and shape of each column's distribution.
# - var():      variance (the square of the std reported by describe()).
# - skew():     asymmetry; positive values mean a longer right tail.
# - kurtosis(): pandas reports excess kurtosis (Fisher's definition,
#               a normal distribution scores 0).
# sep='\n' puts each heading on its own line, so the first row of every
# Series is no longer shifted right by print's default space separator.
print('Variance', data.var(), sep='\n')
print('\n\nSkewness', data.skew(), sep='\n')
print('\n\nKurtosis', data.kurtosis(), sep='\n')

Frequency
 Pregnancies                    11.354056
Glucose                      1022.248314
BloodPressure                 374.647271
SkinThickness                 254.473245
Insulin                     13281.180078
BMI                            62.159984
DiabetesPedigreeFunction        0.109779
Age                           138.303046
Outcome                         0.227483
dtype: float64


Skewness
 Pregnancies                 0.901674
Glucose                     0.173754
BloodPressure              -1.843608
SkinThickness               0.109372
Insulin                     2.272251
BMI                        -0.428982
DiabetesPedigreeFunction    1.919911
Age                         1.129597
Outcome                     0.635017
dtype: float64


Kurtosis
 Pregnancies                 0.159220
Glucose                     0.640780
BloodPressure               5.180157
SkinThickness              -0.520072
Insulin                     7.214260
BMI                         3.290443
DiabetesPed

In [66]:
# Simple linear regression: model BMI as a linear function of Glucose.
X = data.loc[:, ['Glucose']]  # list selector keeps X two-dimensional
y = data.loc[:, 'BMI']
linear_model = LinearRegression().fit(X, y)  # fit() returns the estimator
y_pred = linear_model.predict(X)  # predictions on the same rows used to fit

# Fit summary. Both metrics are in-sample: there is no held-out split here,
# so they describe goodness of fit rather than predictive performance.
print("Intercept:", linear_model.intercept_)
print("Slope:", linear_model.coef_[0])
print("R^2 Score:", r2_score(y, y_pred))
print("Mean Squared Error:", mean_squared_error(y, y_pred))

Intercept: 25.402116839076637
Slope: 0.05451413904153221
R^2 Score: 0.04887241775173845
Mean Squared Error: 59.04509338515581


In [67]:
# Logistic regression: predict Outcome from Pregnancies and Glucose.
X = data[['Pregnancies', 'Glucose']]
y = data['Outcome']

# Hold out 30% of the rows for evaluation. random_state pins the shuffle so
# the split, and therefore the accuracy printed below, is identical on every
# Restart & Run All instead of drifting from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# max_iter is raised above the library default to give the solver more
# iterations to converge.
logistic_model = LogisticRegression(max_iter=200)
logistic_model.fit(X_train, y_train)
y_pred = logistic_model.predict(X_test)

# Share of held-out rows whose predicted class matches the true Outcome.
print("Accuracy Score:", accuracy_score(y_test, y_pred))

Accuracy Score: 0.7229437229437229


In [68]:
# Multiple linear regression: ordinary least squares of Outcome on seven
# predictors.
# NOTE(review): Outcome only takes values between 0 and 1 in the summary
# table, so this is a linear fit to what looks like a binary label; and
# 'Pregnancies' is not among the predictors -- confirm both are intended.
X = data[
    [
        'Glucose',
        'BloodPressure',
        'SkinThickness',
        'Insulin',
        'BMI',
        'DiabetesPedigreeFunction',
        'Age',
    ]
]
y = data['Outcome']

multiple_reg_model = LinearRegression().fit(X, y)  # fit() returns the estimator
y_pred = multiple_reg_model.predict(X)  # in-sample predictions

# Coefficients are printed in the same order as the columns of X.
# R^2 and MSE are computed on the rows the model was fitted to.
print("\nMultiple Linear Regression:")
print("Intercept:", multiple_reg_model.intercept_)
print("Coefficients:", multiple_reg_model.coef_)
print("R^2 Score:", r2_score(y, y_pred))
print("Mean Squared Error:", mean_squared_error(y, y_pred))


Multiple Linear Regression:
Intercept: -0.8811057082317502
Coefficients: [ 0.0059325  -0.00227884  0.00016698 -0.00020962  0.01331084  0.13767816
  0.00580068]
R^2 Score: 0.2884622281209426
Mean Squared Error: 0.16165171548087853
