In [1]:
import pandas as pd
import matplotlib as plt
import numpy as np
import seaborn as sns
from scipy.stats import skew, kurtosis, mode

df = pd.read_csv("diabetes.csv")
df.head(10)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
5,5,116,74,0,0,25.6,0.201,30,0
6,3,78,50,32,88,31.0,0.248,26,1
7,10,115,0,0,0,35.3,0.134,29,0
8,2,197,70,45,543,30.5,0.158,53,1
9,8,125,96,0,0,0.0,0.232,54,1


In [2]:
df.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [4]:
desc_stats = df.describe().T
desc_stats['mean'] = df.mean()
desc_stats['mode'] = df.mode().iloc[0]
desc_stats['median'] = df.median()
desc_stats['skewness'] = df.skew()
desc_stats['kurtosis'] = df.kurtosis()
desc_stats['var'] = df.var()

print(desc_stats[['mean', 'median', 'mode', 'std', 'var', 'skewness', 'kurtosis']])


                                mean    median    mode         std  \
Pregnancies                 3.845052    3.0000   1.000    3.369578   
Glucose                   120.894531  117.0000  99.000   31.972618   
BloodPressure              69.105469   72.0000  70.000   19.355807   
SkinThickness              20.536458   23.0000   0.000   15.952218   
Insulin                    79.799479   30.5000   0.000  115.244002   
BMI                        31.992578   32.0000  32.000    7.884160   
DiabetesPedigreeFunction    0.471876    0.3725   0.254    0.331329   
Age                        33.240885   29.0000  22.000   11.760232   
Outcome                     0.348958    0.0000   0.000    0.476951   

                                   var  skewness  kurtosis  
Pregnancies                  11.354056  0.901674  0.159220  
Glucose                    1022.248314  0.173754  0.640780  
BloodPressure               374.647271 -1.843608  5.180157  
SkinThickness               254.473245  0.109372 -0.520

In [5]:
# Only for any particular single column
col = 'Glucose'
print("Frequency:\n", df[col].value_counts().head())
print("Mean:", df[col].mean())
print("Median:", df[col].median())
print("Mode:", df[col].mode()[0])
print("Variance:", df[col].var())
print("Standard Deviation:", df[col].std())
print("Skewness:", skew(df[col]))
print("Kurtosis:", kurtosis(df[col]))


Frequency:
 Glucose
99     17
100    17
111    14
125    14
129    14
Name: count, dtype: int64
Mean: 120.89453125
Median: 117.0
Mode: 99
Variance: 1022.2483142519557
Standard Deviation: 31.97261819513622
Skewness: 0.17341395519987735
Kurtosis: 0.6288133337300685


In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error

X = df[['BMI']]
y = df['Glucose']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

model_lr = LinearRegression()
model_lr.fit(X_train, y_train)
y_pred = model_lr.predict(X_test)

# Evaluation
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print("Bivariate Regression R^2 :", r2)
print("Bivariate RMSE:", rmse)


Bivariate Regression R²: 0.02426504687557307
Bivariate RMSE: 32.21484563698747


In [None]:
X = df.drop(columns=['Glucose'])
y = df['Glucose']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

model_mlr = LinearRegression()
model_mlr.fit(X_train, y_train)
y_pred_mlr = model_mlr.predict(X_test)

# Evaluation
r2_mlr = r2_score(y_test, y_pred_mlr)
rmse_mlr = np.sqrt(mean_squared_error(y_test, y_pred_mlr))

print("Multiple Regression R^2:", r2_mlr)
print("Multiple Regression RMSE:", rmse_mlr)


Multiple Regression R²: 0.32482089347994414
Multiple Regression RMSE: 26.7978295609986
