In [None]:
# Install required packages (run this first in Google Colab)
!pip install numpy pandas matplotlib

# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from google.colab import drive
import warnings
warnings.filterwarnings('ignore')

print("NumPy version:", np.__version__)
print("Pandas version:", pd.__version__)

NumPy version: 2.0.2
Pandas version: 2.2.2


In [None]:
# Creating NumPy arrays
arr1 = np.array([1, 2, 3, 4, 5])
arr2 = np.array([[1, 2, 3], [4, 5, 6]])

print("1D Array:", arr1)
print("2D Array:\n", arr2)
print("Array shape:", arr2.shape)
print("Array dtype:", arr1.dtype)

# Different ways to create arrays
zeros_array = np.zeros((3, 4))
ones_array = np.ones((2, 3))
range_array = np.arange(0, 10, 2)
linspace_array = np.linspace(0, 1, 5)

print("Zeros array:\n", zeros_array)
print("Ones array:\n", ones_array)
print("Range array:", range_array)
print("Linspace array:", linspace_array)

1D Array: [1 2 3 4 5]
2D Array:
 [[1 2 3]
 [4 5 6]]
Array shape: (2, 3)
Array dtype: int64
Zeros array:
 [[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]
Ones array:
 [[1. 1. 1.]
 [1. 1. 1.]]
Range array: [0 2 4 6 8]
Linspace array: [0.   0.25 0.5  0.75 1.  ]


In [None]:
# Mathematical operations on arrays
a = np.array([1, 2, 3, 4, 5])
b = np.array([2, 3, 4, 5, 6])

# Element-wise operations
print("Addition:", a + b)
print("Multiplication:", a * b)
print("Power:", a ** 2)
print("Square root:", np.sqrt(a))

# Statistical operations
print("Mean:", np.mean(a))
print("Standard deviation:", np.std(a))
print("Min:", np.min(a))
print("Max:", np.max(a))

# Broadcasting example
matrix = np.array([[1, 2, 3], [4, 5, 6]])
vector = np.array([10, 20, 30])
result = matrix + vector
print("Broadcasting result:\n", result)

Addition: [ 3  5  7  9 11]
Multiplication: [ 2  6 12 20 30]
Power: [ 1  4  9 16 25]
Square root: [1.         1.41421356 1.73205081 2.         2.23606798]
Mean: 3.0
Standard deviation: 1.4142135623730951
Min: 1
Max: 5
Broadcasting result:
 [[11 22 33]
 [14 25 36]]


In [None]:
arr = np.array([10, 20, 30, 40, 50])

# Indexing
print("Element at index 2:", arr[2])

# Slicing
print("Sliced array (from index 1 to 3):", arr[1:4])

# 2D array slicing
arr_2d = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
print("Sliced 2D array (first two rows):\n", arr_2d[:2])

Element at index 2: 30
Sliced array (from index 1 to 3): [20 30 40]
Sliced 2D array (first two rows):
 [[1 2 3]
 [4 5 6]]


In [None]:
data = {'Name': ['Alice', 'Bob', 'Charlie'],
        'Age': [24, 27, 22],
        'City': ['New York', 'Los Angeles', 'Chicago']}

df = pd.DataFrame(data)
print("DataFrame:\n", df)

DataFrame:
       Name  Age         City
0    Alice   24     New York
1      Bob   27  Los Angeles
2  Charlie   22      Chicago


In [None]:
# Basic DataFrame information
print("\nDataFrame Info:")
print(df.info())
print("\nDataFrame Description:")
print(df.describe())
print("\nDataFrame Shape:", df.shape)
print("Column names:", df.columns.tolist())
# Display first 5 rows
print("First 5 rows of the dataset:")
print(df.head())
# Display last 5 rows
print("\nLast 5 rows of the dataset:")
print(df.tail())
# Check data types
print("\nData Types:")
print(df.dtypes)


DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Name    3 non-null      object
 1   Age     3 non-null      int64 
 2   City    3 non-null      object
dtypes: int64(1), object(2)
memory usage: 204.0+ bytes
None

DataFrame Description:
             Age
count   3.000000
mean   24.333333
std     2.516611
min    22.000000
25%    23.000000
50%    24.000000
75%    25.500000
max    27.000000

DataFrame Shape: (3, 3)
Column names: ['Name', 'Age', 'City']
First 5 rows of the dataset:
      Name  Age         City
0    Alice   24     New York
1      Bob   27  Los Angeles
2  Charlie   22      Chicago

Last 5 rows of the dataset:
      Name  Age         City
0    Alice   24     New York
1      Bob   27  Los Angeles
2  Charlie   22      Chicago

Data Types:
Name    object
Age      int64
City    object
dtype: object


In [None]:
# Accessing a single column
print("Names column:\n", df['Name'])

# Accessing multiple columns
print("\nName and Age columns:\n", df[['Name', 'Age']])

# Accessing rows using .loc[] (by label)
print("\nRow 1:\n", df.loc[0])

# Accessing rows using .iloc[] (by position)
print("\nRow at position 1:\n", df.iloc[1])

Names column:
 0      Alice
1        Bob
2    Charlie
Name: Name, dtype: object

Name and Age columns:
       Name  Age
0    Alice   24
1      Bob   27
2  Charlie   22

Row 1:
 Name       Alice
Age           24
City    New York
Name: 0, dtype: object

Row at position 1:
 Name            Bob
Age              27
City    Los Angeles
Name: 1, dtype: object


In [None]:
df_with_na = df.copy()
df_with_na.loc[1, 'Age'] = None

print("DataFrame with NaN:\n", df_with_na)

# Filling missing values
df_with_na['Age'] = df_with_na['Age'].fillna(df_with_na['Age'].mean())
print("\nDataFrame after filling NaN:\n", df_with_na)

# Dropping rows with NaN values
df_cleaned = df_with_na.dropna()
print("\nDataFrame after dropping rows with NaN:\n", df_cleaned)

DataFrame with NaN:
       Name   Age         City
0    Alice  24.0     New York
1      Bob   NaN  Los Angeles
2  Charlie  22.0      Chicago

DataFrame after filling NaN:
       Name   Age         City
0    Alice  24.0     New York
1      Bob  23.0  Los Angeles
2  Charlie  22.0      Chicago

DataFrame after dropping rows with NaN:
       Name   Age         City
0    Alice  24.0     New York
1      Bob  23.0  Los Angeles
2  Charlie  22.0      Chicago


In [None]:
# Convert NumPy array to DataFrame
np_array = np.random.randn(5, 3)
df_from_array = pd.DataFrame(np_array, columns=['A', 'B', 'C'])
print("DataFrame from NumPy array:")
print(df_from_array)

DataFrame from NumPy array:
          A         B         C
0  0.404519  0.402058 -0.027059
1 -2.031816 -0.081036  0.321894
2 -1.005860  0.270560  0.638147
3 -0.966869  0.514774  0.015928
4  0.872075  0.298183  0.956881


In [None]:
# Create sample sales data
sales_data = {
    'Region': ['North', 'South', 'East', 'West', 'North', 'South', 'East', 'West'],
    'Product': ['A', 'A', 'B', 'B', 'A', 'B', 'A', 'B'],
    'Sales': [100, 150, 200, 120, 110, 180, 190, 140],
    'Quantity': [10, 15, 20, 12, 11, 18, 19, 14]
}
sales_df = pd.DataFrame(sales_data)
print("Sales DataFrame:")
print(sales_df)

# Group by operations
region_sales = sales_df.groupby('Region')['Sales'].sum()
print("\nSales by Region:")
print(region_sales)

# Multiple groupby
product_region = sales_df.groupby(['Region', 'Product'])['Sales'].agg(['sum', 'mean', 'count'])
print("\nSales by Region and Product:")
print(product_region)

Sales DataFrame:
  Region Product  Sales  Quantity
0  North       A    100        10
1  South       A    150        15
2   East       B    200        20
3   West       B    120        12
4  North       A    110        11
5  South       B    180        18
6   East       A    190        19
7   West       B    140        14

Sales by Region:
Region
East     390
North    210
South    330
West     260
Name: Sales, dtype: int64

Sales by Region and Product:
                sum   mean  count
Region Product                   
East   A        190  190.0      1
       B        200  200.0      1
North  A        210  105.0      2
South  A        150  150.0      1
       B        180  180.0      1
West   B        260  130.0      2


In [None]:
# Create sample DataFrames for merging
df1 = pd.DataFrame({
    'ID': [1, 2, 3, 4],
    'Name': ['Alice', 'Bob', 'Charlie', 'David'],
    'Department': ['IT', 'HR', 'Finance', 'IT']
})

df2 = pd.DataFrame({
    'ID': [1, 2, 5, 6],
    'Salary': [50000, 60000, 70000, 80000],
    'Experience': [2, 5, 3, 8]
})

print("DataFrame 1:")
print(df1)
print("\nDataFrame 2:")
print(df2)

# Different types of merges
inner_merge = pd.merge(df1, df2, on='ID', how='inner')
print("\nInner merge:")
print(inner_merge)

left_merge = pd.merge(df1, df2, on='ID', how='left')
print("\nLeft merge:")
print(left_merge)

right_merge = pd.merge(df1, df2, on='ID', how='right')
print("\nRight merge:")
print(right_merge)

outer_merge = pd.merge(df1, df2, on='ID', how='outer')
print("\nOuter merge:")
print(outer_merge)

# Concatenation
df_concat = pd.concat([df1, df2], ignore_index=True, sort=False)
print("\nConcatenated DataFrame:")
print(df_concat)

DataFrame 1:
   ID     Name Department
0   1    Alice         IT
1   2      Bob         HR
2   3  Charlie    Finance
3   4    David         IT

DataFrame 2:
   ID  Salary  Experience
0   1   50000           2
1   2   60000           5
2   5   70000           3
3   6   80000           8

Inner merge:
   ID   Name Department  Salary  Experience
0   1  Alice         IT   50000           2
1   2    Bob         HR   60000           5

Left merge:
   ID     Name Department   Salary  Experience
0   1    Alice         IT  50000.0         2.0
1   2      Bob         HR  60000.0         5.0
2   3  Charlie    Finance      NaN         NaN
3   4    David         IT      NaN         NaN

Right merge:
   ID   Name Department  Salary  Experience
0   1  Alice         IT   50000           2
1   2    Bob         HR   60000           5
2   5    NaN        NaN   70000           3
3   6    NaN        NaN   80000           8

Outer merge:
   ID     Name Department   Salary  Experience
0   1    Alice         I

In [None]:
from google.colab import drive
import warnings
warnings.filterwarnings('ignore')

In [None]:
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
file_path = '/content/drive/MyDrive/datasets/diabetes.csv'
columns = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome']
df = pd.read_csv(file_path, names=columns)  # No header in raw CSV
print("Dataset loaded successfully. Shape:", df.shape)

Dataset loaded successfully. Shape: (769, 9)


In [None]:
from google.colab import files
uploaded = files.upload()

# Load the uploaded file
import io
df = pd.read_csv(io.BytesIO(uploaded['diabetes.csv']))

Saving diabetes.csv to diabetes.csv


In [None]:
data_array = df.values
print("NumPy array shape:", data_array.shape)
print("Data type:", data_array.dtype)

# Get specific columns as arrays
glucose = df['Glucose'].values
bmi = df['BMI'].values
outcome = df['Outcome'].values

NumPy array shape: (768, 9)
Data type: float64


In [None]:
print("Glucose - Mean:", np.mean(glucose))
print("Glucose - Median:", np.median(glucose))
print("Glucose - Std:", np.std(glucose))
print("Glucose - Min/Max:", np.min(glucose), np.max(glucose))

# BMI statistics
print("\nBMI - Mean:", np.mean(bmi))
print("BMI - 25th percentile:", np.percentile(bmi, 25))
print("BMI - 75th percentile:", np.percentile(bmi, 75))

Glucose - Mean: 120.89453125
Glucose - Median: 117.0
Glucose - Std: 31.95179590820272
Glucose - Min/Max: 0 199

BMI - Mean: 31.992578124999998
BMI - 25th percentile: 27.3
BMI - 75th percentile: 36.6


In [None]:
# Find patients with high glucose (>140)
high_glucose = glucose > 140
print("Patients with high glucose:", np.sum(high_glucose))

# Find patients with normal BMI (18.5-24.9)
normal_bmi = (bmi >= 18.5) & (bmi < 25)
print("Patients with normal BMI:", np.sum(normal_bmi))

# Combine conditions
healthy_patients = (glucose <= 140) & (bmi < 25)
print("Healthy patients:", np.sum(healthy_patients))

Patients with high glucose: 192
Patients with normal BMI: 102
Healthy patients: 107


In [None]:
# Quick overview
print(df.head())
print("\nDataset info:")
print(df.info())

# Basic statistics
print("\nDescriptive statistics:")
print(df.describe())

   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4                     2.288   33        1  

Dataset info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose       

In [None]:
# Select single column
ages = df['Age']
print("Average age:", ages.mean())

# Select multiple columns
health_metrics = df[['Glucose', 'BMI', 'BloodPressure']]
print(health_metrics.head())

# Filter rows
diabetic_patients = df[df['Outcome'] == 1]
print("Number of diabetic patients:", len(diabetic_patients))

# Complex filtering
young_diabetic = df[(df['Age'] < 30) & (df['Outcome'] == 1)]
print("Young diabetic patients:", len(young_diabetic))

Average age: 33.240885416666664
   Glucose   BMI  BloodPressure
0      148  33.6             72
1       85  26.6             66
2      183  23.3             64
3       89  28.1             66
4      137  43.1             40
Number of diabetic patients: 268
Young diabetic patients: 84


In [None]:
print("Missing values in each column:")
print(df.isnull().sum())

# In this dataset, 0 might mean missing data for some columns
# Let's check where we have 0 values
print("\nZero values in each column:")
print((df == 0).sum())

Missing values in each column:
Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

Zero values in each column:
Pregnancies                 111
Glucose                       5
BloodPressure                35
SkinThickness               227
Insulin                     374
BMI                          11
DiabetesPedigreeFunction      0
Age                           0
Outcome                     500
dtype: int64


In [None]:
# BMI has 11 zero values that should be missing (BMI can't be 0!)
# Step 1: Check zeros first
print("Zero values in BMI:", (df['BMI'] == 0).sum())

# Step 2: Replace 0 with NaN (missing) - IMPORTANT: use np.nan
df['BMI'] = df['BMI'].replace(0, np.nan)

# Step 3: Check that replacement worked
print("Missing values after replacement:", df['BMI'].isnull().sum())

# Step 4: Calculate the average (ignoring missing values)
bmi_average = df['BMI'].mean()
print("Average BMI:", round(bmi_average, 1))

# Step 5: Fill missing values with average - SUPER EASY!
df['BMI'] = df['BMI'].fillna(bmi_average)

# Step 6: Final check - should be 0 missing values now
print("Missing values after filling:", df['BMI'].isnull().sum())
print("Zero values after filling:", (df['BMI'] == 0).sum())

# Step 7: Show before and after
print("\nBMI Summary after fixing:")
print("Min BMI:", round(df['BMI'].min(), 1))
print("Max BMI:", round(df['BMI'].max(), 1))
print("Average BMI:", round(df['BMI'].mean(), 1))

Zero values in BMI: 11
Missing values after replacement: 11
Average BMI: 32.5
Missing values after filling: 0
Zero values after filling: 0

BMI Summary after fixing:
Min BMI: 18.2
Max BMI: 67.1
Average BMI: 32.5


In [None]:
# Do the same for other columns that have impossible zeros
columns_to_fix = ['BloodPressure', 'Glucose', 'SkinThickness']

for column in columns_to_fix:
    # Step 1: Check zeros first
    zeros_before = (df[column] == 0).sum()
    print(f"\n{column} - Zero values before: {zeros_before}")

    # Step 2: Replace 0 with missing (np.nan)
    df[column] = df[column].replace(0, np.nan)

    # Step 3: Check replacement worked
    missing_after = df[column].isnull().sum()
    print(f"{column} - Missing values after replacement: {missing_after}")

    # Step 4: Fill with average
    average_value = df[column].mean()
    df[column] = df[column].fillna(average_value)

    # Step 5: Final check
    print(f"{column} - filled with average: {average_value:.1f}")
    print(f"{column} - Missing values after filling: {df[column].isnull().sum()}")


BloodPressure - Zero values before: 35
BloodPressure - Missing values after replacement: 35
BloodPressure - filled with average: 72.4
BloodPressure - Missing values after filling: 0

Glucose - Zero values before: 5
Glucose - Missing values after replacement: 5
Glucose - filled with average: 121.7
Glucose - Missing values after filling: 0

SkinThickness - Zero values before: 227
SkinThickness - Missing values after replacement: 227
SkinThickness - filled with average: 29.2
SkinThickness - Missing values after filling: 0


In [None]:
# Compare diabetic vs non-diabetic patients
diabetic_patients = df[df['Outcome'] == 1]
healthy_patients = df[df['Outcome'] == 0]

print("Diabetic patients - Average age:", diabetic_patients['Age'].mean())
print("Healthy patients - Average age:", healthy_patients['Age'].mean())

print("Diabetic patients - Average glucose:", diabetic_patients['Glucose'].mean())
print("Healthy patients - Average glucose:", healthy_patients['Glucose'].mean())

Diabetic patients - Average age: 37.06716417910448
Healthy patients - Average age: 31.19
Diabetic patients - Average glucose: 142.16557285655603
Healthy patients - Average glucose: 110.71012057667103


In [None]:
# Group by outcome and get averages
summary = df.groupby('Outcome')[['Age', 'Glucose', 'BMI']].mean()
print("Summary by diabetes status:")
print(summary)

Summary by diabetes status:
               Age     Glucose        BMI
Outcome                                  
0        31.190000  110.710121  30.888434
1        37.067164  142.165573  35.384757


In [None]:
# TASK: Find patients with highest and lowest BMI

# Find highest BMI
highest_bmi = df['BMI'].max()
patient_highest = df[df['BMI'] == highest_bmi]
print("Highest BMI:", highest_bmi)
print("Patient with highest BMI:")
print(patient_highest[['Age', 'BMI', 'Outcome']])

# Find lowest BMI
lowest_bmi = df['BMI'].min()
patient_lowest = df[df['BMI'] == lowest_bmi]
print("\nLowest BMI:", lowest_bmi)
print("Patient with lowest BMI:")
print(patient_lowest[['Age', 'BMI', 'Outcome']])

Highest BMI: 67.1
Patient with highest BMI:
     Age   BMI  Outcome
177   26  67.1        1

Lowest BMI: 18.2
Patient with lowest BMI:
     Age   BMI  Outcome
418   27  18.2        0
438   21  18.2        0
526   21  18.2        0


In [None]:
# Count how many patients in different age groups

# Create age groups
young = df[df['Age'] < 30]
middle = df[(df['Age'] >= 30) & (df['Age'] < 50)]
senior = df[df['Age'] >= 50]

print("Age Groups:")
print("Young (under 30):", len(young))
print("Middle (30-49):", len(middle))
print("Senior (50+):", len(senior))

# How many have diabetes in each group?
young_diabetic = young[young['Outcome'] == 1]
middle_diabetic = middle[middle['Outcome'] == 1]
senior_diabetic = senior[senior['Outcome'] == 1]

print("\nDiabetes in each age group:")
print("Young with diabetes:", len(young_diabetic))
print("Middle with diabetes:", len(middle_diabetic))
print("Senior with diabetes:", len(senior_diabetic))

Age Groups:
Young (under 30): 396
Middle (30-49): 283
Senior (50+): 89

Diabetes in each age group:
Young with diabetes: 84
Middle with diabetes: 141
Senior with diabetes: 43


In [None]:
# Create a simple health score
# High glucose = -1 point, Normal glucose = +1 point
# High BMI = -1 point, Normal BMI = +1 point

# Create health score
df['HealthScore'] = 0

# Add points for normal glucose (under 140)
df.loc[df['Glucose'] < 140, 'HealthScore'] += 1
df.loc[df['Glucose'] >= 140, 'HealthScore'] -= 1

# Add points for normal BMI (under 30)
df.loc[df['BMI'] < 30, 'HealthScore'] += 1
df.loc[df['BMI'] >= 30, 'HealthScore'] -= 1

# Show results
print("Health Score Distribution:")
print(df['HealthScore'].value_counts().sort_index())

# Show average health score by diabetes status
health_by_outcome = df.groupby('Outcome')['HealthScore'].mean()
print("\nAverage Health Score:")
print("Healthy patients:", health_by_outcome[0])
print("Diabetic patients:", health_by_outcome[1])

Health Score Distribution:
HealthScore
-2    154
 0    372
 2    242
Name: count, dtype: int64

Average Health Score:
Healthy patients: 0.704
Diabetic patients: -0.6567164179104478
