In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
# pip install scikit-learn

In [2]:
# Sample data (house prices and sizes)
data = {'Price': [250000, 300000, 500000, 800000, 200000],
        'Size': [1200, 1800, 2500, 3200, 1500]}

df = pd.DataFrame(data)

In [3]:
df

Unnamed: 0,Price,Size
0,250000,1200
1,300000,1800
2,500000,2500
3,800000,3200
4,200000,1500


In [4]:
# Create a StandardScaler object
scaler = StandardScaler()

# Fit the scaler to the data and transform the features
df[['Price', 'Size']] = scaler.fit_transform(df[['Price', 'Size']])

In [5]:
df

Unnamed: 0,Price,Size
0,-0.727273,-1.162192
1,-0.5,-0.332055
2,0.409091,0.636438
3,1.772727,1.604931
4,-0.954545,-0.747123


In [6]:
df.describe()

Unnamed: 0,Price,Size
count,5.0,5.0
mean,0.0,0.0
std,1.118034,1.118034
min,-0.954545,-1.162192
25%,-0.727273,-0.747123
50%,-0.5,-0.332055
75%,0.409091,0.636438
max,1.772727,1.604931


In [7]:
# Sample data (exam scores and study hours)
data = {'Exam_Score': [65, 80, 92, 70, 55],
        'Study_Hours': [5, 8, 12, 6, 3]}
df = pd.DataFrame(data)

In [8]:
df

Unnamed: 0,Exam_Score,Study_Hours
0,65,5
1,80,8
2,92,12
3,70,6
4,55,3


In [9]:
from sklearn.preprocessing import MinMaxScaler

# Create a MinMaxScaler object
scaler = MinMaxScaler()

# Fit the scaler to the data and transform the features
df[['Exam_Score', 'Study_Hours']] = scaler.fit_transform(df[['Exam_Score', 'Study_Hours']])

In [10]:
df

Unnamed: 0,Exam_Score,Study_Hours
0,0.27027,0.222222
1,0.675676,0.555556
2,1.0,1.0
3,0.405405,0.333333
4,0.0,0.0


In [11]:
df.describe()

Unnamed: 0,Exam_Score,Study_Hours
count,5.0,5.0
mean,0.47027,0.422222
std,0.38346,0.380058
min,0.0,0.0
25%,0.27027,0.222222
50%,0.405405,0.333333
75%,0.675676,0.555556
max,1.0,1.0


In [1]:
import numpy as np
from scipy.stats import norm

# pip install scipy


In [4]:
# Set the random seed for reproducibility
np.random.seed(42)

# Generate exam scores for 100 students following a normal distribution
# Mean (µ) = 75, Standard Deviation (σ) = 8
exam_scores = np.random.normal(loc=75, scale=8, size=100)
exam_scores = np.round(exam_scores).astype(int)  # Round to nearest integer

# Create a Pandas DataFrame
df = pd.DataFrame({'Exam_Score': exam_scores})

In [5]:
df.head()

Unnamed: 0,Exam_Score
0,79
1,74
2,80
3,87
4,73


In [6]:
# --- Part 1: Percentage between 67 and 83 ---
# Calculate z-scores
df['z_score'] = (df['Exam_Score'] - df['Exam_Score'].mean()) / df['Exam_Score'].std()
df

Unnamed: 0,Exam_Score,z_score
0,79,0.654209
1,74,-0.030110
2,80,0.791073
3,87,1.749119
4,73,-0.166974
...,...,...
95,63,-1.535612
96,77,0.380481
97,77,0.380481
98,75,0.106754


In [16]:
# Filter for scores between z-scores of -1 and +1
students_within_one_std = df[(df['z_score'] >= -1) & (df['z_score'] <= 1)]

# Calculate the percentage
percentage_within_one_std = (len(students_within_one_std) / len(df)) * 100

print("\n--- Part 1 ---")
print(f"Percentage of students between scores 67 and 83: {percentage_within_one_std:.2f}%")


--- Part 1 ---
Percentage of students between scores 67 and 83: 67.00%


In [7]:
# Filter for scores between z-scores of -1 and +1
students_within_one_std = df[(df['z_score'] >= -1) & (df['z_score'] <= 1)]
students_within_one_std

Unnamed: 0,Exam_Score,z_score
0,79,0.654209
1,74,-0.030110
2,80,0.791073
4,73,-0.166974
5,73,-0.166974
...,...,...
94,72,-0.303838
96,77,0.380481
97,77,0.380481
98,75,0.106754


In [8]:
# Calculate the percentage
percentage_within_one_std = (len(students_within_one_std) / len(df)) * 100
percentage_within_one_std

67.0

In [9]:
print("\n--- Part 1 ---")
print(f"Percentage of students between scores 67 and 83: {percentage_within_one_std:.2f}%")


--- Part 1 ---
Percentage of students between scores 67 and 83: 67.00%


In [17]:
# --- Part 2: Percentage above 91 ---
# Filter for scores with z-score above +2
students_above_two_std = df[df['z_score'] > 2]

# Calculate the percentage
percentage_above_two_std = (len(students_above_two_std) / len(df)) * 100

print("\n--- Part 2 ---")
print(f"Percentage of students with scores above 91: {percentage_above_two_std:.2f}%")


--- Part 2 ---
Percentage of students with scores above 91: 1.00%


In [10]:
students_above_two_std = df[df['z_score'] > 2]
students_above_two_std

Unnamed: 0,Exam_Score,z_score
31,90,2.15971


In [11]:
percentage_above_two_std = (len(students_above_two_std) / len(df)) * 100
percentage_above_two_std

1.0

In [18]:
# --- Part 3: Sarah's Score ---
# Find the z-score for the 84th percentile
sarahs_z_score = norm.ppf(0.84)  # Use the inverse CDF (percent point function)

# Calculate Sarah's score
sarahs_score = (sarahs_z_score * df['Exam_Score'].std()) + df['Exam_Score'].mean()

print("\n--- Part 3 ---")
print(f"Sarah's approximate score: {sarahs_score:.2f}")


--- Part 3 ---
Sarah's approximate score: 81.49


In [12]:
sarahs_z_score = norm.ppf(0.84)
sarahs_z_score

np.float64(0.994457883209753)

In [13]:
sarahs_score = (sarahs_z_score * df['Exam_Score'].std()) + df['Exam_Score'].mean()
sarahs_score

np.float64(81.48604138680975)

In [12]:
# Sample website traffic data (daily visitors)
traffic_data = [200, 220, 235, 215, 198, 205, 240, 230, 225, 210, 
                195, 218, 232, 228, 208, 170, 185, 202, 216, 224, 
                238, 245, 212, 209, 221, 236, 500, 227, 219, 233]

df = pd.DataFrame({'Visitors': traffic_data})

In [13]:
df.head()

Unnamed: 0,Visitors
0,200
1,220
2,235
3,215
4,198


In [14]:
# Calculate the z-score for each day's traffic
df['z_score'] = (df['Visitors'] - df['Visitors'].mean()) / df['Visitors'].std()
df.head()


Unnamed: 0,Visitors,z_score
0,200,-0.487876
1,220,-0.12013
2,235,0.155679
3,215,-0.212067
4,198,-0.52465


In [17]:
import numpy as np
# Define a threshold for outlier detection (e.g., z-score > 3 or z-score < -3)
outlier_threshold = 3

# Identify outliers
df['Outlier'] = np.where(np.abs(df['z_score']) > outlier_threshold, True, False)

print(df)


    Visitors   z_score  Outlier
0        200 -0.487876    False
1        220 -0.120130    False
2        235  0.155679    False
3        215 -0.212067    False
4        198 -0.524650    False
5        205 -0.395939    False
6        240  0.247615    False
7        230  0.063743    False
8        225 -0.028194    False
9        210 -0.304003    False
10       195 -0.579812    False
11       218 -0.156905    False
12       232  0.100517    False
13       228  0.026968    False
14       208 -0.340778    False
15       170 -1.039494    False
16       185 -0.763685    False
17       202 -0.451101    False
18       216 -0.193679    False
19       224 -0.046581    False
20       238  0.210841    False
21       245  0.339552    False
22       212 -0.267228    False
23       209 -0.322390    False
24       221 -0.101743    False
25       236  0.174066    False
26       500  5.028308     True
27       227  0.008581    False
28       219 -0.138517    False
29       233  0.118904    False


In [18]:
# Print the outlier days
print("\nOutlier Days:")
print(df[df['Outlier'] == True])


Outlier Days:
    Visitors   z_score  Outlier
26       500  5.028308     True


In [20]:
df.describe()

Unnamed: 0,Visitors,z_score
count,30.0,30.0
mean,226.533333,2.7292980000000002e-17
std,54.385427,1.0
min,170.0,-1.039494
25%,208.25,-0.3361807
50%,219.5,-0.1293239
75%,231.5,0.09132348
max,500.0,5.028308
