In [1]:
import pandas as pd
import numpy as np
from scipy import stats

# Step 1: Generate a random dataset
np.random.seed(42)  # for reproducibility
data = {
    'Feature1': np.random.normal(50, 10, 100),     # Mean=50, SD=10
    'Feature2': np.random.normal(30, 5, 100),      # Mean=30, SD=5
    'Feature3': np.random.normal(100, 20, 100),    # Mean=100, SD=20
}

df = pd.DataFrame(data)

# Step 2: Save dataset to CSV
df.to_csv("random_dataset.csv", index=False)
print("✅ CSV file 'random_dataset.csv' created.")

# Step 3: Detect outliers using Z-score
z_scores = np.abs(stats.zscore(df))  # calculate z-scores
threshold = 3  # typically Z > 3 is considered outlier
outliers = (z_scores > threshold)

# Step 4: Display outliers
outlier_rows = df[(outliers).any(axis=1)]
print("\n📊 Original Dataset (first 5 rows):")
print(df.head())

print("\n🚨 Outliers Detected:")
print(outlier_rows if not outlier_rows.empty else "No outliers found.")


✅ CSV file 'random_dataset.csv' created.

📊 Original Dataset (first 5 rows):
    Feature1   Feature2    Feature3
0  54.967142  22.923146  107.155747
1  48.617357  27.896773  111.215691
2  56.476885  28.286427  121.661025
3  65.230299  25.988614  121.076041
4  47.658466  29.193571   72.446613

🚨 Outliers Detected:
    Feature1   Feature2    Feature3
9   55.42560  29.627770  177.054630
62  38.93665  35.792978   35.174653


In [4]:
import os
print("📁 Current Working Directory:", os.getcwd())


📁 Current Working Directory: C:\Users\DELL


In [6]:
import pandas as pd
import numpy as np
import os

# Step 1: Generate dataset with null values
np.random.seed(42)
data = {
    'Age': np.random.randint(20, 40, size=10).astype(float),
    'Salary': np.random.randint(30000, 80000, size=10).astype(float),
    'Department': ['HR', 'Finance', 'IT', 'HR', np.nan, 'IT', 'Finance', 'HR', np.nan, 'IT']
}

# Introduce some NaN values
data['Age'][2] = np.nan
data['Salary'][5] = np.nan

df = pd.DataFrame(data)

# Step 2: Save to CSV
df.to_csv("dataset_with_nulls.csv", index=False)
print("✅ CSV file with nulls saved at:", os.getcwd())

# Step 3: Read the CSV back
df = pd.read_csv("dataset_with_nulls.csv")
print("\n📋 Original Data with Nulls:")
print(df)

# Step 4: Fill nulls
df_mean_filled = df.copy()
df_mean_filled['Age'] = df_mean_filled['Age'].fillna(df_mean_filled['Age'].mean())
df_mean_filled['Salary'] = df_mean_filled['Salary'].fillna(df_mean_filled['Salary'].mean())
df_mean_filled['Department'] = df_mean_filled['Department'].fillna(df_mean_filled['Department'].mode()[0])

print("\n🔄 Data After Filling Nulls (Mean/Mode):")
print(df_mean_filled)

# Optional: Save the cleaned data
df_mean_filled.to_csv("dataset_filled.csv", index=False)
print("\n✅ Cleaned dataset saved as 'dataset_filled.csv'")


✅ CSV file with nulls saved at: C:\Users\DELL

📋 Original Data with Nulls:
    Age   Salary Department
0  26.0  46023.0         HR
1  39.0  71090.0    Finance
2   NaN  31685.0         IT
3  30.0  30769.0         HR
4  27.0  32433.0        NaN
5  26.0      NaN         IT
6  38.0  67819.0    Finance
7  30.0  69188.0         HR
8  30.0  47568.0        NaN
9  23.0  49769.0         IT

🔄 Data After Filling Nulls (Mean/Mode):
         Age        Salary Department
0  26.000000  46023.000000         HR
1  39.000000  71090.000000    Finance
2  29.888889  31685.000000         IT
3  30.000000  30769.000000         HR
4  27.000000  32433.000000         HR
5  26.000000  49593.777778         IT
6  38.000000  67819.000000    Finance
7  30.000000  69188.000000         HR
8  30.000000  47568.000000         HR
9  23.000000  49769.000000         IT

✅ Cleaned dataset saved as 'dataset_filled.csv'


In [16]:
import numpy as np
from scipy import stats

# Sample data (1D array)
data = [10, 12, 11, 13, 14, 12, 100]  

# Convert to NumPy array
arr = np.array(data)

# Step 1: Calculate Z-scores
z_scores = np.abs(stats.zscore(arr))

# Step 2: Define threshold
threshold = 3

# Step 3: Detect outliers
outliers = arr[z_scores > threshold]

# Step 4: Show results
print("Original Data:", arr)
print("Z-scores:     ", z_scores)
print("Outliers:     ", outliers)


Original Data: [ 10  12  11  13  14  12 100]
Z-scores:      [0.47284084 0.40794112 0.44039098 0.37549125 0.34304139 0.40794112
 2.44764669]
Outliers:      []


In [14]:
import numpy as np
from scipy import stats

# Simple array with a clear outlier
data = [10, 11, 10, 12, 11, 1000]  # 1000 is a big outlier

# Convert to NumPy array
arr = np.array(data)

# Step 1: Calculate Z-scores
z_scores = np.abs(stats.zscore(arr))

# Step 2: Define threshold for outlier
threshold = 2  # Set to 2 to catch 1000 as an outlier in small data

# Step 3: Find indices where z-score > threshold
outlier_indices = np.where(z_scores > threshold)

# Step 4: Show results
print("Original Data:", arr)
print("Z-scores:     ", z_scores)
print("Outliers:     ", arr[outlier_indices])


Original Data: [  10   11   10   12   11 1000]
Z-scores:      [0.44938289 0.44667031 0.44938289 0.44395774 0.44667031 2.23606414]
Outliers:      [1000]
