**Title**: Understanding the Dataset <br>

**Task**: For each of the datasets listed below, identify the data types and dimensions.<br>

Task 1: Employee Dataset<br>
Columns: Employee ID , Name , Age , Department , Salary , Joining Date<br>
Task 2: Product Sales Dataset<br>
Columns: Product ID , Product Name , Price , Quantity Sold , Category , Sales Date<br>
Task 3: Student Grades Dataset<br>
Columns: Student ID , Student Name , Math Score , Science Score , English Score , Year<br>

Instructions:<br>

Identify which columns are numerical (continuous or discrete) and which are categorical
(nominal or ordinal).<br>
Note down the dimensions (number of rows and columns) of the dataset.

In [None]:
import pandas as pd
import numpy as np

def _show_overview(heading, frame):
    """Print a heading, then the frame's first rows, its shape and its dtypes."""
    print(heading)
    print(frame.head())
    print(f"Dimensions: {frame.shape}")
    print(f"Data Types:\n{frame.dtypes}")


# Task 1: Employee Dataset
# NumPy's global generator is reseeded before each dataset so the random
# columns are reproducible. The draws below must stay in this order
# (Age, Department, Salary) because they share that seeded generator.
np.random.seed(0)
employee_ages = np.random.randint(25, 60, 100)
employee_departments = np.random.choice(['Sales', 'Marketing', 'IT', 'HR'], 100)
employee_salaries = np.random.randint(50000, 150000, 100)
employee_data = {
    'Employee ID': np.arange(1, 101),
    'Name': [f'Employee {i}' for i in range(1, 101)],
    'Age': employee_ages,
    'Department': employee_departments,
    'Salary': employee_salaries,
    'Joining Date': pd.date_range(start='2020-01-01', periods=100),
}
employee_df = pd.DataFrame(employee_data)
_show_overview("Employee Dataset:", employee_df)

# Task 2: Product Sales Dataset
# Draw order: Price, Quantity Sold, Category.
np.random.seed(0)
product_prices = np.random.uniform(10, 100, 1000)
product_quantities = np.random.randint(1, 100, 1000)
product_categories = np.random.choice(['Electronics', 'Fashion', 'Home Goods'], 1000)
product_sales_data = {
    'Product ID': np.arange(1, 1001),
    'Product Name': [f'Product {i}' for i in range(1, 1001)],
    'Price': product_prices,
    'Quantity Sold': product_quantities,
    'Category': product_categories,
    'Sales Date': pd.date_range(start='2022-01-01', periods=1000),
}
product_sales_df = pd.DataFrame(product_sales_data)
_show_overview("\nProduct Sales Dataset:", product_sales_df)

# Task 3: Student Grades Dataset
# Draw order: Math Score, Science Score, English Score, Year.
np.random.seed(0)
math_scores = np.random.randint(60, 100, 500)
science_scores = np.random.randint(60, 100, 500)
english_scores = np.random.randint(60, 100, 500)
enrolment_years = np.random.randint(2018, 2023, 500)
student_grades_data = {
    'Student ID': np.arange(1, 501),
    'Student Name': [f'Student {i}' for i in range(1, 501)],
    'Math Score': math_scores,
    'Science Score': science_scores,
    'English Score': english_scores,
    'Year': enrolment_years,
}
student_grades_df = pd.DataFrame(student_grades_data)
_show_overview("\nStudent Grades Dataset:", student_grades_df)



**Title**: Checking for Missing Values<br>

**Task**: Identify and count the number of missing values in each dataset.<br>

Instructions:<br>
Use Python or any data manipulation tool to check for missing values in each column of the datasets. <br>Report the columns which have missing values and their counts.

In [None]:
import pandas as pd
import numpy as np

def _with_gaps(values, step):
    """Return `values` as a list with positions 0, step, 2*step, ... set to NaN."""
    return [np.nan if position % step == 0 else value
            for position, value in enumerate(values)]


# Task 1: Employee Dataset
# The generator is reseeded per dataset; draws happen in the order
# Age, Department, Salary and must stay that way to reproduce the data.
np.random.seed(0)
employee_ages = np.random.randint(25, 60, 100)
employee_departments = np.random.choice(['Sales', 'Marketing', 'IT', 'HR'], 100)
employee_salaries = np.random.randint(50000, 150000, 100)
employee_data = {
    'Employee ID': np.arange(1, 101),
    'Name': [f'Employee {i}' for i in range(1, 101)],
    # Introduce some missing values: every 10th age is blanked out.
    'Age': _with_gaps(employee_ages, 10),
    'Department': employee_departments,
    'Salary': employee_salaries,
    'Joining Date': pd.date_range(start='2020-01-01', periods=100),
}
employee_df = pd.DataFrame(employee_data)
print("Employee Dataset Missing Values:")
employee_missing = employee_df.isnull().sum()
print(employee_missing)

# Task 2: Product Sales Dataset
# Draw order: Price, Quantity Sold, Category.
np.random.seed(0)
product_prices = np.random.uniform(10, 100, 1000)
product_quantities = np.random.randint(1, 100, 1000)
product_categories = np.random.choice(['Electronics', 'Fashion', 'Home Goods'], 1000)
product_sales_data = {
    'Product ID': np.arange(1, 1001),
    'Product Name': [f'Product {i}' for i in range(1, 1001)],
    # Introduce some missing values: every 100th price is blanked out.
    'Price': _with_gaps(product_prices, 100),
    'Quantity Sold': product_quantities,
    'Category': product_categories,
    'Sales Date': pd.date_range(start='2022-01-01', periods=1000),
}
product_sales_df = pd.DataFrame(product_sales_data)
print("\nProduct Sales Dataset Missing Values:")
product_missing = product_sales_df.isnull().sum()
print(product_missing)

# Task 3: Student Grades Dataset
# Draw order: Math Score, Science Score, English Score, Year.
np.random.seed(0)
math_scores = np.random.randint(60, 100, 500)
science_scores = np.random.randint(60, 100, 500)
english_scores = np.random.randint(60, 100, 500)
enrolment_years = np.random.randint(2018, 2023, 500)
student_grades_data = {
    'Student ID': np.arange(1, 501),
    'Student Name': [f'Student {i}' for i in range(1, 501)],
    # Introduce some missing values: every 50th math score is blanked out.
    'Math Score': _with_gaps(math_scores, 50),
    'Science Score': science_scores,
    'English Score': english_scores,
    'Year': enrolment_years,
}
student_grades_df = pd.DataFrame(student_grades_data)
print("\nStudent Grades Dataset Missing Values:")
student_missing = student_grades_df.isnull().sum()
print(student_missing)



**Title**: Handling Outliers<br>

**Task**: Detect and propose handling methods for outliers in the numerical columns of the datasets.<br>

Task 1: Age in Employee Dataset<br>
Task 2: Price in Product Sales Dataset<br>
Task 3: Math Score in Student Grades Dataset<br>

Instructions:<br>

Use box plots to visualize potential outliers.<br>
Suggest methods to handle them, such as removal or transformation.

In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Task 1: Employee Dataset
# Reseed, then draw Age, Department and Salary in that order so the
# generated columns are reproducible.
np.random.seed(0)
employee_ages = np.random.randint(25, 60, 100)
employee_departments = np.random.choice(['Sales', 'Marketing', 'IT', 'HR'], 100)
employee_salaries = np.random.randint(50000, 150000, 100)
# Introduce some outliers: overwrite the first five ages in place.
employee_ages[:5] = [100, 105, 110, 115, 120]
employee_data = {
    'Employee ID': np.arange(1, 101),
    'Name': [f'Employee {i}' for i in range(1, 101)],
    'Age': employee_ages,
    'Department': employee_departments,
    'Salary': employee_salaries,
    'Joining Date': pd.date_range(start='2020-01-01', periods=100),
}
employee_df = pd.DataFrame(employee_data)

# Task 2: Product Sales Dataset
# Draw order: Price, Quantity Sold, Category.
np.random.seed(0)
product_prices = np.random.uniform(10, 100, 1000)
product_quantities = np.random.randint(1, 100, 1000)
product_categories = np.random.choice(['Electronics', 'Fashion', 'Home Goods'], 1000)
# Introduce some outliers: overwrite the first five prices in place.
product_prices[:5] = [1000, 1050, 1100, 1150, 1200]
product_sales_data = {
    'Product ID': np.arange(1, 1001),
    'Product Name': [f'Product {i}' for i in range(1, 1001)],
    'Price': product_prices,
    'Quantity Sold': product_quantities,
    'Category': product_categories,
    'Sales Date': pd.date_range(start='2022-01-01', periods=1000),
}
product_sales_df = pd.DataFrame(product_sales_data)

# Task 3: Student Grades Dataset
# Draw order: Math Score, Science Score, English Score, Year.
np.random.seed(0)
math_scores = np.random.randint(60, 100, 500)
science_scores = np.random.randint(60, 100, 500)
english_scores = np.random.randint(60, 100, 500)
enrolment_years = np.random.randint(2018, 2023, 500)
# Introduce some outliers: overwrite the first five math scores in place.
math_scores[:5] = [200, 205, 210, 215, 220]
student_grades_data = {
    'Student ID': np.arange(1, 501),
    'Student Name': [f'Student {i}' for i in range(1, 501)],
    'Math Score': math_scores,
    'Science Score': science_scores,
    'English Score': english_scores,
    'Year': enrolment_years,
}
student_grades_df = pd.DataFrame(student_grades_data)

# Detect and propose handling methods for outliers
# Maps a display name to the DataFrame and the numeric column to inspect.
datasets = {
    "Employee Dataset": {"df": employee_df, "column": "Age"},
    "Product Sales Dataset": {"df": product_sales_df, "column": "Price"},
    "Student Grades Dataset": {"df": student_grades_df, "column": "Math Score"}
}

# Generic remedies, printed verbatim after each dataset's outlier count.
handling_notes = (
    "Handling methods:",
    "1. Removal: Remove the outliers from the dataset.",
    "2. Transformation: Apply a transformation to the data, such as log or square root, to reduce the effect of outliers.",
    "3. Imputation: Replace the outliers with a suitable value, such as the median or mean.",
)

for dataset_name, spec in datasets.items():
    frame, column = spec["df"], spec["column"]
    values = frame[column]

    # Box plot for a visual check of the column.
    plt.figure(figsize=(8, 6))
    plt.boxplot(values)
    plt.title(f"Boxplot of {column} in {dataset_name}")
    plt.show()

    # IQR fences: anything beyond 1.5 * IQR from the quartiles is flagged.
    first_quartile = values.quantile(0.25)
    third_quartile = values.quantile(0.75)
    spread = third_quartile - first_quartile
    low_fence = first_quartile - 1.5 * spread
    high_fence = third_quartile + 1.5 * spread

    too_low = values < low_fence
    too_high = values > high_fence
    flagged_rows = frame[too_low | too_high]
    print(f"\n{dataset_name}:")
    print(f"Number of outliers in {column}: {len(flagged_rows)}")

    # Handling methods
    for note in handling_notes:
        print(note)



**Title**: Visualizing Data Distributions<br>

**Task**: Create visualizations for data distributions.<br>

Task 1: Histogram for Age in Employee Dataset<br>
Task 2: Distribution plot for Price in Product Sales Dataset<br>
Task 3: Histogram for Math Score in Student Grades Dataset

Instructions:<br>

Use matplotlib or seaborn in Python to create the plots.<br>
Comment on the skewness or normality of the distributions.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Task 1: Employee Dataset
# Row counts are named once per dataset; the seeded draws keep their
# original order (Age, Department, Salary) so the data is reproducible.
n_employees = 100
np.random.seed(0)
employee_data = {
    'Employee ID': np.arange(1, n_employees + 1),
    'Name': [f'Employee {i}' for i in range(1, n_employees + 1)],
    'Age': np.random.randint(25, 60, n_employees),
    'Department': np.random.choice(['Sales', 'Marketing', 'IT', 'HR'], n_employees),
    'Salary': np.random.randint(50000, 150000, n_employees),
    'Joining Date': pd.date_range(start='2020-01-01', periods=n_employees),
}
employee_df = pd.DataFrame(employee_data)

# Task 2: Product Sales Dataset
# Draw order: Price, Quantity Sold, Category.
n_products = 1000
np.random.seed(0)
product_sales_data = {
    'Product ID': np.arange(1, n_products + 1),
    'Product Name': [f'Product {i}' for i in range(1, n_products + 1)],
    'Price': np.random.uniform(10, 100, n_products),
    'Quantity Sold': np.random.randint(1, 100, n_products),
    'Category': np.random.choice(['Electronics', 'Fashion', 'Home Goods'], n_products),
    'Sales Date': pd.date_range(start='2022-01-01', periods=n_products),
}
product_sales_df = pd.DataFrame(product_sales_data)

# Task 3: Student Grades Dataset
# Draw order: Math Score, Science Score, English Score, Year.
n_students = 500
np.random.seed(0)
student_grades_data = {
    'Student ID': np.arange(1, n_students + 1),
    'Student Name': [f'Student {i}' for i in range(1, n_students + 1)],
    'Math Score': np.random.randint(60, 100, n_students),
    'Science Score': np.random.randint(60, 100, n_students),
    'English Score': np.random.randint(60, 100, n_students),
    'Year': np.random.randint(2018, 2023, n_students),
}
student_grades_df = pd.DataFrame(student_grades_data)

# Create visualizations
# Maps a display name to the DataFrame and the numeric column to plot.
datasets = {
    "Employee Dataset": {"df": employee_df, "column": "Age"},
    "Product Sales Dataset": {"df": product_sales_df, "column": "Price"},
    "Student Grades Dataset": {"df": student_grades_df, "column": "Math Score"}
}

for dataset_name, data in datasets.items():
    df = data["df"]
    column = data["column"]

    # Histogram with a kernel density estimate overlaid.
    plt.figure(figsize=(8, 6))
    sns.histplot(df[column], kde=True)
    plt.title(f"Distribution of {column} in {dataset_name}")
    plt.show()

    # Comment on skewness. The 0.5 cut-off is a rule of thumb; values at or
    # beyond it (in either direction) are reported as skewed.
    skewness = df[column].skew()
    if pd.isna(skewness):
        # Without this guard a NaN skewness would fail every comparison
        # below and be mislabelled as negatively skewed.
        print(f"\n{dataset_name}: The skewness of {column} could not be computed.")
    elif abs(skewness) < 0.5:
        # Low skewness only shows symmetry; it does not establish normality
        # (a uniform sample is symmetric too), so the message says "symmetric".
        print(f"\n{dataset_name}: The distribution of {column} is approximately symmetric.")
    elif skewness > 0:
        # Reached only when |skewness| >= 0.5, so this also covers exactly 0.5,
        # which the previous `skewness > 0.5` test sent to the negative branch.
        print(f"\n{dataset_name}: The distribution of {column} is positively skewed.")
    else:
        print(f"\n{dataset_name}: The distribution of {column} is negatively skewed.")



**Title**: Finding Relationships Between Features<br>

**Task**: Identify relationships between pairs of features in the datasets.<br>

Task 1: Salary vs Age in Employee Dataset<br>
Task 2: Price vs Quantity Sold in Product Sales Dataset<br>
Task 3: Math Score vs Science Score in Student Grades Dataset

Instructions: <br>

Use scatter plots or correlation coefficients to analyze the relationships.<br>
Describe any insights or patterns observed.

In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Task 1: Employee Dataset
# Columns are added one at a time; dict insertion order fixes the column
# order, and the seeded draws stay in the order Age, Department, Salary.
np.random.seed(0)
employee_data = {}
employee_data['Employee ID'] = np.arange(1, 101)
employee_data['Name'] = [f'Employee {i}' for i in range(1, 101)]
employee_data['Age'] = np.random.randint(25, 60, 100)
employee_data['Department'] = np.random.choice(['Sales', 'Marketing', 'IT', 'HR'], 100)
employee_data['Salary'] = np.random.randint(50000, 150000, 100)
employee_data['Joining Date'] = pd.date_range(start='2020-01-01', periods=100)
employee_df = pd.DataFrame(employee_data)

# Task 2: Product Sales Dataset
# Draw order: Price, Quantity Sold, Category.
np.random.seed(0)
product_sales_data = {}
product_sales_data['Product ID'] = np.arange(1, 1001)
product_sales_data['Product Name'] = [f'Product {i}' for i in range(1, 1001)]
product_sales_data['Price'] = np.random.uniform(10, 100, 1000)
product_sales_data['Quantity Sold'] = np.random.randint(1, 100, 1000)
product_sales_data['Category'] = np.random.choice(['Electronics', 'Fashion', 'Home Goods'], 1000)
product_sales_data['Sales Date'] = pd.date_range(start='2022-01-01', periods=1000)
product_sales_df = pd.DataFrame(product_sales_data)

# Task 3: Student Grades Dataset
# Draw order: Math Score, Science Score, English Score, Year.
np.random.seed(0)
student_grades_data = {}
student_grades_data['Student ID'] = np.arange(1, 501)
student_grades_data['Student Name'] = [f'Student {i}' for i in range(1, 501)]
student_grades_data['Math Score'] = np.random.randint(60, 100, 500)
student_grades_data['Science Score'] = np.random.randint(60, 100, 500)
student_grades_data['English Score'] = np.random.randint(60, 100, 500)
student_grades_data['Year'] = np.random.randint(2018, 2023, 500)
student_grades_df = pd.DataFrame(student_grades_data)

# Analyze relationships
# Each entry names a DataFrame and the pair of columns to compare.
datasets = [
    {"df": employee_df, "x": "Age", "y": "Salary"},
    {"df": product_sales_df, "x": "Price", "y": "Quantity Sold"},
    {"df": student_grades_df, "x": "Math Score", "y": "Science Score"}
]

for pairing in datasets:
    frame = pairing["df"]
    x, y = pairing["x"], pairing["y"]

    # Scatter plot of the two columns.
    plt.figure(figsize=(8, 6))
    sns.scatterplot(data=frame, x=x, y=y)
    plt.title(f"Relationship between {x} and {y}")
    plt.show()

    # Correlation coefficient between the two columns (Series.corr default).
    correlation = frame[x].corr(frame[y])
    print(f"The correlation between {x} and {y} is {correlation:.2f}")

    # Grade the strength by the magnitude of the coefficient; the sign is
    # ignored. Anything not above 0.3 (including NaN) is reported as weak.
    magnitude = abs(correlation)
    if magnitude > 0.7:
        print(f"The relationship between {x} and {y} is strong.")
    elif magnitude > 0.3:
        print(f"The relationship between {x} and {y} is moderate.")
    else:
        print(f"The relationship between {x} and {y} is weak.")
    print()
