In [4]:
import os
import pandas as pd

# Step 1: File Path
file_path = r'bank.csv'

# Step 2: Check if the file exists
if not os.path.exists(file_path):
    print(f"Error: File not found at {file_path}. Please check the file path.")
else:
    # Step 3: Load the dataset
    try:
        df = pd.read_csv(file_path)

        # Task 2(a): Display DataFrame info and find object columns
        print("\nDataFrame Information:")
        df.info()

        object_columns = df.select_dtypes(include=['object']).columns
        print("\nColumns with dtype=object:")
        print(object_columns)

        # Task 2(b): Get unique values for object columns
        print("\nUnique values in object columns:")
        for col in object_columns:
            print(f"{col}: {df[col].unique()}")

        # Task 2(c): Check for null values in each column
        print("\nNull values in each column:")
        print(df.isnull().sum())

        # Task 3: Drop object columns and save the new DataFrame
        numeric_df = df.drop(columns=object_columns)
        numeric_file_path = r'C:\Worksheet_2\banknumericdata.csv'
        numeric_df.to_csv(numeric_file_path, index=False)
        print(f"\nNumeric data saved to {numeric_file_path}")

        # Task 4: Read the numeric CSV and find summary statistics
        numeric_df_reloaded = pd.read_csv(numeric_file_path)
        print("\nSummary statistics for numeric data:")
        print(numeric_df_reloaded.describe())

    except Exception as e:
        print(f"An error occurred: {e}")



DataFrame Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        45211 non-null  int64 
 1   job        45211 non-null  object
 2   marital    45211 non-null  object
 3   education  45211 non-null  object
 4   default    45211 non-null  object
 5   balance    45211 non-null  int64 
 6   housing    45211 non-null  object
 7   loan       45211 non-null  object
 8   contact    45211 non-null  object
 9   day        45211 non-null  int64 
 10  month      45211 non-null  object
 11  duration   45211 non-null  int64 
 12  campaign   45211 non-null  int64 
 13  pdays      45211 non-null  int64 
 14  previous   45211 non-null  int64 
 15  poutcome   45211 non-null  object
 16  y          45211 non-null  object
dtypes: int64(7), object(10)
memory usage: 5.9+ MB

Columns with dtype=object:
Index(['job', 'marital', 'education', 'default'

In [3]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load Titanic dataset
df = pd.read_csv("Titanic-Dataset.csv")

# Problem 1: Subset and Summary Statistics
# Subset the DataFrame for specific columns and rows where 'Pclass' == 1
subset_df = df[['Name', 'Pclass', 'Sex', 'Age', 'Fare', 'Survived']][df['Pclass'] == 1]

# Calculate mean, median, max, and min of the 'Fare' column
fare_mean = subset_df['Fare'].mean()
fare_median = subset_df['Fare'].median()
fare_max = subset_df['Fare'].max()
fare_min = subset_df['Fare'].min()

print(f"Fare - Mean: {fare_mean}, Median: {fare_median}, Max: {fare_max}, Min: {fare_min}")

# Problem 2: Check for null values in the 'Age' column and drop them
null_age_count = subset_df['Age'].isnull().sum()
print(f"Number of null values in 'Age': {null_age_count}")

# Drop rows with null 'Age' values
subset_df = subset_df.dropna(subset=['Age'])

# Problem 3: One-hot encoding of 'Embarked' column
# One-hot encode 'Embarked' column
embarked_dummies = pd.get_dummies(df['Embarked'], prefix='Embarked')
df = pd.concat([df, embarked_dummies], axis=1)

# Drop the original 'Embarked' column
df = df.drop('Embarked', axis=1)

# Print the first few rows to verify changes
print(df.head())

# Problem 4: Compare survival rates by gender
# Calculate mean survival rates by gender
survival_by_gender = df.groupby('Sex')['Survived'].mean()
print("Mean Survival Rates by Gender:")
print(survival_by_gender)

# Visualization of survival rates by gender
sns.barplot(data=df, x='Sex', y='Survived')
plt.title("Survival Rates by Gender")
plt.show()

# Problem 5: Survival rates by gender and port of embarkation
# Visualization
sns.catplot(data=df, x='Sex', y='Survived', hue='Embarked', kind='bar')
plt.title("Survival Rates by Gender and Embarkation Port")
plt.show()

# Problem 6 (Optional): Survival rates by age group and class
# Create age quantiles
df['AgeGroup'] = pd.qcut(df['Age'].dropna(), q=5, labels=["Q1", "Q2", "Q3", "Q4", "Q5"])

# Group by 'Pclass' and 'AgeGroup' and calculate mean survival rates
survival_by_age_class = df.groupby(['Pclass', 'AgeGroup'])['Survived'].mean().unstack()
print("Survival Rates by Age Group and Class:")
print(survival_by_age_class)

# Visualization
survival_by_age_class.plot(kind='bar', figsize=(10, 6))
plt.title("Survival Rates by Age Group and Class")
plt.xlabel("Passenger Class")
plt.ylabel("Mean Survival Rate")
plt.legend(title="Age Group")
plt.xticks(rotation=0)
plt.show()

ModuleNotFoundError: No module named 'seaborn'

In [7]:
import pandas as pd

# Sample data creation for demonstration (replace with actual dataset loading if available)
data = {
    'Name': ['John Doe', 'Jane Smith', 'Alice Brown', 'Bob White'],
    'Pclass': [1, 2, 1, 3],
    'Sex': ['male', 'female', 'female', 'male'],
    'Age': [22, 34, 28, 45],
    'Fare': [71.2833, 10.5, 80.0, 7.25],
    'Survived': [1, 0, 1, 0]
}

# Creating the full DataFrame
df = pd.DataFrame(data)

# Subsetting the DataFrame for specific columns
subset_df = df[['Name', 'Pclass', 'Sex', 'Age', 'Fare', 'Survived']]

# Retaining only rows where 'Pclass' == 1
first_class_df = subset_df[subset_df['Pclass'] == 1]

# Calculating statistics for 'Fare'
fare_mean = first_class_df['Fare'].mean()
fare_median = first_class_df['Fare'].median()
fare_max = first_class_df['Fare'].max()
fare_min = first_class_df['Fare'].min()

first_class_df, fare_mean, fare_median, fare_max, fare_min

(          Name  Pclass     Sex  Age     Fare  Survived
 0     John Doe       1    male   22  71.2833         1
 2  Alice Brown       1  female   28  80.0000         1,
 np.float64(75.64165),
 np.float64(75.64165),
 np.float64(80.0),
 np.float64(71.2833))

In [6]:
import pandas as pd

# Step 1: Load the dataset
file_path = r'medical_students_dataset.csv'

# Step 2: Check if the file exists
try:
    df = pd.read_csv(file_path)
    print("Dataset loaded successfully!\n")

    # Task 2: Check DataFrame info and identify columns with missing values
    print("\nDataFrame Information:")
    df.info()

    print("\nNumber of missing values in each column:")
    missing_values = df.isnull().sum()
    print(missing_values)

    # Task 3: Handle missing values
    for column in missing_values[missing_values > 0].index:
        print(f"\nHandling missing values for column: {column}")
        
        # Strategy selection example (to be customized based on data analysis)
        if df[column].dtype == 'object':
            # Fill missing values with mode for categorical data
            df[column].fillna(df[column].mode()[0], inplace=True)
            print(f"Filled missing values in {column} with mode.")
        else:
            # Fill missing values with mean for numeric data
            df[column].fillna(df[column].mean(), inplace=True)
            print(f"Filled missing values in {column} with mean.")

    print("\nMissing values after handling:")
    print(df.isnull().sum())

    # Task 4: Check and handle duplicate values
    print(f"\nNumber of duplicate rows: {df.duplicated().sum()}")
    if df.duplicated().sum() > 0:
        df = df.drop_duplicates()
        print("Duplicate rows removed.")

    print(f"\nNumber of rows after removing duplicates: {len(df)}")

except FileNotFoundError:
    print(f"Error: File not found at {file_path}. Please check the file path.")
except Exception as e:
    print(f"An error occurred: {e}")

Dataset loaded successfully!


DataFrame Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 13 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   Student ID      180000 non-null  float64
 1   Age             180000 non-null  float64
 2   Gender          180000 non-null  object 
 3   Height          180000 non-null  float64
 4   Weight          180000 non-null  float64
 5   Blood Type      180000 non-null  object 
 6   BMI             180000 non-null  float64
 7   Temperature     180000 non-null  float64
 8   Heart Rate      180000 non-null  float64
 9   Blood Pressure  180000 non-null  float64
 10  Cholesterol     180000 non-null  float64
 11  Diabetes        180000 non-null  object 
 12  Smoking         180000 non-null  object 
dtypes: float64(9), object(4)
memory usage: 19.8+ MB

Number of missing values in each column:
Student ID        20000
Age               20000
Ge

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[column].fillna(df[column].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[column].fillna(df[column].mode()[0], inplace=True)


Filled missing values in BMI with mean.

Handling missing values for column: Temperature
Filled missing values in Temperature with mean.

Handling missing values for column: Heart Rate
Filled missing values in Heart Rate with mean.

Handling missing values for column: Blood Pressure
Filled missing values in Blood Pressure with mean.

Handling missing values for column: Cholesterol
Filled missing values in Cholesterol with mean.

Handling missing values for column: Diabetes
Filled missing values in Diabetes with mode.

Handling missing values for column: Smoking
Filled missing values in Smoking with mode.

Missing values after handling:
Student ID        0
Age               0
Gender            0
Height            0
Weight            0
Blood Type        0
BMI               0
Temperature       0
Heart Rate        0
Blood Pressure    0
Cholesterol       0
Diabetes          0
Smoking           0
dtype: int64

Number of duplicate rows: 12572
Duplicate rows removed.

Number of rows after remo

In [1]:
import os
print("Current working directory: ", os.getcwd())
print(os.listdir())

Current working directory:  C:\Users\HP\Desktop\worksheet3
['.ipynb_checkpoints', 'bank.csv', 'medical_students_dataset.csv', 'Titanic-Dataset.csv', 'Untitled1.ipynb', 'workshop3.ipynb']
