In [None]:
# prompt: Write a code in python for analysing “mailbox”
# dataset
# a. . How many emails did I send during a given
# timeframe?
# b. At what times of the day do I send and receive
# emails with Gmail?
# c. What is the average number of emails per day?
# d. What is the average number of emails per hour?

import pandas as pd
import matplotlib.pyplot as plt

# Load the mailbox data (replace 'mailbox_data.csv' with your actual file)
try:
    mailbox_df = pd.read_csv('mailbox_data.csv')  # Assumes CSV format; adjust if different
except FileNotFoundError:
    print("Error: 'mailbox_data.csv' not found. Please provide the correct path.")
    exit()


# Assuming your mailbox data has columns like 'Date', 'Time', 'Sender', 'Recipient', 'Subject'

# Convert 'Date' and 'Time' to datetime objects if they aren't already
mailbox_df['Date'] = pd.to_datetime(mailbox_df['Date'])
mailbox_df['Time'] = pd.to_datetime(mailbox_df['Time']).dt.time

# a. Emails sent during a given timeframe
def emails_sent_in_timeframe(start_date, end_date):
    sent_emails = mailbox_df[(mailbox_df['Date'] >= start_date) & (mailbox_df['Date'] <= end_date) & (mailbox_df['Sender'] == 'my_email@gmail.com')]
    return len(sent_emails)

# Example usage
start_date = '2024-01-01'
end_date = '2024-01-31'
num_emails_sent = emails_sent_in_timeframe(start_date, end_date)
print(f"Number of emails sent between {start_date} and {end_date}: {num_emails_sent}")


# b. Email sending/receiving times with Gmail
def gmail_email_times(sender_or_recipient):
  emails = mailbox_df[(mailbox_df['Sender'].str.contains('@gmail.com') & (mailbox_df['Sender'] == sender_or_recipient)) | (mailbox_df['Recipient'].str.contains('@gmail.com'))]
  return emails['Time'].value_counts()

gmail_times = gmail_email_times('my_email@gmail.com')
print("\nEmail times with Gmail:\n", gmail_times)


# c. Average number of emails per day
mailbox_df['DateOnly'] = mailbox_df['Date'].dt.date
emails_per_day = mailbox_df.groupby('DateOnly').size()
avg_emails_per_day = emails_per_day.mean()
print(f"\nAverage emails per day: {avg_emails_per_day}")

# d. Average number of emails per hour
mailbox_df['Hour'] = mailbox_df['Time'].apply(lambda t: t.hour)
emails_per_hour = mailbox_df.groupby('Hour').size()
avg_emails_per_hour = emails_per_hour.mean()
print(f"\nAverage emails per hour: {avg_emails_per_hour}")

# (Optional) Visualization:
plt.figure(figsize=(10, 6))
plt.plot(emails_per_hour.index, emails_per_hour.values)
plt.xlabel("Hour of the Day")
plt.ylabel("Number of Emails")
plt.title("Email Frequency by Hour")
plt.grid(True)
plt.show()

In [1]:
# prompt: Write a code in python for “titanic” dataset for the
# following:-
# A, Check for missing values and handle them.
# B. Check for duplicates and handle them.
# C. Check for outliers and handle them.
# D. Check for data types and convert them if
# necessary.

import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# Load the Titanic dataset
# Replace 'titanic.csv' with the actual path to your dataset file
try:
    titanic_df = pd.read_csv('titanic.csv')
except FileNotFoundError:
    print("Error: 'titanic.csv' not found. Please ensure the file is in the current directory or provide the correct path.")
    exit()


# A. Handle missing values
# Identify columns with missing values
missing_values = titanic_df.isnull().sum()
print("Missing Values:\n", missing_values)


# Impute missing 'Age' with the median
imputer_age = SimpleImputer(strategy='median')
titanic_df['Age'] = imputer_age.fit_transform(titanic_df[['Age']])

# Impute missing 'Embarked' with the most frequent value
imputer_embarked = SimpleImputer(strategy='most_frequent')
titanic_df['Embarked'] = imputer_embarked.fit_transform(titanic_df[['Embarked']])


# Fill missing 'Cabin' with 'Unknown' (or another appropriate strategy)
titanic_df['Cabin'].fillna('Unknown', inplace=True)


# Drop rows where 'Fare' is missing (if any exist)
titanic_df.dropna(subset=['Fare'], inplace=True)


# B. Handle duplicates
# Check for duplicate rows
print("\nDuplicate Rows:", titanic_df.duplicated().sum())

# Remove duplicate rows (if any exist)
titanic_df.drop_duplicates(inplace=True)

# C. Handle outliers
# Handle outliers in 'Fare' using IQR method
Q1_fare = titanic_df['Fare'].quantile(0.25)
Q3_fare = titanic_df['Fare'].quantile(0.75)
IQR_fare = Q3_fare - Q1_fare

lower_bound_fare = Q1_fare - 1.5 * IQR_fare
upper_bound_fare = Q3_fare + 1.5 * IQR_fare

titanic_df = titanic_df[(titanic_df['Fare'] >= lower_bound_fare) & (titanic_df['Fare'] <= upper_bound_fare)]

# Consider other columns like Age for outlier treatment as needed

# D. Handle data types
# Convert 'Age' and 'Fare' to numeric if necessary
titanic_df['Age'] = pd.to_numeric(titanic_df['Age'])
titanic_df['Fare'] = pd.to_numeric(titanic_df['Fare'])

# Example of handling categorical features: Convert 'Sex' and 'Embarked' to numerical using one-hot encoding
titanic_df = pd.get_dummies(titanic_df, columns=['Sex', 'Embarked'], drop_first=True)


# Print information about the cleaned data
print("\nData types:\n", titanic_df.dtypes)

print("\nDescriptive Statistics:\n", titanic_df.describe())

print("\nShape of the cleaned dataframe: ", titanic_df.shape)

Error: 'titanic.csv' not found. Please ensure the file is in the current directory or provide the correct path.


NameError: name 'titanic_df' is not defined

In [None]:
# prompt: Write a code in python for “sales.csv” following
# a. Add a column “Totalprice” paid for each
# purchase.
# b. List the transaction that exceeds 3,000,000 for
# “Totalprice”
# c. Find the sales in the year “2015” and “1973”
# d. List the clients established before the year
# “2001”

import pandas as pd

# Load the sales data
try:
    sales_df = pd.read_csv('sales.csv')
except FileNotFoundError:
    print("Error: 'sales.csv' not found. Please provide the correct path.")
    exit()

# Assuming your sales data has columns like 'Quantity', 'UnitPrice'

# a. Add a 'TotalPrice' column
sales_df['TotalPrice'] = sales_df['Quantity'] * sales_df['UnitPrice']

# b. Transactions exceeding 3,000,000
expensive_transactions = sales_df[sales_df['TotalPrice'] > 3000000]
print("Transactions exceeding 3,000,000:\n", expensive_transactions)

# c. Sales in 2015 and 1973
# Assuming you have a 'Date' column in your dataset (replace 'InvoiceDate' if different)
# Convert 'InvoiceDate' to datetime objects if necessary
if not pd.api.types.is_datetime64_any_dtype(sales_df['InvoiceDate']):
    sales_df['InvoiceDate'] = pd.to_datetime(sales_df['InvoiceDate'])

sales_2015 = sales_df[sales_df['InvoiceDate'].dt.year == 2015]
sales_1973 = sales_df[sales_df['InvoiceDate'].dt.year == 1973]

print("\nSales in 2015:\n", sales_2015)
print("\nSales in 1973:\n", sales_1973)

# d. Clients established before 2001
# Assuming you have a 'ClientEstablishedDate' column
# Convert to datetime if it's not already
if not pd.api.types.is_datetime64_any_dtype(sales_df['ClientEstablishedDate']):
    sales_df['ClientEstablishedDate'] = pd.to_datetime(sales_df['ClientEstablishedDate'])

old_clients = sales_df[sales_df['ClientEstablishedDate'].dt.year < 2001]
print("\nClients established before 2001:\n", old_clients)

In [None]:
# prompt: Use “automobile.csv” to perform following tasks in
# python
# a. group this dataset on the basis of the body-style
# column
# b. print the values contained in that group that
# have the body-style value of sedan
# c. Print the composite group using feature “bodystyle” and “drive-wheels”
# d. Group the data frame df by body-style and drivewheels and extract length, height and price from
# each group

import pandas as pd

# Load the automobile dataset
try:
    df = pd.read_csv('automobile.csv')
except FileNotFoundError:
    print("Error: 'automobile.csv' not found. Please provide the correct path.")
    exit()


# a. Group by body-style
body_style_groups = df.groupby('body-style')

# b. Print values for 'sedan' body-style
print("Values for Sedan body-style:")
print(body_style_groups.get_group('sedan'))

# c. Composite group by body-style and drive-wheels
composite_group = df.groupby(['body-style', 'drive-wheels'])
print("\nComposite group by body-style and drive-wheels:")
print(composite_group.size().unstack(fill_value=0))

# d. Group and extract length, height, and price
grouped = df.groupby(['body-style', 'drive-wheels'])
extracted_features = grouped['length', 'height', 'price'].agg(['mean', 'max', 'min'])
print("\nExtracted length, height, and price:")
extracted_features

In [None]:
# prompt: Write a code in python for “titanic” dataset for the
# following:-
# a. find out the percentages of women and men who
# survived the disaster
# b. visualize this information using the survival
# numbers of males and females
# c. visualize the number of survivors and deaths
# from different Pclasses
# d. multivariate analysis on the Titanic dataset using
# the Survived, Pclass, Fear, and Age variables:

# ... (Your existing code)

# a. Percentage of women and men who survived
women_survived = titanic_df[(titanic_df['Sex_male'] == 0) & (titanic_df['Survived'] == 1)]
men_survived = titanic_df[(titanic_df['Sex_male'] == 1) & (titanic_df['Survived'] == 1)]

total_women = titanic_df[titanic_df['Sex_male'] == 0]
total_men = titanic_df[titanic_df['Sex_male'] == 1]

women_survival_percentage = (len(women_survived) / len(total_women)) * 100
men_survival_percentage = (len(men_survived) / len(total_men)) * 100

print(f"Women survival percentage: {women_survival_percentage:.2f}%")
print(f"Men survival percentage: {men_survival_percentage:.2f}%")


# b. Visualize survival numbers of males and females
plt.figure(figsize=(8, 6))
plt.bar(['Women', 'Men'], [len(women_survived), len(men_survived)])
plt.title('Survival Numbers by Gender')
plt.xlabel('Gender')
plt.ylabel('Number of Survivors')
plt.show()


# c. Visualize survivors and deaths from different Pclasses
survived_by_pclass = titanic_df.groupby('Pclass')['Survived'].sum()
deaths_by_pclass = titanic_df.groupby('Pclass')['Survived'].count() - survived_by_pclass

plt.figure(figsize=(8, 6))
plt.bar(survived_by_pclass.index, survived_by_pclass.values, label='Survived')
plt.bar(deaths_by_pclass.index, deaths_by_pclass.values, bottom=survived_by_pclass.values, label='Deaths')
plt.xlabel('Pclass')
plt.ylabel('Number of Passengers')
plt.title('Survivors and Deaths by Pclass')
plt.legend()
plt.show()

# d. Multivariate analysis
# (Example: Correlation matrix)
multivariate_df = titanic_df[['Survived', 'Pclass', 'Fare', 'Age']]  # Select variables for analysis
correlation_matrix = multivariate_df.corr()

print("Correlation matrix:")
print(correlation_matrix)

# Visualization using a heatmap
import seaborn as sns

plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title('Correlation Matrix of Titanic Dataset')
plt.show()