<a href="https://colab.research.google.com/github/sanyasirao-surada/ML-LAB/blob/main/ML_LAB_EXP_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Step 1: Import Required Libraries**

In [None]:
# Step 1: Import Required Libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

**Step 2: Load the Titanic Dataset**

In [None]:
# Fetch the example 'titanic' table through seaborn's load_dataset helper.
df = sns.load_dataset(name='titanic')

In [None]:
# Preview the first and last rows. Each frame is printed by its own call so
# the tail's header row is not glued onto the end of the head's last line.
print("Dataset Overview:")
print(df.head())
print(df.tail())

**Step 3: Save Dataset to Excel and CSV**

In [None]:
# Write the frame to an Excel workbook in the working directory.
# index=False keeps the row labels out of the sheet.
df.to_excel(excel_writer='titanic_dataset.xlsx', index=False)

In [None]:
# Write the same frame as a CSV file, again without the row labels.
df.to_csv(path_or_buf='titanic_dataset.csv', index=False)

**Step 4: Download Files Locally in Colab**

In [None]:
# Download the Excel export to your local system.
# files.download() will prompt you to download the file in Colab.
# The `files` helper imported here is reused by the next download cell.
from google.colab import files
files.download('titanic_dataset.xlsx')

In [None]:
# Download the CSV export (uses the `files` helper imported in the previous cell).
files.download('titanic_dataset.csv')

**Step 5: Upload Files Back into Colab**

In [None]:
# This will open a file upload box where you can choose your .csv or .xlsx file from your computer.
# The value returned by files.upload() is kept in `uploaded`.
from google.colab import files
uploaded = files.upload()

**Step 6: Read Uploaded File into a DataFrame**

In [None]:
# Read the uploaded CSV from /content.
# Change the path below if your uploaded file has a different name.
df = pd.read_csv(filepath_or_buffer='/content/titanic_dataset.csv')

In [None]:
# Read the uploaded Excel workbook from /content.
# Change the path below if your uploaded file has a different name.
df = pd.read_excel(io='/content/titanic_dataset.xlsx')

**Step 7: Mount & Access Google Drive**

In [None]:
# Mount Google Drive at /content/drive; the Drive paths used in the
# following cells are all rooted at this mount point.
from google.colab import drive
drive.mount('/content/drive')

**7a: Load Specific Sheets from an Excel Workbook**

In [None]:
# Path of the Excel workbook under the Drive mount point (/content/drive).
excel_path= "/content/drive/MyDrive/1.ML LAB CODES AND DATASETS/titanic_dataset 18-07-25.xlsx"

In [None]:
# Read a single named sheet from the workbook.
# Replace 'EXP-1' with the name of the sheet you want to load.
df = pd.read_excel(io=excel_path, sheet_name='EXP-1')

In [None]:
# Open the workbook once, list its sheets, then parse the one we want.
# The `with` block closes the workbook's file handle as soon as we are done,
# instead of leaving it open for the rest of the session.
with pd.ExcelFile(excel_path) as xls:
    print(xls.sheet_names)  # Lists all available sheets

    # Then load the one you want
    df = xls.parse('EXP-1')

In [None]:
# Load the CSV copy of the dataset stored on the mounted Drive.
csv_path = '/content/drive/MyDrive/1.ML LAB CODES AND DATASETS/titanic_dataset from seaborn.csv'
df = pd.read_csv(filepath_or_buffer=csv_path)

**7b: Save CSV/Excel Back to Google Drive**

In [None]:
# Write the current frame to Drive as CSV (row labels omitted).
save_path = '/content/drive/MyDrive/1.ML LAB CODES AND DATASETS/output.csv'
df.to_csv(path_or_buf=save_path, index=False)

In [None]:
# Write the current frame to Drive as an Excel workbook (row labels omitted).
save_path = '/content/drive/MyDrive/1.ML LAB CODES AND DATASETS/output.xlsx'
df.to_excel(excel_writer=save_path, index=False)

**Step 8: Load from a GitHub Raw URL**

In [None]:
import pandas as pd

# Raw-file URL of the CSV hosted on GitHub; pandas reads it directly over HTTP.
url = 'https://raw.githubusercontent.com/sanyasirao-surada/research_portal/refs/heads/main/titanic_dataset%20from%20seaborn.csv'
df = pd.read_csv(filepath_or_buffer=url)

**Step 9: Basic Data Exploration**

In [None]:
# Dimensions of the current DataFrame as (rows, columns).
df.shape

In [None]:
# Preview the first and last rows. Each frame is printed by its own call so
# the tail's header row is not glued onto the end of the head's last line.
print("Dataset Overview:")
print(df.head())
print(df.tail())

In [None]:
# Descriptive statistics for every column (numeric and non-numeric alike).
summary_stats = df.describe(include='all')
print("\nSummary Statistics:\n", summary_stats)

In [None]:
# Show the dtype pandas assigned to each column.
column_types = df.dtypes
print("\nData Types:\n", column_types)

In [None]:
# Positional slice: rows 6..15 and the columns at positions 2 and 3.
sample_slice = df.iloc[6:16, 2:4]
print("\nSample Slice:\n", sample_slice)

**9.1. Step-by-Step Code to Clean Titanic Dataset**

In [None]:
# Print a concise summary of the frame before any cleaning is applied.
df.info()

In [None]:
# Count the missing entries in each column before cleaning.
print("\nMissing Values (Before Cleaning):")
missing_before = df.isna().sum()
print(missing_before)

In [None]:
# Drop irrelevant columns (optional based on experiment goal)
# errors='ignore' keeps the cell re-runnable: once the columns are gone, a
# second run is a no-op instead of raising KeyError.
df = df.drop(
    columns=['deck', 'embark_town', 'alive', 'class', 'who', 'adult_male', 'alone'],
    errors='ignore',
)

In [None]:
# Fill missing values
# Fill age with the mean of the ages that are present.
df['age'] = df['age'].fillna(df['age'].mean())

In [None]:
# Replace missing embarkation ports with the most frequent port.
most_common_port = df['embarked'].mode()[0]
df['embarked'] = df['embarked'].fillna(most_common_port)

In [None]:
# Discard every row that still has at least one missing value.
df = df.dropna(axis=0, how='any')

In [None]:
# Encode the two categorical columns as integer codes (optional).
# NOTE(review): run this cell once per load — values not present in the
# mappings (including already-encoded ones) come back as NaN from .map().
sex_codes = {'male': 0, 'female': 1}
port_codes = {'S': 0, 'C': 1, 'Q': 2}
df['sex'] = df['sex'].map(sex_codes)
df['embarked'] = df['embarked'].map(port_codes)

In [None]:
# View cleaned data info.
# df.info() writes the summary itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line.
print("\nCleaned Dataset Info:")
df.info()

In [None]:
# Re-count missing entries per column to confirm the cleaning result.
print("\nMissing Values (After Cleaning):")
missing_after = df.isna().sum()
print(missing_after)

In [None]:
# Persist the cleaned frame in both formats, without the row labels.
df.to_csv(path_or_buf="cleaned_titanic.csv", index=False)
df.to_excel(excel_writer="cleaned_titanic.xlsx", index=False)

In [None]:
# Send both cleaned exports to the local machine (Google Colab only).
from google.colab import files
for cleaned_file in ("cleaned_titanic.csv", "cleaned_titanic.xlsx"):
    files.download(cleaned_file)

**Step 10: Data Preparation Activities**

In [None]:
# 10a. Keep only the rows whose 'survived' flag equals 1.
print("\nFiltering passengers who survived:")
survived = df.loc[df['survived'].eq(1)]
print(survived.loc[:, ['sex', 'age', 'pclass', 'survived']].head())

In [None]:
# 10b. Number of survivors for every (sex, pclass) combination.
# Summing the 0/1 'survived' flag within a group counts its survivors.
print("\n Grouping by 'sex' and 'pclass' to count survivors:")
survivors_by_group = df.groupby(by=['sex', 'pclass'])['survived']
grouped = survivors_by_group.sum()
print(grouped)

In [None]:
# 10c. Order passengers from youngest to oldest and show the youngest few.
print("\nSorting passengers by age:")
sorted_df = df.sort_values(by='age', ascending=True)
print(sorted_df.loc[:, ['age', 'sex']].head())

**Step 11: Data Visualization**

In [None]:
# Bar plot of survival by gender.
# Drawn on its own figure: figures are rendered per cell, so a subplot grid
# started here cannot be continued by later cells.
plt.figure(figsize=(6, 4))
sns.countplot(x='sex', hue='survived', data=df)
plt.title("Survival Count by Gender")
plt.show()

In [None]:
# Histogram of age.
# Drawn on its own figure: figures are rendered per cell, so plt.subplot()
# here would not join a grid created in an earlier cell.
plt.figure(figsize=(6, 4))
df['age'].dropna().hist(bins=20, edgecolor='g')
plt.title("Age Distribution")
plt.show()

In [None]:
# Distribution plot: fare histogram with a KDE overlay.
# Drawn on its own figure: figures are rendered per cell, so plt.subplot()
# here would not join a grid created in an earlier cell.
plt.figure(figsize=(6, 4))
sns.histplot(df['fare'], kde=True)
plt.title("Fare Distribution with KDE")
plt.show()

In [None]:
# Box plot of age by class.
# Drawn on its own figure: figures are rendered per cell, so plt.subplot()
# here would not join a grid created in an earlier cell.
plt.figure(figsize=(6, 4))
sns.boxplot(x='pclass', y='age', data=df)
plt.title("Box Plot of Age by Class")
plt.show()

In [None]:
# Scatter plot of age vs fare colored by survival.
# Drawn on its own figure: figures are rendered per cell, so plt.subplot()
# here would not join a grid created in an earlier cell.
plt.figure(figsize=(6, 4))
sns.scatterplot(x='age', y='fare', hue='survived', data=df)
plt.title("Age vs Fare (Survival)")

plt.tight_layout()
plt.show()

**Step 12: Extended Visualizations**

In [None]:
# Bar height is the mean of the 0/1 'survived' flag per passenger class,
# i.e. the survival rate of that class.
sns.barplot(x='pclass', y='survived', data=df)

In [None]:
# Pie chart of overall survival distribution.
# value_counts() orders its result by frequency, so the counts are re-indexed
# to a fixed 0/1 order; this guarantees 'No' labels class 0 and 'Yes' labels
# class 1 whatever the class balance of the current frame is.
survival_counts = df['survived'].value_counts().reindex([0, 1], fill_value=0)
plt.pie(survival_counts, labels=['No', 'Yes'], autopct='%1.1f%%')

In [None]:
# Age distribution per gender; split=True puts the two 'survived' groups on
# opposite halves of each violin.
sns.violinplot(x='sex', y='age', hue='survived', split=True, data=df)

In [None]:
# Pairwise correlations of the numeric columns, drawn as an annotated heatmap.
corr_matrix = df.corr(numeric_only=True)
sns.heatmap(corr_matrix, cmap='coolwarm', annot=True)

In [None]:
# Grid of age histograms: one row per 'sex' value, one column per 'survived' value.
g = sns.FacetGrid(data=df, row="sex", col="survived", margin_titles=True)
g.map_dataframe(sns.histplot, x="age", bins=20)

In [None]:
# Age vs fare: color encodes passenger class, marker style encodes survival.
sns.scatterplot(x='age', y='fare', hue='pclass', style='survived', data=df)

In [None]:
# One point per passenger: age within each class, colored by gender.
sns.swarmplot(x='pclass', y='age', hue='sex', data=df)

In [None]:
################################################################################
# Basic Data Exploration
################################################################################
# Each entry pairs a heading with the view printed under it.
exploration_views = [
    ("Dataset Head:\n", df.head()),
    ("Dataset Tail:\n", df.tail()),
    ("Summary Statistics:\n", df.describe(include='all')),
    ("Missing Values:\n", df.isnull().sum()),
    ("Column Data Types:\n", df.dtypes),
]
for heading, view in exploration_views:
    print(heading, view, "\n")

# info() prints its own report, so it is called outside the loop.
df.info()

In [None]:
################################################################################
# Data Visualization (multiple subplots)
################################################################################
# One 15x10 figure holding five panels laid out on a 2x3 grid; every panel is
# drawn on an explicitly created Axes rather than on the implicit current one.
fig = plt.figure(figsize=(15, 10))

# 11a. Bar Plot: Survival count by gender
ax = fig.add_subplot(2, 3, 1)
sns.countplot(x='sex', hue='survived', data=df, ax=ax)
ax.set_title("Survival Count by Gender")

# 11b. Histogram: Age distribution
ax = fig.add_subplot(2, 3, 2)
df['age'].dropna().hist(bins=20, edgecolor='black', ax=ax)
ax.set_title("Age Distribution")

# 11c. KDE Histogram: Fare distribution
ax = fig.add_subplot(2, 3, 3)
sns.histplot(df['fare'], kde=True, ax=ax)
ax.set_title("Fare Distribution with KDE")

# 11d. Box Plot: Age vs Class
ax = fig.add_subplot(2, 3, 4)
sns.boxplot(x='pclass', y='age', data=df, ax=ax)
ax.set_title("Box Plot of Age by Class")

# 11e. Scatter Plot: Age vs Fare colored by survival
ax = fig.add_subplot(2, 3, 5)
sns.scatterplot(x='age', y='fare', hue='survived', data=df, ax=ax)
ax.set_title("Age vs Fare (Survival)")

fig.tight_layout()
plt.show()

In [None]:
# Survival rate by passenger class.
# The cleaning step drops the categorical 'class' column, so the numeric
# 'pclass' column (which carries the same grouping) is used here; referring
# to 'class' would fail on the cleaned frame.
sns.barplot(data=df, x='pclass', y='survived')
plt.title("Survival Rate by Class")
plt.show()

# Pie chart of overall survival distribution.
# value_counts() orders by frequency, so the counts are re-indexed to a fixed
# 0/1 order to guarantee each wedge gets the matching label.
survived_counts = df['survived'].value_counts().reindex([0, 1], fill_value=0)
plt.pie(survived_counts, labels=['Did Not Survive', 'Survived'], autopct='%1.1f%%')
plt.title("Survival Distribution")
plt.show()

# Violin plot: Age & survival by gender (split=True puts the two 'survived'
# groups on opposite halves of each violin).
sns.violinplot(data=df, x='sex', y='age', hue='survived', split=True)
plt.title("Age & Survival by Gender")
plt.show()

# Correlation heatmap (numeric features only).
sns.heatmap(df.corr(numeric_only=True), annot=True, cmap='coolwarm')
plt.title("Correlation Heatmap")
plt.show()