In [None]:
# Setup: install the Kaggle command-line client and configure its API token.
!pip install -q kaggle
# The line above installs the `kaggle` Python package, whose command-line
# client is used at the end of this cell to download the competition data.

from google.colab import files
files.upload()  # Opens a file picker; select your kaggle.json API token

# Copy the uploaded token into ~/.kaggle and restrict its permissions to the
# current user (mode 600).
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

# Download the Titanic competition archive and quietly extract it into the
# current working directory.
!kaggle competitions download -c titanic
!unzip -q titanic.zip


In [None]:
# ✅ 2. Import libraries and load data
import os

import pandas as pd
import numpy as np

# train.csv is normally produced by unzipping titanic.zip in the setup cell.
# Only fall back to a manual upload when the file is missing, so that a
# "Run All" does not stall on a redundant file picker.
if not os.path.exists('train.csv'):
    from google.colab import files
    files.upload()  # Select train.csv from your machine

titanic_data = pd.read_csv('train.csv')
print(titanic_data.head(6))



In [None]:
# DataFrame.info() writes its summary straight to stdout and returns None, so
# wrapping it in print() would append a stray "None" line to the output.
titanic_data.info()


In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) as a plain-text table.
print(titanic_data.describe())

In [None]:
# Simulate extra demographic data (for integration example).
# A locally seeded generator makes the simulated flags identical on every run
# without touching NumPy's global random state.
demographics = titanic_data[['PassengerId']].copy()
demographics['Has_Pets'] = np.random.default_rng(42).choice(
    [True, False], size=len(demographics)
)

# Merge. Any Has_Pets column left over from a previous run of this cell is
# dropped first; otherwise pandas would emit Has_Pets_x / Has_Pets_y and the
# column selection below would raise a KeyError.
titanic_data = pd.merge(
    titanic_data.drop(columns='Has_Pets', errors='ignore'),
    demographics,
    on='PassengerId',
    how='left',
)
print(titanic_data[['PassengerId', 'Has_Pets']].head())

# how='left' keeps every row of the left frame (titanic_data) and attaches the
# matching rows from the right frame (demographics); left rows without a match
# would receive NaN in the columns coming from the right frame.


In [None]:
## 2. Append a copy of the first row so there is a known duplicate to detect
titanic_data = pd.concat(
    [titanic_data, titanic_data.iloc[[0]]],
    ignore_index=True,
)

# Boolean mask: True for every row that repeats an earlier, identical row
duplicate_mask = titanic_data.duplicated(keep='first')

# Report how many rows were flagged
print("Number of duplicate rows:", duplicate_mask.sum())

# Show the flagged rows themselves
print(titanic_data.loc[duplicate_mask])


In [None]:
# Remove duplicates: keep the first occurrence of each fully identical row
titanic_data = titanic_data.drop_duplicates(keep='first')

# Re-check the cleaned frame for rows that still repeat an earlier one
print("Remaining duplicates:", titanic_data.duplicated(keep='first').sum())

In [None]:

# Remove irrelevant columns; errors='ignore' makes this a no-op for any
# column that is already absent (e.g. when the cell is run a second time).
titanic_data = titanic_data.drop(columns=['Cabin', 'Ticket'], errors='ignore')
print("Columns after removal:", titanic_data.columns)

In [None]:
# Fixing structural errors - simulate a 'Date' column holding one consecutive
# calendar day per row (starting 1 January 1912) and make sure it is stored
# with a proper datetime dtype.
titanic_data['Date'] = pd.to_datetime(
    pd.date_range(start='1/1/1912', periods=len(titanic_data), freq='D')
)
print(titanic_data[['Date']].head())

import pandas as pd

In [None]:




# Element-wise missing-value mask: True wherever a cell is missing
missing_data = titanic_data.isna()
print(missing_data.head())

# Summing the mask column by column gives the number of missing values per column
missing_counts = missing_data.sum()
print(missing_counts)

In [None]:



# Identify missing values: number of missing entries in each column
print("Missing values per column:")
print(titanic_data.isna().sum())

In [None]:
# Identify missing values
print("Missing values per column:")
print(titanic_data.isna().sum())

# Strategy 1 - fill with the mean.
# assign() returns a new frame, so titanic_data itself is left untouched.
titanic_filled = titanic_data.assign(
    Age=lambda frame: frame['Age'].fillna(frame['Age'].mean())
)
print("Nulls in Age after filling:", titanic_filled['Age'].isna().sum())


# Strategy 2 - drop every row that has at least one missing value
titanic_dropna = titanic_data.dropna(axis=0, how='any')
print("Shape after dropping rows with missing data:", titanic_dropna.shape)


KNNImputer is a technique for filling in missing values based on the values of the k-nearest neighbors (rows that are similar in other features). It’s more intelligent than filling with mean or median, because it accounts for relationships between features.

✅ Create the imputer object.

n_neighbors=3 means for each missing value, the algorithm will:

Find the 3 rows most similar to the one with missing data (based on other numeric columns).

Use their values to compute the average and fill in the missing value.


 Filter only numeric columns, because KNNImputer can only work with numbers.
This ensures we don’t feed in strings or categories like 'Sex' or 'Embarked'.


Perform the imputation:

fit_transform() does two things:

Fit: Learns how to find neighbors based on available (non-missing) data.

Transform: Fills in the missing values using neighbor-based averages.

The result is a NumPy array with all missing values filled.

✅ Convert the result back to a DataFrame, keeping the original column names.

✅ Just to check: Print the first few rows of the cleaned, imputed data.

In [None]:
from sklearn.impute import KNNImputer

# KNN imputation: each missing numeric value is filled from the 3 rows that
# are closest in the remaining numeric columns.
imputer = KNNImputer(n_neighbors=3)
# KNNImputer only accepts numeric input, so text columns are left out.
df_numeric = titanic_data.select_dtypes(include=['float64', 'int64'])
imputed_array = imputer.fit_transform(df_numeric)
# fit_transform returns a bare NumPy array. Re-attach the column names AND the
# original row index so df_imputed stays aligned with titanic_data even when
# its index is not the default 0..n-1 range (e.g. after rows were dropped).
df_imputed = pd.DataFrame(
    imputed_array,
    columns=df_numeric.columns,
    index=df_numeric.index,
)
print(df_imputed.head())


In [None]:
# First and third quartile of Fare, and the interquartile range between them
Q1, Q3 = titanic_data['Fare'].quantile([0.25, 0.75])
IQR = Q3 - Q1

# Anything further than 1.5 * IQR outside the quartiles counts as an outlier
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Keep the rows whose Fare lies within the bounds (both ends inclusive)
filtered_fare = titanic_data.loc[
    titanic_data['Fare'].between(lower_bound, upper_bound)
]
print("Original shape:", titanic_data.shape)
print("Shape after removing fare outliers:", filtered_fare.shape)


What’s going on?
Q1 (25th percentile) is the value below which 25% of the data falls.

Q3 (75th percentile) is the value below which 75% of the data falls.

IQR (Interquartile Range) is the difference between Q3 and Q1:

$$\mathrm{IQR} = Q_3 - Q_1$$
It measures the spread of the middle 50% of the data.


What’s going on?
These formulas define the acceptable range for Age.

Any value outside this range is considered an outlier.

$$\text{Lower Bound} = Q_1 - 1.5 \times \mathrm{IQR}$$

$$\text{Upper Bound} = Q_3 + 1.5 \times \mathrm{IQR}$$
This "1.5 * IQR" rule is a common statistical convention:

Too far below = unusually small → outlier

Too far above = unusually large → outlier

In [None]:
# 1. Quartiles of Age and the interquartile range between them
Q1_age, Q3_age = titanic_data['Age'].quantile([0.25, 0.75])
IQR_age = Q3_age - Q1_age

# 2. Bounds: 1.5 * IQR below Q1 and above Q3
lower_age = Q1_age - 1.5 * IQR_age
upper_age = Q3_age + 1.5 * IQR_age

# 3. Count outliers: rows whose Age is strictly outside the bounds.
#    Rows with a missing Age compare False on both sides and are not counted.
age_outliers = titanic_data.loc[
    titanic_data['Age'].lt(lower_age) | titanic_data['Age'].gt(upper_age)
]
print("Number of Age outliers:", len(age_outliers))
print(age_outliers[['Age']].head())


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA

# Create a sample dataset: ten synthetic customers, reproducible via a fixed seed.
np.random.seed(42)
# Keyword order matters twice over: it fixes the column order of the frame and
# the order in which the random draws consume the seeded generator.
data = dict(
    CustomerID=range(1, 11),
    Age=np.random.randint(18, 70, size=10),
    Annual_Income=np.random.randint(30000, 120000, size=10),
    Spending_Score=np.random.randint(1, 100, size=10),
    Purchase_Amount=np.random.randint(100, 1000, size=10),
    Purchase_Date=pd.date_range(start='2024-01-01', periods=10, freq='15D'),
)
df = pd.DataFrame(data)
df




✅ 1. Data Normalization
What it is: Rescaling features to a standard range (typically 0 to 1) so that no feature dominates others due to its scale.

Why it's important: Algorithms like KNN, clustering, and PCA are sensitive to scale — normalization helps them treat all features equally.

In the example: We normalized Age, Annual_Income, and Spending_Score using MinMaxScaler to create new columns ending in _norm.



In [None]:

# 1. Data normalization (min-max scaling): rescale the three source columns
#    and store the results next to the originals under a "_norm" suffix.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(
    df[['Age', 'Annual_Income', 'Spending_Score']]
)
df[['Age_norm', 'Annual_Income_norm', 'Spending_Score_norm']] = scaled_values
df


✅ 2. Data Reduction using PCA
What it is: Reduces the number of columns (features) while keeping most of the information.

Why it's important: Makes data easier to visualize and can improve performance in high-dimensional datasets.

In the example: We used PCA to reduce the normalized features to two components (PCA1, PCA2) — these represent the most meaningful variance in the data.

In [None]:

# 2. Data reduction using PCA: project the three normalised features onto
#    two principal components.
pca = PCA(n_components=2)
pca_features = pca.fit_transform(
    df.loc[:, ['Age_norm', 'Annual_Income_norm', 'Spending_Score_norm']]
)
# Column 0 of the result goes to PCA1, column 1 to PCA2
df['PCA1'] = pca_features[:, 0]
df['PCA2'] = pca_features[:, 1]
df

In [None]:
# Share of the total variance captured by each of the fitted principal components
print(pca.explained_variance_ratio_)


In [None]:
import matplotlib.pyplot as plt

# Scatter plot of PCA1 against PCA2, drawn through the explicit
# figure/axes interface instead of the implicit pyplot state machine.
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(df['PCA1'], df['PCA2'], c='blue', edgecolors='k', s=80)
ax.set_title('PCA: Data Projection onto 2 Principal Components')
ax.set_xlabel('Principal Component 1 (PCA1)')
ax.set_ylabel('Principal Component 2 (PCA2)')
ax.grid(True)
fig.tight_layout()
plt.show()


✅ 3. Data Aggregation
What it is: Summarizing data — typically grouping by a category or time period and applying aggregation functions (e.g., sum, mean).

Why it's important: Allows simplification of raw data for trend analysis, reporting, or dashboard creation.

In the example: We aggregated the Purchase_Amount by month using .groupby() to see how total purchases vary over time.

In [None]:

# 3. Data Aggregation - group by month and summarise Purchase_Amount
#    (total, smallest and largest purchase of each month).
df['Month'] = df['Purchase_Date'].dt.to_period('M')
# Group on Month only: CustomerID is unique per row in this sample, so using
# it as a second grouping key produces one-row groups in which min == max and
# nothing is actually aggregated.
monthly_agg = df.groupby('Month').agg(
    Purchase_Amount_total=('Purchase_Amount', 'sum'),
    Purchase_Amount_min=('Purchase_Amount', 'min'),
    Purchase_Amount_max=('Purchase_Amount', 'max'),
).reset_index()

print(monthly_agg)


✅ 1. Data Normalization
What it is: Rescaling features to a standard range (typically 0 to 1) so that no feature dominates others due to its scale.

Why it's important: Algorithms like KNN, clustering, and PCA are sensitive to scale — normalization helps them treat all features equally.

In the example: We normalized Age, Annual_Income, and Spending_Score using MinMaxScaler to create new columns ending in _norm.



In [None]:
# Load the weather history dataset
import os

import pandas as pd
import numpy as np

# weatherHistory.csv is not part of the Kaggle download done in the setup
# cell, so it has to be uploaded by hand. Prompt only when the file is not
# already in the working directory, so re-running the cell does not ask again.
if not os.path.exists('weatherHistory.csv'):
    from google.colab import files
    files.upload()  # Select weatherHistory.csv from your machine

weather_data = pd.read_csv('weatherHistory.csv')
print(weather_data.head(6))



In [None]:
# DataFrame.info() writes its summary straight to stdout and returns None, so
# wrapping it in print() would append a stray "None" line to the output.
weather_data.info()

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) as a plain-text table.
print(weather_data.describe())

In [None]:
# Preview the first five rows; as the cell's last expression it is rendered as a table.
weather_data.head()

In [None]:
# 1. Data normalization (min-max scaling) of three weather measurements;
#    the scaled values are stored under the source name plus "_norm".
scaler = MinMaxScaler()
weather_scaled = scaler.fit_transform(
    weather_data[['Temperature (C)', 'Wind Speed (km/h)', 'Pressure (millibars)']]
)
weather_data[
    ['Temperature (C)_norm', 'Wind Speed (km/h)_norm', 'Pressure (millibars)_norm']
] = weather_scaled
weather_data

In [None]:
# 2. Data reduction using PCA: project the three normalised measurements
#    onto two principal components.
pca = PCA(n_components=2)
pca_features = pca.fit_transform(
    weather_data.loc[
        :,
        ['Temperature (C)_norm', 'Wind Speed (km/h)_norm', 'Pressure (millibars)_norm'],
    ]
)
# Column 0 of the result goes to PCA1, column 1 to PCA2
weather_data['PCA1'] = pca_features[:, 0]
weather_data['PCA2'] = pca_features[:, 1]
weather_data

In [None]:
import matplotlib.pyplot as plt
# Scatter plot of the weather observations in the PCA1/PCA2 plane, built with
# the explicit figure/axes interface.
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(
    weather_data['PCA1'],
    weather_data['PCA2'],
    c='blue',
    edgecolors='k',
    s=80,
)
ax.set(
    title='PCA: Data Projection onto 2 Principal Components',
    xlabel='Principal Component 1 (PCA1)',
    ylabel='Principal Component 2 (PCA2)',
)
ax.grid(True)
fig.tight_layout()
plt.show()

In [None]:
 # 3. Data Aggregation - Group by Month and calculate the average Temperature (C)
# Parse the timestamps as timezone-aware UTC datetimes ...
weather_data['Formatted Date'] = pd.to_datetime(
    weather_data['Formatted Date'],
    utc=True,
)
# ... and label every row with its year-month period.
weather_data['Month'] = weather_data['Formatted Date'].dt.to_period(freq='M')

# Mean temperature per month (the variable name says "daily", but the
# grouping key is the monthly period created above).
daily_agg = weather_data.groupby('Month')[['Temperature (C)']].mean()
daily_agg


In [None]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA


# 1. Data Normalization
# Scale only the original measurements. Earlier cells (and earlier runs of
# this one) add "*_norm", "PCA1" and "PCA2" columns to weather_data; feeding
# those back in would create "*_norm_norm" / "PCA1_norm" columns, grow the
# frame on every re-run and leak derived features into the PCA below.
scaler = MinMaxScaler()
derived_cols = {'PCA1', 'PCA2'}
numerical_cols = pd.Index([
    col for col in weather_data.select_dtypes(include='number').columns
    if not str(col).endswith('_norm') and col not in derived_cols
])
norm_cols = [col + '_norm' for col in numerical_cols]
weather_data_normalized = scaler.fit_transform(weather_data[numerical_cols])
# Assign the scaled array column by column: this is purely positional, so it
# cannot misalign rows when weather_data does not have a default 0..n-1 index.
for position, norm_col in enumerate(norm_cols):
    weather_data[norm_col] = weather_data_normalized[:, position]

# 2. PCA for dimensionality reduction (on the freshly normalised columns only)
pca = PCA(n_components=2)
pca_result = pca.fit_transform(weather_data[norm_cols])
weather_data[['PCA1', 'PCA2']] = pca_result

# 3. Aggregation: average apparent temperature and humidity per daily summary
weather_agg = weather_data.groupby('Daily Summary').agg(
    Avg_Temp=('Apparent Temperature (C)', 'mean'),
    Avg_Humidity=('Humidity', 'mean')
).reset_index()
# Show a preview so the cell produces a visible result
weather_agg.head()

