In [27]:
import pandas as pd
import altair as alt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, AdaBoostClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Show complete frames when printing (no row/column truncation)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# Load both sheets with a single read of the workbook: passing a list to
# `sheet_name` makes read_excel return a dict keyed by sheet name.
GLASS_XLSX_PATH = 'D:/Data science ass/14/Random Forest/glass.xlsx'
workbook_sheets = pd.read_excel(GLASS_XLSX_PATH, sheet_name=['glass', 'Description'])
df_glass = workbook_sheets['glass']
df_description = workbook_sheets['Description']

# Display the first 5 rows of each DataFrame
print("First 5 rows of glass DataFrame:")
print(df_glass.head())

print("\nFirst 5 rows of description DataFrame:")
print(df_description.head())

# Column names and dtypes. DataFrame.info() writes its report itself and
# returns None, so it is called directly; wrapping it in print() would add a
# stray "None" line after each report.
print("\nColumn names and their data types for glass DataFrame:")
df_glass.info()

print("\nColumn names and their data types for description DataFrame:")
df_description.info()

# Count missing values per column
missing_values = df_glass.isnull().sum()
print("\nMissing Values per column:")
print(missing_values)

# Summary statistics of the numeric columns
summary_stats = df_glass.describe()
print("\nSummary Statistics:")
print(summary_stats)

# Boxplots for every column except the `Type` target. The frame is melted to
# long format (one `variable`/`value` pair per row) so one chart covers all
# attributes.
columns_to_plot = df_glass.columns.drop('Type')
boxplots = alt.Chart(
    df_glass.melt(value_vars=columns_to_plot),
    title='Boxplots for Glass Attributes',
).mark_boxplot().encode(
    x=alt.X('variable:N', title='Attribute'),
    y=alt.Y('value:Q'),
    tooltip=['variable', 'value'],
).interactive()

# Display the boxplots
boxplots.display()

# Cap (winsorize) every feature column at the 1.5 * IQR fences.
# NOTE(review): the fences are computed on the full dataset, before the
# train/test split further down, so test rows influence them.
feature_columns = df_glass.columns.drop('Type')
feature_frame = df_glass[feature_columns]

# Quartiles and interquartile range per feature
Q1 = feature_frame.quantile(0.25)
Q3 = feature_frame.quantile(0.75)
IQR = Q3 - Q1

# Fences beyond which a value counts as an outlier
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Pull each out-of-fence value back to the nearest fence
for col in feature_columns:
    low, high = lower_bound[col], upper_bound[col]
    outliers = df_glass[col].lt(low) | df_glass[col].gt(high)
    num_outliers = outliers.sum()
    df_glass.loc[outliers, col] = df_glass[col].clip(low, high)

    print(f"Number of outliers replaced in {col}: {num_outliers}")

# Target is the `Type` column; every other column is a feature
y = df_glass['Type']
X = df_glass.drop(columns='Type')

# Hold out 20% of the rows for testing (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the scaler on the training rows only, then apply it to both splits
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Report the shape of each split
for label, data in (
    ("\nShape of X_train", X_train_scaled),
    ("Shape of X_test", X_test_scaled),
    ("Shape of y_train", y_train),
    ("Shape of y_test", y_test),
):
    print(f"{label}: {data.shape}")

def _weighted_scores(y_true, y_hat):
    """Return (accuracy, precision, recall, f1) for the given predictions.

    Precision, recall and F1 use `average='weighted'`. Precision is computed
    with `zero_division=1` for every model so that all models are scored under
    the same policy and a class that is never predicted does not raise an
    undefined-metric warning.
    """
    return (
        accuracy_score(y_true, y_hat),
        precision_score(y_true, y_hat, average='weighted', zero_division=1),
        recall_score(y_true, y_hat, average='weighted'),
        f1_score(y_true, y_hat, average='weighted'),
    )


# Baseline Random Forest (default class weights)
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train_scaled, y_train)

# Predict on the held-out test set and score
y_pred = rf_model.predict(X_test_scaled)
accuracy, precision, recall, f1 = _weighted_scores(y_test, y_pred)

print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")

# Print the class distribution
print("\nClass Distribution:")
print(y.value_counts())

# Second Random Forest with class_weight='balanced'
rf_model_balanced = RandomForestClassifier(random_state=42, class_weight='balanced')
rf_model_balanced.fit(X_train_scaled, y_train)

# Predict on the held-out test set
y_pred_balanced = rf_model_balanced.predict(X_test_scaled)

# Repeat the baseline scores directly above the balanced ones for side-by-side
# comparison. They are already computed above, so they are not recomputed.
print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")

# Score the class_weight='balanced' model
accuracy_balanced, precision_balanced, recall_balanced, f1_balanced = _weighted_scores(
    y_test, y_pred_balanced
)

print(f"\nAccuracy with class_weight='balanced': {accuracy_balanced:.2f}")
print(f"Precision with class_weight='balanced': {precision_balanced:.2f}")
print(f"Recall with class_weight='balanced': {recall_balanced:.2f}")
print(f"F1 Score with class_weight='balanced': {f1_balanced:.2f}")

# Train the Bagging model. Without this step `y_pred_bagging` is undefined and
# the evaluation below raises NameError on a fresh kernel.
# n_estimators=100 is a chosen value, not recovered from the original run.
bagging_model = BaggingClassifier(n_estimators=100, random_state=42)
bagging_model.fit(X_train_scaled, y_train)
y_pred_bagging = bagging_model.predict(X_test_scaled)

# Evaluate the Bagging model
accuracy_bagging = accuracy_score(y_test, y_pred_bagging)
precision_bagging = precision_score(y_test, y_pred_bagging, average='weighted', zero_division=1)
recall_bagging = recall_score(y_test, y_pred_bagging, average='weighted')
f1_bagging = f1_score(y_test, y_pred_bagging, average='weighted')

print(f"\nAccuracy with Bagging: {accuracy_bagging:.2f}")
print(f"Precision with Bagging: {precision_bagging:.2f}")
print(f"Recall with Bagging: {recall_bagging:.2f}")
print(f"F1 Score with Bagging: {f1_bagging:.2f}")

# Train the Boosting model (AdaBoost with its default settings). Without this
# step `y_pred_boosting` is undefined as well.
boosting_model = AdaBoostClassifier(random_state=42)
boosting_model.fit(X_train_scaled, y_train)
y_pred_boosting = boosting_model.predict(X_test_scaled)

# Evaluate the Boosting model
accuracy_boosting = accuracy_score(y_test, y_pred_boosting)
precision_boosting = precision_score(y_test, y_pred_boosting, average='weighted', zero_division=1)
recall_boosting = recall_score(y_test, y_pred_boosting, average='weighted')
f1_boosting = f1_score(y_test, y_pred_boosting, average='weighted')

print(f"\nAccuracy with Boosting: {accuracy_boosting:.2f}")
print(f"Precision with Boosting: {precision_boosting:.2f}")
print(f"Recall with Boosting: {recall_boosting:.2f}")
print(f"F1 Score with Boosting: {f1_boosting:.2f}")


First 5 rows of glass DataFrame:
        RI     Na    Mg    Al     Si     K    Ca   Ba   Fe  Type
0  1.52101  13.64  4.49  1.10  71.78  0.06  8.75  0.0  0.0     1
1  1.51761  13.89  3.60  1.36  72.73  0.48  7.83  0.0  0.0     1
2  1.51618  13.53  3.55  1.54  72.99  0.39  7.78  0.0  0.0     1
3  1.51766  13.21  3.69  1.29  72.61  0.57  8.22  0.0  0.0     1
4  1.51742  13.27  3.62  1.24  73.08  0.55  8.07  0.0  0.0     1

First 5 rows of description DataFrame:
  Prepare a model for glass classification using Random Forest
0                                  Data Description:          
1                              RI : refractive index          
2  Na: Sodium (unit measurement: weight percent i...          
3                                      Mg: Magnesium          
4                                       AI: Aluminum          

Column names and their data types for glass DataFrame:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 214 entries, 0 to 213
Data columns (total 10 columns)

Number of outliers replaced in RI: 17
Number of outliers replaced in Na: 7
Number of outliers replaced in Mg: 0
Number of outliers replaced in Al: 18
Number of outliers replaced in Si: 12
Number of outliers replaced in K: 7
Number of outliers replaced in Ca: 26
Number of outliers replaced in Ba: 38
Number of outliers replaced in Fe: 12

Shape of X_train: (171, 9)
Shape of X_test: (43, 9)
Shape of y_train: (171,)
Shape of y_test: (43,)
Accuracy: 0.81
Precision: 0.84
Recall: 0.81
F1 Score: 0.81

Class Distribution:
2    76
1    70
7    29
3    17
5    13
6     9
Name: Type, dtype: int64
Accuracy: 0.81
Precision: 0.84
Recall: 0.81
F1 Score: 0.81

Accuracy with class_weight='balanced': 0.79
Precision with class_weight='balanced': 0.82
Recall with class_weight='balanced': 0.79
F1 Score with class_weight='balanced': 0.78

Accuracy with Bagging: 0.77
Precision with Bagging: 0.77
Recall with Bagging: 0.77
F1 Score with Bagging: 0.76

Accuracy with Boosting: 0.35
Precision with Boosting: 0.38
R

1. Explain Bagging and Boosting Methods. How are they different from each other?

Bagging (Bootstrap Aggregating):

Description: Bagging is an ensemble method that aims to improve the stability and accuracy of machine learning models. It involves generating multiple subsets of the training data by random sampling with replacement (bootstrap sampling). A model (often a decision tree) is trained on each subset, and the final prediction is made by aggregating the predictions of all models, usually by voting (for classification) or averaging (for regression).

Key Characteristics:

Reduces variance: Bagging helps reduce overfitting by averaging out the predictions of the models.

Models are trained independently of each other.

Common example: Random Forest, which bags many decision trees and additionally considers only a random subset of features at each split to decorrelate the trees.

Boosting:

Description: Boosting is another ensemble technique in which models are trained sequentially, with each new model attempting to correct the errors made by the previous ones. Unlike bagging, where models are trained independently, each model in the boosting sequence gives more weight to the samples misclassified by the previous model. The final prediction is made by combining the predictions of all models, often using weighted voting or averaging.

Key Characteristics:

Reduces bias: Boosting improves model performance by focusing on difficult-to-predict instances.

Models are trained sequentially, with each one learning from the mistakes of the previous ones.

Common examples: AdaBoost, Gradient Boosting Machines (GBM), XGBoost.

Differences Between Bagging and Boosting:

- Model Independence:

Bagging: Models are trained independently of each other.

Boosting: Models are trained sequentially, with each model dependent on the performance of the previous ones.

- Focus:

Bagging: Aims to reduce variance by averaging the predictions of many models.

Boosting: Aims to reduce bias by focusing on and correcting the errors of previous models.

- Handling of Errors:

Bagging: Each model has an equal say in the final prediction.

Boosting: Models are weighted in the final prediction according to their performance (for example, AdaBoost gives more accurate learners a larger vote), while during training the samples misclassified by earlier models receive more weight.

- Training Process:

Bagging: Models can be trained in parallel, because each one is fit independently on its own bootstrap sample.

Boosting: Models are trained sequentially.

2. Explain How to Handle Imbalance in the Data

Imbalanced data refers to situations where one class significantly outnumbers the other(s) in a classification problem. Handling this imbalance is crucial because it can lead to biased models that perform poorly on the minority class.

Techniques to Handle Imbalanced Data:

- Resampling Techniques:

Oversampling the Minority Class:
Increase the number of instances of the minority class by duplicating existing instances or by generating synthetic examples using methods like SMOTE (Synthetic Minority Over-sampling Technique).

Undersampling the Majority Class:
Reduce the number of instances of the majority class by randomly removing instances to balance the class distribution. This method may lead to loss of important information.

- Using Different Metrics:

Evaluation Metrics: Accuracy can be misleading on imbalanced data, because a model that always predicts the majority class still scores highly. Instead, use metrics like Precision, Recall, F1-Score, or the Area Under the Precision-Recall Curve (AUC-PR).
Confusion Matrix: Use a confusion matrix to better understand the performance on each class.

- Cost-Sensitive Learning:

Adjusting Model Parameters: Some algorithms allow you to assign higher misclassification costs to the minority class, which helps the model to be more cautious about misclassifying the minority class.
Custom Loss Functions: You can design custom loss functions that penalize the model more for misclassifying the minority class.

- Ensemble Methods:

Balanced Random Forest: An extension of Random Forest where each tree is trained on a balanced bootstrapped sample.
Boosting with Class Weights: Algorithms like AdaBoost, XGBoost, or LightGBM can be adapted to handle imbalanced data by assigning higher weights to the minority class during training.

- Anomaly Detection:

Treating the Minority Class as Anomalies: In some cases, the minority class can be treated as an anomaly or outlier, and anomaly detection techniques can be used to identify these instances.

- Generating Synthetic Data:

SMOTE (Synthetic Minority Over-sampling Technique): Generates synthetic examples by interpolating between existing minority class instances. This helps in making the minority class more representative without simply duplicating instances.

- Hybrid Methods:

Combination of Oversampling and Undersampling: Use a combination of both techniques to create a balanced dataset that retains important information from both classes.