<a href="https://colab.research.google.com/github/sarveshgadkari/ML/blob/main/ML.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Linear Regression**

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt

# --- Load -------------------------------------------------------------------
# The housing data is read from a CSV file on the local disk.
df = pd.read_csv("C:/Users/Lenovo/Desktop/ML/Untitled Folder/Housing.csv")

# --- Encode -----------------------------------------------------------------
# yes/no columns become 1/0 (any other value maps to NaN).
binary_cols = ['mainroad', 'guestroom', 'basement', 'hotwaterheating', 'airconditioning', 'prefarea']
yes_no_codes = {'yes': 1, 'no': 0}
df = df.assign(**{name: df[name].map(yes_no_codes) for name in binary_cols})

# 'furnishingstatus' is one-hot encoded, with the first level dropped.
df = pd.get_dummies(df, columns=['furnishingstatus'], drop_first=True)

# --- Features / target ------------------------------------------------------
y = df['price']
X = df.drop(columns=['price'])

# --- Hold out 20% of the rows for evaluation --------------------------------
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# --- Fit and predict --------------------------------------------------------
model = LinearRegression().fit(x_train, y_train)
y_pred = model.predict(x_test)

# --- Metrics on the held-out rows -------------------------------------------
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("Mean Squared Error:", mse)
print("R-squared Score:", r2)

# Side-by-side table of true and predicted prices.
result = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
print(result)

# --- Actual vs predicted scatter --------------------------------------------
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(y_test, y_pred, color='blue', alpha=0.5)
# Red diagonal = perfect prediction (predicted == actual).
price_span = [y_test.min(), y_test.max()]
ax.plot(price_span, price_span, color='red', linewidth=2)
ax.set_xlabel('Actual Price')
ax.set_ylabel('Predicted Price')
ax.set_title('Actual vs Predicted Prices (Test Data)')
ax.grid(True)
plt.show()


**Decision Tree**

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import accuracy_score, classification_report, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# Local import: cross-validation is used to pick the pruning strength without
# touching the test set, and is not part of this cell's import block.
from sklearn.model_selection import cross_val_score

# Load your CSV file
df = pd.read_csv('titanic.csv')  # Make sure the path is correct

# Drop columns that are not used as features
df = df.drop(['age', 'embarked', 'class', 'who', 'deck', 'adult_male', 'embark_town', 'alive'], axis=1)

# Drop rows with missing values
df = df.dropna()

# Convert categorical variables to numeric.
# NOTE(review): this assumes each column has exactly two distinct values, so
# that drop_first=True leaves a single indicator column to assign back.
df['sex'] = pd.get_dummies(df['sex'], drop_first=True)
df['alone'] = pd.get_dummies(df['alone'], drop_first=True)

# Features and target split
X = df.drop('survived', axis=1)
y = df['survived']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# plot_tree is given plain lists; a pandas Index is not accepted as
# feature_names by every scikit-learn release.
feature_names = list(X.columns)
class_names = ['Not Survived', 'Survived']


def show_tree(tree, title):
    """Draw a fitted decision tree on a large figure with the given title."""
    plt.figure(figsize=(20, 10))
    plot_tree(tree, feature_names=feature_names, class_names=class_names, filled=True)
    plt.title(title)
    plt.show()


# 1. No pruning
m1 = DecisionTreeClassifier(random_state=42)
m1.fit(X_train, y_train)
y1_pred = m1.predict(X_test)
print("Accuracy Score (No Pruning):", accuracy_score(y_test, y1_pred))
show_tree(m1, "Decision Tree without Pruning")

# 2. Pre-pruning
m2 = DecisionTreeClassifier(max_depth=5, min_samples_split=10, random_state=42)
m2.fit(X_train, y_train)
y2_pred = m2.predict(X_test)
print("Accuracy Score (Pre-Pruning):", accuracy_score(y_test, y2_pred))
show_tree(m2, "Decision Tree with Pre-Pruning")

# 3. Post-pruning
m3 = DecisionTreeClassifier(random_state=42)
m3.fit(X_train, y_train)
path = m3.cost_complexity_pruning_path(X_train, y_train)
# Clip at zero and de-duplicate the candidates: ccp_alpha must be non-negative,
# and round-off in the pruning path can reportedly yield tiny negative values.
ccp_alphas = np.unique(np.clip(path.ccp_alphas, 0.0, None))
# Score every candidate alpha with 5-fold cross-validation on the TRAINING
# split only. Selecting alpha by test-set accuracy would leak the test set into
# model selection and make the reported post-pruning score optimistic.
cv_accuracies = [
    cross_val_score(
        DecisionTreeClassifier(random_state=42, ccp_alpha=alpha),
        X_train, y_train, cv=5, scoring='accuracy',
    ).mean()
    for alpha in ccp_alphas
]
# argmax returns the first maximum, i.e. the smallest alpha among ties.
best_alpha = ccp_alphas[int(np.argmax(cv_accuracies))]
best_model = DecisionTreeClassifier(random_state=42, ccp_alpha=best_alpha)
best_model.fit(X_train, y_train)
y3_pred = best_model.predict(X_test)
print("Accuracy Score (Post-Pruning):", accuracy_score(y_test, y3_pred))
show_tree(best_model, "Decision Tree with Post-Pruning")

# (label, test-set predictions, colour map) for each of the three variants
evaluations = [
    ("No Pruning", y1_pred, 'Blues'),
    ("Pre-Pruning", y2_pred, 'Purples'),
    ("Post-Pruning", y3_pred, 'Reds'),
]

# Classification reports
for label, predictions, _ in evaluations:
    print(f"\nClassification Report ({label}):\n", classification_report(y_test, predictions))

# Confusion Matrices
for label, predictions, cmap in evaluations:
    ConfusionMatrixDisplay.from_predictions(y_test, predictions, cmap=cmap)
    plt.title(f"Confusion Matrix ({label})")
    plt.show()


**Naive Bayes**

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay

# Load dataset
df = pd.read_csv('C:/Users/ADMIN/Desktop/ML/adult.csv')

# Clean data: treat a lone "?" (with or without surrounding whitespace) as a
# missing value, then drop incomplete rows. Matching only ' ?' would leave an
# unpadded '?' in place as if it were a real category.
df = df.replace(r'^\s*\?\s*$', np.nan, regex=True)
df = df.dropna().reset_index(drop=True)

# Encode categorical features.
# One encoder is kept per column so every mapping can be inspected or inverted
# later. Columns are selected as "not numeric" rather than by dtype == 'object'
# so that string-typed columns are encoded regardless of how pandas stores them.
encoders = {}
for column in df.select_dtypes(exclude='number').columns:
    encoders[column] = LabelEncoder()
    df[column] = encoders[column].fit_transform(df[column])

# Split features and target
X = df.drop('income', axis=1)
y = df['income']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Naive Bayes model
model = GaussianNB()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
y_pred_train = model.predict(X_train)

# Evaluation
print("Accuracy Score (Test):", accuracy_score(y_test, y_pred))
print("Accuracy Score (Train):", accuracy_score(y_train, y_pred_train))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Tick labels come from the fitted income encoder, so they always follow the
# actual 0/1 code order instead of relying on a hard-coded assumption.
if 'income' in encoders:
    labels = [str(name).strip() for name in encoders['income'].classes_]
else:
    # 'income' was already numeric in the file; fall back to its raw values.
    labels = [str(value) for value in np.unique(y)]

# Confusion Matrix
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(8, 5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=labels, yticklabels=labels)
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix")
plt.show()

# Same matrix rendered with scikit-learn's own display helper
ConfusionMatrixDisplay.from_predictions(y_test, y_pred, display_labels=labels, cmap='Reds')
plt.title("Confusion Matrix")
plt.show()


**K-Means**

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Set Seaborn style
sns.set(style="whitegrid")

# Number of k-means restarts. Pinned explicitly because the library default for
# n_init differs between scikit-learn releases, which would otherwise change
# the fitted clusters (and emit a FutureWarning on some versions).
N_INIT = 10

# Load the dataset
df = pd.read_csv('C:/Users/ADMIN/Desktop/ML/Mall_Customers.csv')

# Data overview
print("First few rows of the dataset:")
print(df.head())

print("\nDataset Info:")
# info() writes its report itself and returns None; wrapping it in print()
# would add a stray "None" line to the output.
df.info()

print("\nSummary Statistics:")
print(df.describe())

print("\nMissing Values in each column:")
print(df.isnull().sum())

# Visualize feature distributions
plt.figure(figsize=(10, 6))
sns.histplot(data=df, kde=True)
plt.title("Feature Distributions")
plt.show()

# Encode 'Genre' column (Male/Female)
# NOTE: 'Genre' is not among the clustering features selected below.
label_encoder = LabelEncoder()
df['Genre'] = label_encoder.fit_transform(df['Genre'])

# Select features for clustering
X = df[['Annual Income (k$)', 'Spending Score (1-100)']]

# Scatter plot of raw data
plt.figure(figsize=(8, 5))
plt.scatter(X.iloc[:, 0], X.iloc[:, 1], s=50)
plt.title("Raw Data Points (Unlabeled)")
plt.xlabel("Annual Income (k$)")
plt.ylabel("Spending Score (1-100)")
plt.show()

# Feature scaling (k-means is distance based, so both features are standardised)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Elbow method to find optimal K
inertia = []
K_range = range(1, 11)
for k in K_range:
    kmeans = KMeans(n_clusters=k, n_init=N_INIT, random_state=42)
    kmeans.fit(X_scaled)
    inertia.append(kmeans.inertia_)

plt.figure(figsize=(8, 5))
plt.plot(K_range, inertia, 'bo-')
plt.title("Elbow Method For Optimal K")
plt.xlabel("Number of Clusters (K)")
plt.ylabel("Inertia (Within-Cluster Sum of Squares)")
plt.xticks(K_range)
plt.show()

# Optimal number of clusters selected based on elbow method
optimal_k = 5  # because after k=5, there's no significant drop in inertia

# Apply KMeans
kmeans = KMeans(n_clusters=optimal_k, n_init=N_INIT, random_state=42)
y_kmeans = kmeans.fit_predict(X_scaled)

# Visualize Clusters
plt.figure(figsize=(8, 5))
plt.scatter(X_scaled[:, 0], X_scaled[:, 1], c=y_kmeans, cmap='viridis', s=50)
centers = kmeans.cluster_centers_
plt.scatter(centers[:, 0], centers[:, 1], c='red', s=200, alpha=0.75, marker='X', label='Centroids')
plt.title("K-Means Clustering Result")
plt.xlabel("Annual Income (scaled)")
plt.ylabel("Spending Score (scaled)")
plt.legend()
plt.show()

# Results
print("Cluster Centers (in scaled space):")
print(centers)

# The same centroids mapped back through the scaler, so they can be read in
# the units of the two selected feature columns.
print("\nCluster Centers (original units):")
print(pd.DataFrame(scaler.inverse_transform(centers), columns=X.columns))

print("\nFirst 10 Predicted Cluster Labels:")
print(y_kmeans[:10])


**ANN**

In [None]:
# 1. Import Required Libraries
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from google.colab import files

import os  # local import: only needed for the upload guard below

# Seed NumPy and TensorFlow so repeated runs are more reproducible.
np.random.seed(0)
tf.random.set_seed(0)

# 2. Upload and Load Dataset
DATA_FILE = 'Churn_Modelling.csv'
# Only prompt for an upload when the file is not already present, so that
# re-running the cell does not ask for the same file again.
if not os.path.exists(DATA_FILE):
    uploaded = files.upload()
data = pd.read_csv(DATA_FILE)

# 3. Initial Data Exploration
print(data.head())
print("\nData Info:")
# info() prints its own report and returns None; print(data.info()) would add
# a stray "None" line.
data.info()
print("\nStatistical Summary:")
print(data.describe())
print("\nMissing Values Check:")
print(data.isnull().sum())

# 4. Feature Selection
# .copy() makes X an independent frame; assigning into a bare slice of `data`
# triggers pandas' SettingWithCopyWarning.
X = data.iloc[:, 3:-1].copy()  # Exclude RowNumber, CustomerId, Surname, and Exited (target)
y = data.iloc[:, -1]           # Exited column is the target

# 5. Encode Categorical Variables
le = LabelEncoder()
X['Gender'] = le.fit_transform(X['Gender'])  # Convert Male/Female to 1/0
X = pd.get_dummies(X, columns=['Geography'], drop_first=True)  # One-hot encoding for Geography

# 6. Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# 7. Feature Scaling (fit on the training split only, then applied to the test split)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# 8. Build the ANN
# An explicit Input layer declares the feature count instead of the
# input_shape argument on the first Dense layer.
ann = tf.keras.models.Sequential([
    tf.keras.Input(shape=(X_train.shape[1],)),
    tf.keras.layers.Dense(units=6, activation='relu'),
    tf.keras.layers.Dense(units=6, activation='relu'),
    tf.keras.layers.Dense(units=1, activation='sigmoid'),  # Output layer for binary classification
])

# 9. Compile the ANN
ann.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# 10. Train the ANN
# NOTE(review): the test split doubles as validation data here. It is only
# monitored per epoch; do not tune on these numbers, or the final test score
# stops being an unbiased estimate.
history = ann.fit(X_train, y_train, batch_size=32, epochs=50, validation_data=(X_test, y_test))

# 11. Make Predictions
# Threshold the sigmoid output at 0.5 and flatten the (n, 1) result to a 1-D
# vector of 0/1 labels, matching the shape and type of y_test.
y_pred = (ann.predict(X_test) > 0.5).astype(int).ravel()

# 12. Evaluate the Model
print("\nAccuracy Score:", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


**Bagging vs Boosting**

In [None]:
# 1. Import Required Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

# 2. Load the Data
iris = load_iris()
X = pd.DataFrame(iris.data, columns=iris.feature_names)
y = pd.Series(iris.target, name='species')

print("\n🔍 Dataset Preview:")
print(X.head())

print("\n📊 Data Info:")
# info() prints its own report and returns None; print(X.info()) would add a
# stray "None" line.
X.info()

print("\n📈 Class Distribution:")
print(y.value_counts())

# 3. Split Data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42)

# 4. Bagging - Random Forest
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)

# 5. Boosting - AdaBoost (depth-1 trees as the weak learners)
ada_model = AdaBoostClassifier(
    estimator=DecisionTreeClassifier(max_depth=1),
    n_estimators=50, random_state=42
)
ada_model.fit(X_train, y_train)
y_pred_ada = ada_model.predict(X_test)


# 6. Evaluation
def report_model(heading, predictions, cmap, title):
    """Print accuracy and the classification report, then plot the confusion matrix.

    Args:
        heading: line printed above the metrics.
        predictions: predicted labels for X_test.
        cmap: colour map name for the heatmap.
        title: title of the confusion-matrix figure.

    Returns:
        (accuracy, confusion_matrix) computed against y_test.
    """
    accuracy = accuracy_score(y_test, predictions)
    print(heading)
    print("Accuracy:", accuracy)
    print(classification_report(y_test, predictions))

    matrix = confusion_matrix(y_test, predictions)
    plt.figure(figsize=(6, 4))
    sns.heatmap(matrix, annot=True, fmt='d', cmap=cmap,
                xticklabels=iris.target_names, yticklabels=iris.target_names)
    plt.title(title)
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.show()
    return accuracy, matrix


# 📊 Random Forest (Bagging)
acc_rf, cm_rf = report_model(
    "\n🌲 Random Forest (Bagging) Results:", y_pred_rf,
    "Blues", "Random Forest - Confusion Matrix")

# ⚡ AdaBoost (Boosting)
acc_ada, cm_ada = report_model(
    "\n⚡ AdaBoost (Boosting) Results:", y_pred_ada,
    "Greens", "AdaBoost - Confusion Matrix")

# 7. Conclusion
print("\n🔚 Conclusion:")
print(f"✅ Random Forest Accuracy: {acc_rf * 100:.2f}%")
print(f"✅ AdaBoost Accuracy     : {acc_ada * 100:.2f}%")

# A tie gets its own message: a plain if/else would announce that Boosting
# "performed slightly better" even when both accuracies are identical.
if acc_rf > acc_ada:
    print("🎯 Bagging (Random Forest) performed slightly better on the Iris dataset.")
elif acc_ada > acc_rf:
    print("🎯 Boosting (AdaBoost) performed slightly better on the Iris dataset.")
else:
    print("🎯 Bagging (Random Forest) and Boosting (AdaBoost) performed equally well on the Iris dataset.")


**Random Forest**

In [None]:
# 1. Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# 2. Load Dataset
# The two placeholders below are the only values that must be edited to run
# this template on a real dataset.
DATA_PATH = 'your_file.csv'            # Replace with your actual file, e.g. 'Churn_Modelling.csv'
TARGET_COLUMN = 'target_column_name'   # Replace with the actual target column name

data = pd.read_csv(DATA_PATH)
print("\n📋 Dataset Head:")
print(data.head())

# 3. EDA - Check for missing values
print("\n🔍 Data Info:")
# info() prints its own report and returns None; print(data.info()) would add
# a stray "None" line.
data.info()

print("\n🧼 Missing Values:")
print(data.isnull().sum())

# 4. Define Features (X) and Target (y)
X = data.drop(TARGET_COLUMN, axis=1)
y = data[TARGET_COLUMN]

# 5. Handle Categorical Features
# Convert object or category columns to integer codes (a fresh encoder per column)
for col in X.select_dtypes(include=['object', 'category']).columns:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])

# Column names are captured here because scaling below turns the splits into
# plain arrays without column labels.
feature_names = X.columns

# 6. Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42)

# 7. Feature Scaling (Optional for tree-based models, but useful if combining with other models)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# 8. Train Random Forest
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# 9. Predict
y_pred = model.predict(X_test)

# 10. Evaluate Model
print("\n✅ Accuracy:", accuracy_score(y_test, y_pred))
print("\n📉 Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\n📋 Classification Report:\n", classification_report(y_test, y_pred))

# 11. Feature Importance (Optional)
importances = model.feature_importances_
importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
importance_df = importance_df.sort_values(by='Importance', ascending=False)

plt.figure(figsize=(10, 6))
sns.barplot(x='Importance', y='Feature', data=importance_df)
plt.title('Feature Importances from Random Forest')
plt.show()
