<a href="https://colab.research.google.com/github/sarveshgadkari/ML/blob/main/Untitled9.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#Linear
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# --------------------------
# STEP 1: Load Data
# --------------------------
# The CSV location is requested interactively and read into a DataFrame.
file_path = input("Enter the path to your CSV file: ")
df = pd.read_csv(file_path)
print("\n✅ Dataset Loaded Successfully!")

# --------------------------
# STEP 2: Basic EDA
# --------------------------
print("\n--- Dataset Info ---")
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line.
df.info()

print("\n--- First 5 Rows ---")
print(df.head())

print("\n--- Summary Statistics ---")
print(df.describe())

print("\n--- Missing Values ---")
print(df.isnull().sum())

print("\n--- Data Types ---")
print(df.dtypes)

# --------------------------
# STEP 3: Visual EDA
# --------------------------
print("\n📊 Generating EDA plots...")
numerical_cols = df.select_dtypes(include=np.number).columns.tolist()

if numerical_cols:
    # Histograms: one panel per numeric column.
    df[numerical_cols].hist(figsize=(15, 10), bins=20)
    plt.suptitle("Histograms of Numerical Features")
    plt.show()

    # Correlation Heatmap: pairwise correlation of the numeric columns.
    plt.figure(figsize=(12, 8))
    sns.heatmap(df[numerical_cols].corr(), annot=True, cmap='coolwarm')
    plt.title("Correlation Heatmap")
    plt.show()
else:
    # Both plots need numeric data; skip them instead of raising.
    print("No numerical columns found; skipping EDA plots.")

# --------------------------
# STEP 4: Preprocessing
# --------------------------
# Drop rows with missing values first (optional: imputation). This has to
# happen before encoding: LabelEncoder cannot handle NaN mixed with strings,
# and get_dummies would silently turn a missing category into an all-zero row
# that a later dropna() no longer detects.
df = df.dropna()

# Encode categorical variables
cat_cols = df.select_dtypes(include='object').columns
le = LabelEncoder()
for col in cat_cols:
    if df[col].nunique() == 2:
        df[col] = le.fit_transform(df[col])  # Binary encoding
    else:
        df = pd.get_dummies(df, columns=[col], drop_first=True)  # One-hot encoding

# --------------------------
# STEP 5: Feature Selection
# --------------------------
target_col = input("\nEnter the name of the target column: ")
if target_col not in df.columns:
    # Same exception type a failed column lookup would raise, but with the
    # available names listed (one-hot encoding may have renamed columns).
    raise KeyError(
        f"Target column {target_col!r} not found. "
        f"Available columns: {list(df.columns)}"
    )
X = df.drop(columns=[target_col])
y = df[target_col]

# --------------------------
# STEP 6: Train-Test Split
# --------------------------
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scaling: the scaler is fitted on the training rows only and then applied to
# the test rows, so test-set statistics do not leak into the model.
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

# --------------------------
# STEP 7: Train Model
# --------------------------
# fit() returns the estimator itself, so construction and training are chained.
model = LinearRegression().fit(x_train, y_train)

# --------------------------
# STEP 8: Predict & Evaluate
# --------------------------
# Score the held-out split with mean squared error and R-squared.
y_pred = model.predict(x_test)
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

print("\n📈 Model Evaluation:")
for metric_label, metric_value in (("Mean Squared Error:", mse), ("R-squared Score:", r2)):
    print(metric_label, metric_value)

# --------------------------
# STEP 9: Results Visualization
# --------------------------
# Side-by-side table of observed and predicted targets for the test split.
result = pd.DataFrame(dict(Actual=y_test, Predicted=y_pred))
print("\n--- Prediction Results ---")
print(result.head())

# Scatter Plot: points on the dashed red diagonal are perfect predictions.
actual_min, actual_max = y_test.min(), y_test.max()
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(y_test, y_pred, alpha=0.6, color='teal')
ax.plot([actual_min, actual_max], [actual_min, actual_max], color='red', linestyle='--')
ax.set_xlabel('Actual')
ax.set_ylabel('Predicted')
ax.set_title('Actual vs Predicted Values')
ax.grid(True)
plt.show()


In [None]:
#DecisionTree
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import accuracy_score, classification_report, ConfusionMatrixDisplay
import warnings
warnings.filterwarnings('ignore')

# --------------------------
# STEP 1: Load Data
# --------------------------
# The CSV location is requested interactively and read into a DataFrame.
file_path = input("Enter the path to your CSV file: ")
df = pd.read_csv(file_path)
print("\n✅ Dataset Loaded Successfully!")

# --------------------------
# STEP 2: Basic EDA
# --------------------------
print("\n--- Dataset Info ---")
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line.
df.info()

print("\n--- First 5 Rows ---")
print(df.head())

print("\n--- Summary Statistics ---")
print(df.describe())

print("\n--- Missing Values ---")
print(df.isnull().sum())

print("\n--- Data Types ---")
print(df.dtypes)

# --------------------------
# STEP 3: Visual EDA
# --------------------------
print("\n📊 Generating EDA plots...")
categorical_cols = df.select_dtypes(include='object').columns
numerical_cols = df.select_dtypes(include=np.number).columns.tolist()

if numerical_cols:
    # Histograms for numerical features
    df[numerical_cols].hist(figsize=(15, 10), bins=20)
    plt.suptitle("Histograms of Numerical Features")
    plt.show()

    # Correlation heatmap for numerical features
    plt.figure(figsize=(12, 8))
    sns.heatmap(df[numerical_cols].corr(), annot=True, cmap='coolwarm')
    plt.title("Correlation Heatmap")
    plt.show()
else:
    # Both plots need numeric data; skip them instead of raising.
    print("No numerical columns found; skipping EDA plots.")

# --------------------------
# STEP 4: Preprocessing
# --------------------------
# Drop missing values first (or handle with imputation). Doing this before
# encoding keeps get_dummies from silently turning a missing category into an
# all-zero row that dropna() can no longer detect.
df = df.dropna()

# --------------------------
# STEP 5: Feature Selection
# --------------------------
target_col = input("\nEnter the name of the target column: ")
if target_col not in df.columns:
    raise KeyError(
        f"Target column {target_col!r} not found. "
        f"Available columns: {list(df.columns)}"
    )

# The target is separated BEFORE one-hot encoding. Encoding the whole frame
# would split a categorical target into renamed dummy columns, so the lookup
# by its original name would fail. DecisionTreeClassifier accepts string
# class labels directly.
y = df[target_col]

# One-hot encode the categorical feature columns only.
X = pd.get_dummies(df.drop(columns=[target_col]), drop_first=True)

# --------------------------
# STEP 6: Train-Test Split
# --------------------------
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# --------------------------
# STEP 7: Train Decision Tree Models
# --------------------------

# 1. No Pruning
# Fully grown tree: no depth limit and no pruning penalty.
model_1 = DecisionTreeClassifier(random_state=42)
model_1.fit(x_train, y_train)
y_pred_1 = model_1.predict(x_test)
print("\nAccuracy (No Pruning):", accuracy_score(y_test, y_pred_1))
plt.figure(figsize=(20, 10))
# plot_tree expects class_names in ascending class order, which is the order
# of model.classes_. y.unique() is in order of first appearance and could
# label the leaves with the wrong class. feature_names is passed as a plain
# list because some scikit-learn releases reject a pandas Index.
plot_tree(
    model_1,
    feature_names=list(X.columns),
    class_names=[str(c) for c in model_1.classes_],
    filled=True,
)
plt.title("Decision Tree (No Pruning)")
plt.show()

# 2. Pre-Pruning
# Growth is limited up front: at most 5 levels, and a node needs at least
# 10 samples before it may be split.
model_2 = DecisionTreeClassifier(max_depth=5, min_samples_split=10, random_state=42)
model_2.fit(x_train, y_train)
y_pred_2 = model_2.predict(x_test)
print("\nAccuracy (Pre-Pruning):", accuracy_score(y_test, y_pred_2))
plt.figure(figsize=(20, 10))
# plot_tree expects class_names in ascending class order, which is the order
# of model.classes_. y.unique() is in order of first appearance and could
# label the leaves with the wrong class. feature_names is passed as a plain
# list because some scikit-learn releases reject a pandas Index.
plot_tree(
    model_2,
    feature_names=list(X.columns),
    class_names=[str(c) for c in model_2.classes_],
    filled=True,
)
plt.title("Decision Tree (Pre-Pruning)")
plt.show()

# 3. Post-Pruning
# Candidate pruning strengths come from the cost-complexity pruning path of
# the training data.
model_3 = DecisionTreeClassifier(random_state=42)
model_3.fit(x_train, y_train)
path = model_3.cost_complexity_pruning_path(x_train, y_train)
# Floating-point error can yield tiny negative alphas, which the estimator
# rejects; clip them to zero and drop resulting duplicates.
ccp_alphas = np.unique(np.clip(path.ccp_alphas, 0.0, None))

# Choose alpha on a validation split carved out of the TRAINING data. Picking
# it by test-set accuracy would tune the model on the test set and make the
# reported test accuracy optimistically biased.
x_fit, x_val, y_fit, y_val = train_test_split(
    x_train, y_train, test_size=0.25, random_state=42
)
models = [
    DecisionTreeClassifier(random_state=42, ccp_alpha=alpha).fit(x_fit, y_fit)
    for alpha in ccp_alphas
]
accuracies = [accuracy_score(y_val, candidate.predict(x_val)) for candidate in models]
best_alpha = ccp_alphas[np.argmax(accuracies)]

# Refit on the full training split with the selected alpha, then score once
# on the untouched test split.
best_model = DecisionTreeClassifier(random_state=42, ccp_alpha=best_alpha)
best_model.fit(x_train, y_train)
y_pred_3 = best_model.predict(x_test)
print("\nAccuracy (Post-Pruning):", accuracy_score(y_test, y_pred_3))
plt.figure(figsize=(20, 10))
# class_names must follow ascending class order (best_model.classes_), not
# the appearance order returned by y.unique().
plot_tree(
    best_model,
    feature_names=list(X.columns),
    class_names=[str(c) for c in best_model.classes_],
    filled=True,
)
plt.title("Decision Tree (Post-Pruning)")
plt.show()

# --------------------------
# STEP 8: Classification Report and Confusion Matrix
# --------------------------

# Classification Reports: each entry pairs a heading with the predictions
# of one of the three trees.
report_runs = (
    ("\nClassification Report (No Pruning):\n", y_pred_1),
    ("\nClassification Report (Pre-Pruning):\n", y_pred_2),
    ("\nClassification Report (Post-Pruning):\n", y_pred_3),
)
for heading, predictions in report_runs:
    print(heading, classification_report(y_test, predictions))

# Confusion Matrices: one figure per tree, each with its own colour map.
matrix_runs = (
    (y_pred_1, 'Blues', "Confusion Matrix (No Pruning)"),
    (y_pred_2, 'Purples', "Confusion Matrix (Pre-Pruning)"),
    (y_pred_3, 'Reds', "Confusion Matrix (Post-Pruning)"),
)
for predictions, colormap, figure_title in matrix_runs:
    ConfusionMatrixDisplay.from_predictions(y_test, predictions, cmap=colormap)
    plt.title(figure_title)
    plt.show()


In [None]:

#Naive Bayes


import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.utils import resample

# Load dataset
df = pd.read_csv('C:/Users/ADMIN/Desktop/ML/adult.csv')

# Clean data: treat '?' as missing and drop incomplete rows. The pattern
# accepts optional surrounding whitespace, so both ' ?' and '?' are matched
# instead of only the exact string ' ?'.
df = df.replace(r'^\s*\?\s*$', np.nan, regex=True)
df = df.dropna().reset_index(drop=True)

# Encode categorical features: every text column is replaced by integer
# codes assigned by LabelEncoder.
le = LabelEncoder()
for column in df.select_dtypes(include='object').columns:
    df[column] = le.fit_transform(df[column])

# Split features and target
X = df.drop('income', axis=1)
y = df['income']

# Train-test split FIRST (stratified, so both splits keep the class ratio).
# Resampling before the split would put copies of the same row into both
# train and test and inflate the test score.
X_train_raw, X_test, y_train_raw, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Handle class imbalance on the TRAINING rows only: every class smaller than
# the largest one is upsampled (with replacement) to the largest class size.
# Classes are detected from the data instead of assuming which label is the
# minority.
class_counts = y_train_raw.value_counts()
largest_class_size = int(class_counts.max())
balanced_X_parts, balanced_y_parts = [], []
for class_label, class_size in class_counts.items():
    class_mask = y_train_raw == class_label
    X_class, y_class = X_train_raw[class_mask], y_train_raw[class_mask]
    if class_size < largest_class_size:
        X_class, y_class = resample(X_class, y_class,
                                    replace=True,
                                    n_samples=largest_class_size,
                                    random_state=42)
    balanced_X_parts.append(X_class)
    balanced_y_parts.append(y_class)
X_balanced = np.vstack(balanced_X_parts)
y_balanced = np.hstack(balanced_y_parts)

# The model trains on the balanced rows and is scored on the untouched test
# rows; everything is handed on as NumPy arrays.
X_train, y_train = X_balanced, y_balanced
X_test, y_test = X_test.to_numpy(), y_test.to_numpy()

# Train Naive Bayes model
# fit() returns the estimator, so construction and training are chained;
# predictions are produced for both splits to compare fit and generalisation.
model = GaussianNB().fit(X_train, y_train)
y_pred, y_pred_train = model.predict(X_test), model.predict(X_train)

# Evaluation
for accuracy_label, truth, predicted in (
    ("Accuracy Score (Test):", y_test, y_pred),
    ("Accuracy Score (Train):", y_train, y_pred_train),
):
    print(accuracy_label, accuracy_score(truth, predicted))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Confusion Matrix
# Counts of actual (rows) versus predicted (columns) classes, drawn as an
# annotated heatmap.
cm = confusion_matrix(y_test, y_pred)
labels = ['<=50K', '>50K']
fig, ax = plt.subplots(figsize=(8, 5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=labels, yticklabels=labels, ax=ax)
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
ax.set_title("Confusion Matrix")
plt.show()

# Alternative Confusion Matrix Display
# The same matrix rendered through scikit-learn's own display helper.
ConfusionMatrixDisplay.from_predictions(y_test, y_pred, cmap='Reds')
plt.title("Confusion Matrix")
plt.show()


In [None]:
#Kmean
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Set Seaborn style
sns.set(style="whitegrid")

# Load the dataset
df = pd.read_csv('C:/Users/ADMIN/Desktop/ML/Mall_Customers.csv')

# Data overview
print("First few rows of the dataset:")
print(df.head())

print("\nDataset Info:")
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line.
df.info()

print("\nSummary Statistics:")
print(df.describe())

print("\nMissing Values in each column:")
print(df.isnull().sum())

# Visualize feature distributions
plt.figure(figsize=(10, 6))
sns.histplot(data=df, kde=True)
plt.title("Feature Distributions")
plt.show()

# Encode 'Genre' column (Male/Female) as integer codes.
label_encoder = LabelEncoder()
df['Genre'] = label_encoder.fit_transform(df['Genre'])

# Select features for clustering: only income and spending score are used.
X = df.loc[:, ['Annual Income (k$)', 'Spending Score (1-100)']]

# Scatter plot of raw data
fig, ax = plt.subplots(figsize=(8, 5))
ax.scatter(X['Annual Income (k$)'], X['Spending Score (1-100)'], s=50)
ax.set_title("Raw Data Points (Unlabeled)")
ax.set_xlabel("Annual Income (k$)")
ax.set_ylabel("Spending Score (1-100)")
plt.show()

# Feature scaling: standardise both columns before clustering.
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

# Elbow method to find optimal K
# Fit K-Means for K = 1..10 and record the inertia (within-cluster sum of
# squares) of each fit.
inertia = []
K_range = range(1, 11)
for k in K_range:
    # n_init is set explicitly: its default differs between scikit-learn
    # releases, which would otherwise change results and emit a FutureWarning.
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    kmeans.fit(X_scaled)
    inertia.append(kmeans.inertia_)

plt.figure(figsize=(8, 5))
plt.plot(K_range, inertia, 'bo-')
plt.title("Elbow Method For Optimal K")
plt.xlabel("Number of Clusters (K)")
plt.ylabel("Inertia (Within-Cluster Sum of Squares)")
plt.xticks(K_range)
plt.show()

# Optimal number of clusters selected based on elbow method
optimal_k = 5  # because after k=5, there's no significant drop in inertia

# Apply KMeans
# n_init is set explicitly: its default differs between scikit-learn
# releases, which would otherwise change results and emit a FutureWarning.
kmeans = KMeans(n_clusters=optimal_k, n_init=10, random_state=42)
y_kmeans = kmeans.fit_predict(X_scaled)

# Visualize Clusters: points coloured by assigned cluster, centroids as red X.
plt.figure(figsize=(8, 5))
plt.scatter(X_scaled[:, 0], X_scaled[:, 1], c=y_kmeans, cmap='viridis', s=50)
centers = kmeans.cluster_centers_
plt.scatter(centers[:, 0], centers[:, 1], c='red', s=200, alpha=0.75, marker='X', label='Centroids')
plt.title("K-Means Clustering Result")
plt.xlabel("Annual Income (scaled)")
plt.ylabel("Spending Score (scaled)")
plt.legend()
plt.show()

# Results
print("Cluster Centers (in scaled space):")
print(centers)

print("\nFirst 10 Predicted Cluster Labels:")
print(y_kmeans[:10])


In [None]:
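#ANN (NOTE: placeholder - the code in this cell is a copy of the K-Means cell above; no neural network is implemented yet)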
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Set Seaborn style
sns.set(style="whitegrid")

# Load the dataset
df = pd.read_csv('C:/Users/ADMIN/Desktop/ML/Mall_Customers.csv')

# Data overview
print("First few rows of the dataset:")
print(df.head())

print("\nDataset Info:")
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line.
df.info()

print("\nSummary Statistics:")
print(df.describe())

print("\nMissing Values in each column:")
print(df.isnull().sum())

# Visualize feature distributions
plt.figure(figsize=(10, 6))
sns.histplot(data=df, kde=True)
plt.title("Feature Distributions")
plt.show()

# Encode 'Genre' column (Male/Female) as integer codes.
label_encoder = LabelEncoder()
df['Genre'] = label_encoder.fit_transform(df['Genre'])

# Select features for clustering: only income and spending score are used.
X = df.loc[:, ['Annual Income (k$)', 'Spending Score (1-100)']]

# Scatter plot of raw data
fig, ax = plt.subplots(figsize=(8, 5))
ax.scatter(X['Annual Income (k$)'], X['Spending Score (1-100)'], s=50)
ax.set_title("Raw Data Points (Unlabeled)")
ax.set_xlabel("Annual Income (k$)")
ax.set_ylabel("Spending Score (1-100)")
plt.show()

# Feature scaling: standardise both columns before clustering.
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

# Elbow method to find optimal K
# Fit K-Means for K = 1..10 and record the inertia (within-cluster sum of
# squares) of each fit.
inertia = []
K_range = range(1, 11)
for k in K_range:
    # n_init is set explicitly: its default differs between scikit-learn
    # releases, which would otherwise change results and emit a FutureWarning.
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    kmeans.fit(X_scaled)
    inertia.append(kmeans.inertia_)

plt.figure(figsize=(8, 5))
plt.plot(K_range, inertia, 'bo-')
plt.title("Elbow Method For Optimal K")
plt.xlabel("Number of Clusters (K)")
plt.ylabel("Inertia (Within-Cluster Sum of Squares)")
plt.xticks(K_range)
plt.show()

# Optimal number of clusters selected based on elbow method
optimal_k = 5  # because after k=5, there's no significant drop in inertia

# Apply KMeans
# n_init is set explicitly: its default differs between scikit-learn
# releases, which would otherwise change results and emit a FutureWarning.
kmeans = KMeans(n_clusters=optimal_k, n_init=10, random_state=42)
y_kmeans = kmeans.fit_predict(X_scaled)

# Visualize Clusters: points coloured by assigned cluster, centroids as red X.
plt.figure(figsize=(8, 5))
plt.scatter(X_scaled[:, 0], X_scaled[:, 1], c=y_kmeans, cmap='viridis', s=50)
centers = kmeans.cluster_centers_
plt.scatter(centers[:, 0], centers[:, 1], c='red', s=200, alpha=0.75, marker='X', label='Centroids')
plt.title("K-Means Clustering Result")
plt.xlabel("Annual Income (scaled)")
plt.ylabel("Spending Score (scaled)")
plt.legend()
plt.show()

# Results
print("Cluster Centers (in scaled space):")
print(centers)

print("\nFirst 10 Predicted Cluster Labels:")
print(y_kmeans[:10])


In [None]:
#Random Forest
# Import Required Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import StratifiedKFold
import matplotlib.pyplot as plt
import seaborn as sns

# Function to Perform EDA (for any dataset)
def perform_eda(df):
    """Print a textual overview of *df* and show a histogram of its columns.

    Prints the first rows, the ``DataFrame.info()`` report, summary
    statistics and per-column missing-value counts, then displays a seaborn
    histogram (with KDE) of the frame.

    Args:
        df: The pandas DataFrame to inspect. It is not modified.
    """
    print("First few rows of the dataset:")
    print(df.head())
    print("\nDataset Info:")
    # info() prints its own report and returns None; wrapping it in print()
    # would append a stray "None" line.
    df.info()
    print("\nSummary Statistics:")
    print(df.describe())
    print("\nMissing Values Check:")
    print(df.isnull().sum())
    print("\nData Distribution Visualization:")
    plt.figure(figsize=(10, 6))
    sns.histplot(df, kde=True)
    plt.show()

# Function to Preprocess Data (handle categorical and numerical data)
def preprocess_data(df, target_col='target', test_size=0.3, random_state=42):
    """Encode, split and scale *df* for model training.

    Text columns are label-encoded, the frame is split into features and
    target, a train/test split is made, and the features are standardised
    with a scaler fitted on the training rows only.

    Args:
        df: Input pandas DataFrame. A copy is processed, so the caller's
            frame is left unchanged.
        target_col: Name of the target column. Defaults to ``'target'``.
        test_size: Fraction of rows held out for testing. Defaults to 0.3.
        random_state: Seed for the train/test split. Defaults to 42.

    Returns:
        A tuple ``(X_train, X_test, y_train, y_test)``; the feature parts
        are scaled NumPy arrays, the target parts keep their pandas type.

    Raises:
        KeyError: If *target_col* is not a column of *df*.
    """
    if target_col not in df.columns:
        raise KeyError(
            f"Target column {target_col!r} not found. "
            f"Available columns: {list(df.columns)}"
        )

    # Work on a copy so encoding does not overwrite the caller's data.
    df = df.copy()

    # Encode categorical columns using LabelEncoder (re-fitted per column).
    label_encoder = LabelEncoder()
    for col in df.select_dtypes(include=['object']).columns:
        df[col] = label_encoder.fit_transform(df[col])

    # Separate features and target
    X = df.drop(columns=[target_col])
    y = df[target_col]

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Feature scaling: fitted on the training rows only, then applied to the
    # test rows, so no test-set statistics leak into training.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    return X_train, X_test, y_train, y_test

# Function to Perform Random Forest with Hyperparameter Tuning
def random_forest_with_tuning(X_train, X_test, y_train, y_test):
    """Tune a random forest with randomized search and report test metrics.

    Runs ``RandomizedSearchCV`` (100 sampled candidates, 5-fold stratified
    cross-validation) over a fixed hyperparameter grid, prints the best
    parameters, then prints accuracy, confusion matrix and classification
    report of the best estimator on the test split.

    Args:
        X_train: Training features.
        X_test: Test features.
        y_train: Training labels.
        y_test: Test labels.

    Returns:
        The best estimator found by the search.
    """
    # Hyperparameter grid for Random Forest. 'auto' is deliberately absent
    # from max_features: for classifiers it meant the same as 'sqrt', and
    # scikit-learn 1.3 removed it, so every candidate using it would fail.
    param_dist = {
        'n_estimators': [50, 100, 150, 200],
        'max_features': ['sqrt', 'log2'],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'bootstrap': [True, False]
    }

    # Initialize RandomForestClassifier
    rf = RandomForestClassifier(random_state=42)

    # Randomized Search with Cross-Validation (StratifiedKFold)
    random_search = RandomizedSearchCV(estimator=rf, param_distributions=param_dist,
                                       n_iter=100, cv=StratifiedKFold(5),
                                       verbose=2, random_state=42, n_jobs=-1)

    # Fit the model
    random_search.fit(X_train, y_train)

    # Best hyperparameters
    print("Best Hyperparameters Found: ", random_search.best_params_)

    # Make predictions with the estimator refitted on the full training data
    best_estimator = random_search.best_estimator_
    y_pred = best_estimator.predict(X_test)

    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    print("\nAccuracy:", accuracy)
    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

    return best_estimator

# Main function to execute
def main(csv_path='C:/path/to/your/dataset.csv'):
    """Run the random-forest workflow: load, explore, preprocess, tune.

    Args:
        csv_path: Location of the CSV file to load. The default is a
            placeholder and must be replaced with a real dataset path.
    """
    # Load dataset
    df = pd.read_csv(csv_path)

    # Perform EDA
    perform_eda(df)

    # Preprocess data
    X_train, X_test, y_train, y_test = preprocess_data(df)

    # Perform Random Forest with Hyperparameter Tuning
    random_forest_with_tuning(X_train, X_test, y_train, y_test)

# Run the main function
if __name__ == "__main__":
    main()
