# Scikit-learn

### Modeling by Scikit-Learn

-   Regression Models

    ### **1️⃣ Linear Regression (California Housing Dataset)**

    💡 **Used for Predicting House Prices**

    ``` python
    import numpy as np
    import pandas as pd
    from sklearn.model_selection import train_test_split
    from sklearn.linear_model import LinearRegression
    from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
    from sklearn.datasets import fetch_california_housing
    from sklearn.preprocessing import StandardScaler

    # Pull the California Housing data as one DataFrame (features + target column)
    housing = fetch_california_housing(as_frame=True)
    df = housing.frame

    # Separate the predictors from the value we want to predict
    target_col = "MedHouseVal"
    y = df[target_col]
    X = df.drop(columns=[target_col])

    # Hold out 20% of the rows for evaluation (seeded for reproducibility)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Learn the scaling statistics from the training split only, then apply
    # them to both splits so no test-set information leaks into training
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Fit the linear model on the standardized training features
    model = LinearRegression().fit(X_train_scaled, y_train)

    # Predict on the held-out rows
    y_pred = model.predict(X_test_scaled)

    # Report error metrics; RMSE is derived from the MSE computed once
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    print("📊 Linear Regression Model Evaluation:")
    print("MAE:", mae)
    print("MSE:", mse)
    print("RMSE:", np.sqrt(mse))
    print("R² Score:", r2_score(y_test, y_pred))
    ```

    💡 **Using custom (synthetic) data**

    ``` python
    import numpy as np
    import matplotlib.pyplot as plt
    from sklearn.model_selection import train_test_split
    from sklearn.linear_model import LinearRegression
    from sklearn.metrics import mean_squared_error, r2_score

    # Tiny hand-made dataset: one feature, five observations
    X_reg = np.array([[1], [2], [3], [4], [5]])
    y_reg = np.array([1, 3, 2, 3, 5])

    # Split data. With only five rows, test_size=0.2 would leave a single test
    # sample, for which R² is undefined (scikit-learn returns NaN with a
    # warning); 0.4 holds out two rows so every metric below is computable.
    X_train_r, X_test_r, y_train_r, y_test_r = train_test_split(X_reg, y_reg, test_size=0.4, random_state=42)

    # Train model
    reg_model = LinearRegression()
    reg_model.fit(X_train_r, y_train_r)

    # Make predictions on the held-out rows
    y_pred_r = reg_model.predict(X_test_r)
    print("Linear Regression Predictions:", y_pred_r)

    # Evaluate model
    mse = mean_squared_error(y_test_r, y_pred_r)
    r2 = r2_score(y_test_r, y_pred_r)
    print(f"Mean Squared Error: {mse}")
    print(f"R^2 Score: {r2}")
    ```

    ### **Decision Tree Regression for California Housing**

    ``` python
    import numpy as np
    import pandas as pd
    from sklearn.model_selection import train_test_split
    from sklearn.tree import DecisionTreeRegressor
    from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
    from sklearn.datasets import fetch_california_housing

    # 📥 Load dataset as a single DataFrame (features + target column)
    data = fetch_california_housing(as_frame=True)
    df = data.frame

    # 🎯 Define features and target
    X = df.drop(columns=["MedHouseVal"])
    y = df["MedHouseVal"]

    # 🔀 Train-test split (80% train, 20% test)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # 🌳 Train Decision Tree Regression Model
    # max_depth=5 caps tree growth; random_state makes the fit reproducible.
    # The raw (unscaled) features are passed straight to the tree here.
    model = DecisionTreeRegressor(max_depth=5, random_state=42)
    model.fit(X_train, y_train)

    # 📈 Make Predictions on the held-out rows
    y_pred = model.predict(X_test)

    # 📊 Model Evaluation
    print("📊 Decision Tree Regression Model Evaluation:")
    print("MAE:", mean_absolute_error(y_test, y_pred))
    print("MSE:", mean_squared_error(y_test, y_pred))
    print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))
    print("R² Score:", r2_score(y_test, y_pred))
    ```

    ### **Decision Tree Regression with custom (synthetic) data**

    ``` python
    import numpy as np
    import matplotlib.pyplot as plt
    from sklearn.model_selection import train_test_split
    from sklearn.tree import DecisionTreeRegressor
    from sklearn.metrics import mean_squared_error, r2_score

    # 📌 Tiny hand-made dataset: one feature, five observations
    X_reg = np.array([[1], [2], [3], [4], [5]])  # Feature
    y_reg = np.array([1, 3, 2, 3, 5])  # Target

    # 🔀 Split data (60% train, 40% test). With only five rows, test_size=0.2
    # would leave a single test sample, for which R² is undefined (scikit-learn
    # returns NaN with a warning); holding out two rows keeps it computable.
    X_train_r, X_test_r, y_train_r, y_test_r = train_test_split(X_reg, y_reg, test_size=0.4, random_state=42)

    # 🌳 Train Decision Tree Regression Model
    tree_model = DecisionTreeRegressor(max_depth=3, random_state=42)
    tree_model.fit(X_train_r, y_train_r)

    # 📈 Make Predictions on the held-out rows
    y_pred_r = tree_model.predict(X_test_r)
    print("Decision Tree Predictions:", y_pred_r)

    # 📊 Evaluate model
    mse = mean_squared_error(y_test_r, y_pred_r)
    r2 = r2_score(y_test_r, y_pred_r)
    print(f"Mean Squared Error: {mse}")
    print(f"R^2 Score: {r2}")

    # 📊 Plot Decision Tree Predictions
    # Evaluate the tree on a dense grid spanning the feature range so the
    # fitted curve can be drawn. The array's own min()/max() return scalars,
    # unlike the built-ins, which iterate over the rows of the 2-D array.
    X_range = np.linspace(X_reg.min(), X_reg.max(), 100).reshape(-1, 1)
    y_range_pred = tree_model.predict(X_range)

    plt.scatter(X_reg, y_reg, color="blue", label="Actual Data")
    plt.plot(X_range, y_range_pred, color="red", linestyle="dashed", label="Decision Tree Prediction")
    plt.xlabel("Feature (X)")
    plt.ylabel("Target (y)")
    plt.title("Decision Tree Regression")
    plt.legend()
    plt.show()
    ```

-   Classification

    ### **Logistic Regression Classifier**

    ``` python
    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt
    import seaborn as sns
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import StandardScaler
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import (
        accuracy_score, confusion_matrix, classification_report, roc_curve, auc
    )

    # 1️⃣ Load the Iris data and collapse it to a binary problem
    from sklearn.datasets import load_iris
    iris = load_iris()
    df = pd.DataFrame(iris.data, columns=iris.feature_names)
    # Class 0 (Setosa) becomes the positive label 1; everything else is 0
    df['target'] = (iris.target == 0).astype(int)

    # 2️⃣ Separate features/target and hold out 20% of the rows for testing
    y = df['target']
    X = df.drop(columns=['target'])
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # 3️⃣ Standardize: statistics come from the training split only and are
    # then applied to both splits
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # 4️⃣ Fit the logistic regression classifier
    model = LogisticRegression().fit(X_train_scaled, y_train)

    # 5️⃣ Hard predictions plus the predicted probability of class 1
    y_pred = model.predict(X_test_scaled)
    y_prob = model.predict_proba(X_test_scaled)[:, 1]

    # 6️⃣ Evaluation
    accuracy = accuracy_score(y_test, y_pred)
    print(f"✅ Accuracy: {accuracy:.2f}")

    # Confusion matrix drawn as an annotated heatmap
    class_names = ["Non-Setosa", "Setosa"]
    cm = confusion_matrix(y_test, y_pred)
    plt.figure(figsize=(5, 4))
    sns.heatmap(
        cm,
        annot=True,
        fmt="d",
        cmap="Blues",
        xticklabels=class_names,
        yticklabels=class_names,
    )
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.title("Confusion Matrix")
    plt.show()

    # Per-class precision / recall / F1
    report = classification_report(y_test, y_pred, target_names=class_names)
    print("\n📌 Classification Report:\n", report)

    # 7️⃣ ROC curve built from the class-1 probabilities, with its AUC
    fpr, tpr, _ = roc_curve(y_test, y_prob)
    roc_auc = auc(fpr, tpr)

    plt.figure(figsize=(6, 4))
    plt.plot(fpr, tpr, color='blue', label=f'ROC curve (AUC = {roc_auc:.2f})')
    # Diagonal reference line
    plt.plot([0, 1], [0, 1], color='gray', linestyle='--')
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title("ROC Curve")
    plt.legend(loc="lower right")
    plt.show()
    ```

    ### **Decision Tree Classifier (Titanic)**

    ``` python
    import numpy as np
    import pandas as pd
    import seaborn as sns
    import matplotlib.pyplot as plt
    from sklearn.model_selection import train_test_split
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
    from sklearn.preprocessing import LabelEncoder

    # 📥 Load Titanic Dataset from Seaborn
    df = sns.load_dataset('titanic')

    # 🔍 Select Important Features
    # .copy() gives an independent frame, so the column assignments below
    # modify this frame rather than a view of the original.
    df = df[['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare']].copy()

    # 🛠️ Handle Missing Values: fill missing ages with the median age.
    # Assign the result back instead of calling fillna(inplace=True) on the
    # column; the in-place form on a selected column is deprecated chained
    # assignment and does not update the frame under pandas Copy-on-Write.
    df['age'] = df['age'].fillna(df['age'].median())

    # 🔄 Convert Categorical to Numeric (Sex: Male=1, Female=0)
    df['sex'] = LabelEncoder().fit_transform(df['sex'])

    # 🎯 Define Features (X) and Target (y)
    X = df.drop(columns=['survived'])
    y = df['survived']

    # 🔀 Train-Test Split (80% train, 20% test)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # 🌳 Train Decision Tree Classifier
    dt_model = DecisionTreeClassifier(max_depth=5, random_state=42)
    dt_model.fit(X_train, y_train)

    # 📈 Make Predictions on the held-out rows
    y_pred = dt_model.predict(X_test)

    # 📊 Model Evaluation
    print("📊 Decision Tree Model Evaluation:")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("\nClassification Report:\n", classification_report(y_test, y_pred))

    # 📉 Confusion Matrix
    sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, fmt="d", cmap="Blues")
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.title("Confusion Matrix - Decision Tree")
    plt.show()
    ```

    ### **🌲 Random Forest Classifier**

    ``` python
    import numpy as np
    import pandas as pd
    import seaborn as sns
    import matplotlib.pyplot as plt
    from sklearn.model_selection import train_test_split
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
    from sklearn.preprocessing import LabelEncoder

    # 📥 Load Titanic Dataset from Seaborn
    df = sns.load_dataset('titanic')

    # 🔍 Select Important Features
    # .copy() gives an independent frame, so the column assignments below
    # modify this frame rather than a view of the original.
    df = df[['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare']].copy()

    # 🛠️ Handle Missing Values: fill missing ages with the median age.
    # Assign the result back instead of calling fillna(inplace=True) on the
    # column; the in-place form on a selected column is deprecated chained
    # assignment and does not update the frame under pandas Copy-on-Write.
    df['age'] = df['age'].fillna(df['age'].median())

    # 🔄 Convert Categorical to Numeric (Sex: Male=1, Female=0)
    df['sex'] = LabelEncoder().fit_transform(df['sex'])

    # 🎯 Define Features (X) and Target (y)
    X = df.drop(columns=['survived'])
    y = df['survived']

    # 🔀 Train-Test Split (80% train, 20% test)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # 🌲 Train Random Forest Classifier (100 trees, each limited to depth 5)
    rf_model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42)
    rf_model.fit(X_train, y_train)

    # 📈 Make Predictions on the held-out rows
    y_pred_rf = rf_model.predict(X_test)

    # 📊 Model Evaluation
    print("📊 Random Forest Model Evaluation:")
    print("Accuracy:", accuracy_score(y_test, y_pred_rf))
    print("\nClassification Report:\n", classification_report(y_test, y_pred_rf))

    # 📉 Confusion Matrix
    sns.heatmap(confusion_matrix(y_test, y_pred_rf), annot=True, fmt="d", cmap="Blues")
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.title("Confusion Matrix - Random Forest")
    plt.show()
    ```

-   Clustering

    ### **K-Means Clustering with Evaluation**

    ``` python
    import numpy as np
    import pandas as pd
    import seaborn as sns
    import matplotlib.pyplot as plt
    from sklearn.cluster import KMeans
    from sklearn.preprocessing import StandardScaler
    from sklearn.metrics import silhouette_score, davies_bouldin_score
    from sklearn.datasets import load_iris

    # 📥 Load Iris Dataset (features only; the species labels are not used)
    iris = load_iris()
    df = pd.DataFrame(iris.data, columns=iris.feature_names)

    # 🔍 Standardize Data (K-Means is sensitive to scale)
    scaler = StandardScaler()
    df_scaled = scaler.fit_transform(df)

    # 🔀 Train K-Means Model (Choosing 3 clusters since Iris has 3 species)
    # n_init=10 restarts the algorithm from 10 initializations and keeps the best.
    kmeans = KMeans(n_clusters=3, random_state=42, n_init=10)
    # Cluster labels are stored on df only after df_scaled was computed, so the
    # label column is never part of the features being clustered.
    df['Cluster'] = kmeans.fit_predict(df_scaled)

    # 📈 Evaluation Metrics (computed on the standardized features)
    inertia = kmeans.inertia_  # WCSS
    silhouette = silhouette_score(df_scaled, df['Cluster'])
    davies_bouldin = davies_bouldin_score(df_scaled, df['Cluster'])

    print("📊 K-Means Clustering Evaluation Metrics:")
    print(f"🔹 Inertia (WCSS): {inertia:.2f}")
    print(f"🔹 Silhouette Score: {silhouette:.4f} (Closer to 1 is better)")
    print(f"🔹 Davies-Bouldin Score: {davies_bouldin:.4f} (Lower is better)")

    # 📊 Visualizing Clusters on the first two (unscaled) features
    plt.figure(figsize=(8, 6))
    sns.scatterplot(x=df.iloc[:, 0], y=df.iloc[:, 1], hue=df['Cluster'], palette='viridis')
    plt.title("K-Means Clustering on Iris Dataset")
    plt.xlabel(iris.feature_names[0])
    plt.ylabel(iris.feature_names[1])
    plt.legend(title="Cluster")
    plt.show()
    ```