### Week 8 Assignment  
**Name:** Sarah Lamond / sc8989  

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

# Load the glass data set and take a first look at it.
glass = pd.read_csv('glass.csv')
glass.head()
glass['Type'].value_counts().sort_index()

# Collapse the glass types into a binary label:
# types 1-3 -> 0, types 5-7 -> 1.
household_by_type = {glass_type: int(glass_type >= 5) for glass_type in (1, 2, 3, 5, 6, 7)}
glass['household'] = glass['Type'].map(household_by_type)
glass['household'].value_counts()

In [None]:
# Order the rows by the 'Al' column in place, so every later use of `glass`
# sees the sorted row order.
glass.sort_values('Al', inplace=True)

# Single-feature design matrix: scikit-learn expects a 2-D array, so the
# 'Al' values are reshaped into one column.
X = glass['Al'].to_numpy(copy=True).reshape(-1, 1)
y = glass['household']


**Q1. Try different thresholds for computing predictions using the 'Al' column; the default threshold is 0.5. Use the `predict_proba` function to compute probabilities, then apply custom thresholds and observe their impact on accuracy, precision, and recall.**

In [None]:
# Q1: fit a logistic regression on the current single-feature X / y and
# report how accuracy, precision and recall move as the decision threshold
# applied to the predicted probability changes.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LogisticRegression()
model.fit(X_train, y_train)
# predict_proba returns one column per class; column 1 is the probability
# of the positive class (label 1).
probs = model.predict_proba(X_test)[:, 1]

thresholds = [0.3, 0.4, 0.5, 0.6, 0.7]

for t in thresholds:
    # Predict the positive class whenever its probability reaches the threshold.
    preds = (probs >= t).astype(int)
    acc = accuracy_score(y_test, preds)
    # zero_division=0: a strict threshold can leave no positive predictions,
    # which makes precision undefined. Report 0 explicitly instead of relying
    # on the default, which returns 0 but also emits an UndefinedMetricWarning.
    prec = precision_score(y_test, preds, zero_division=0)
    rec = recall_score(y_test, preds, zero_division=0)
    print(f"Threshold: {t:.2f}, Accuracy: {acc:.2f}, Precision: {prec:.2f}, Recall: {rec:.2f}")

**Q2. Repeat the same analysis for the other columns.**

In [None]:
# Q2: repeat the threshold analysis, fitting one single-feature logistic
# regression per remaining column and sweeping the same thresholds.
elements = ['RI', 'Na', 'Mg', 'Si', 'K', 'Ca', 'Ba', 'Fe']
thresholds = [0.3, 0.4, 0.5, 0.6, 0.7]

# The target does not depend on the feature being tested, so read it once.
y = glass['household']

for ele in elements:
    # Double brackets keep the feature 2-D, as scikit-learn estimators require.
    X = glass[[ele]].values

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    model = LogisticRegression()
    model.fit(X_train, y_train)
    # Column 1 of predict_proba is the probability of the positive class.
    probs = model.predict_proba(X_test)[:, 1]

    for threshold in thresholds:
        preds = (probs >= threshold).astype(int)
        acc = accuracy_score(y_test, preds)
        # zero_division=0: a weak single feature may never push a probability
        # above the higher thresholds, leaving no positive predictions and an
        # undefined precision. Report 0 explicitly rather than triggering an
        # UndefinedMetricWarning on every such row.
        prec = precision_score(y_test, preds, zero_division=0)
        rec = recall_score(y_test, preds, zero_division=0)
        print(f"Element: {ele}, Threshold: {threshold:.2f}, Accuracy: {acc:.2f}, Precision: {prec:.2f}, Recall: {rec:.2f}")

**Q3. Fit a logistic regression model on all features. Remember to preprocess the data (e.g., normalization and one-hot encoding).**

In [None]:
# Q3: logistic regression on every feature, with scaling for numeric columns
# and one-hot encoding for the rest, wrapped in a single pipeline so the
# preprocessing is fitted on the training split only.
X = glass.drop(['Type', 'household'], axis=1)
y = glass['household']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Split the columns into "numeric" and "everything else" so that each feature
# lands in exactly one transformer. Selecting only float64/object would let
# ColumnTransformer (remainder='drop' by default) silently discard integer,
# boolean or categorical columns.
num_cols = X.select_dtypes(include='number').columns
cat_cols = X.select_dtypes(exclude='number').columns

num_pipeline = Pipeline([('scaler', StandardScaler())])
# handle_unknown='ignore': a category that only appears in the test split is
# encoded as all zeros instead of raising at predict time.
cat_pipeline = Pipeline([('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer([
    ('num', num_pipeline, num_cols),
    ('cat', cat_pipeline, cat_cols)
])

model = Pipeline([
    ('prep', preprocessor),
    ('lr', LogisticRegression())
])

model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))