In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

In [None]:
#email spam
data = pd.read_csv("D:/ai workforce/22 august practice/email_spam.csv")


X = data[["word_free", "word_offer", "word_click", "num_links", "num_caps", "sender_reputation"]]
y = data["is_spam"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)


scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


model = LogisticRegression()
model.fit(X_train, y_train)


y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:, 1]


print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1:", f1_score(y_test, y_pred))
print("roc-auc:", roc_auc_score(y_test, y_prob))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


Accuracy: 0.475
Precision: 0.2
Recall: 0.05555555555555555
F1: 0.08695652173913043
ROC-AUC: 0.31313131313131315
Confusion Matrix:
 [[18  4]
 [17  1]]


In [None]:

#customer churn 
data = pd.read_csv("D:/ai workforce/22 august practice/customer_churn.csv")


X = data[["tenure_months", "monthly_charges", "support_tickets", "is_premium", "avg_usage_hours"]]
y = data["churn"]


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)


s = StandardScaler()
X_train = s.fit_transform(X_train)
X_test = s.transform(X_test)


m = LogisticRegression()
m.fit(X_train, y_train)

y_pred = m.predict(X_test)
y_prob = m.predict_proba(X_test)[:, 1]


print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1:", f1_score(y_test, y_pred))
print("roc-auc:", roc_auc_score(y_test, y_prob))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


Accuracy: 0.525
Precision: 0.5416666666666666
Recall: 0.6190476190476191
F1: 0.5777777777777777
roc-auc: 0.4736842105263158
Confusion Matrix:
 [[ 8 11]
 [ 8 13]]


In [None]:
#disease stage 
from sklearn.metrics import classification_report

data = pd.read_csv("D:/ai workforce/22 august practice/disease_stage.csv")

X = data[["age", "b1", "b2", "b3", "b4"]]
y = data["stage"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)


scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

m = LogisticRegression(multi_class="multinomial", max_iter=1000)
m.fit(X_train, y_train)

y_pred = m.predict(X_test)


print("Accuracy:", accuracy_score(y_test, y_pred))
print("Macro-F1:", f1_score(y_test, y_pred, average="macro"))
print("Weighted-F1:", f1_score(y_test, y_pred, average="weighted"))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.325
Macro-F1: 0.2833333333333333
Weighted-F1: 0.29500000000000004
Confusion Matrix:
 [[9 3 3]
 [6 1 6]
 [6 3 3]]
Classification Report:
               precision    recall  f1-score   support

           0       0.43      0.60      0.50        15
           1       0.14      0.08      0.10        13
           2       0.25      0.25      0.25        12

    accuracy                           0.33        40
   macro avg       0.27      0.31      0.28        40
weighted avg       0.28      0.33      0.30        40





In [22]:

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix



df = pd.read_csv("flowers.csv")


X = df[["sepal_length", "sepal_width", "petal_length", "petal_width"]]
y = df["species"]


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0, stratify=y)


scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


k_list = list(range(1, 26, 2))  # odd numbers 1,3,5,...25
cv_scores = []

for k in k_list:
    knn = KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(knn, X_train, y_train, cv=5)
    cv_scores.append(scores.mean())

best_k = k_list[np.argmax(cv_scores)]


knn = KNeighborsClassifier(n_neighbors=best_k)
knn.fit(X_train, y_train)


y_pred = knn.predict(X_test)


print("Best k:", best_k)
print("CV Accuracy:", max(cv_scores))
print("Test Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


Best k: 17
CV Accuracy: 0.4125
Test Accuracy: 0.325
Confusion Matrix:
 [[7 3 4]
 [7 1 5]
 [6 2 5]]


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

df = pd.read_csv("airbnb.csv")


X = df[["size_m2", "distance_center_km", "rating", "num_reviews"]]
y = df["price"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)


scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


k_list = list(range(1, 26, 2))
cv_rmse = []

for k in k_list:
    knn = KNeighborsRegressor(n_neighbors=k)
    scores = cross_val_score(knn, X_train, y_train, cv=5,
                             scoring="neg_root_mean_squared_error")
    cv_rmse.append(-scores.mean())  # make positive RMSE


best_k = k_list[np.argmin(cv_rmse)]


knn = KNeighborsRegressor(n_neighbors=best_k)
knn.fit(X_train, y_train)


y_pred = knn.predict(X_test)


print("Best k:", best_k)
print("CV RMSE:", min(cv_rmse))
print("Test RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))
print("Test R²:", r2_score(y_test, y_pred))


Best k: 25
CV RMSE: 137.69442189231899
Test RMSE: 129.06188111425982
Test R²: -0.16741187580272143
