In [170]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score

##importing data

In [171]:
import pandas as pd
disease = pd.read_csv("data.csv")
disease.head()

Unnamed: 0,Diagnosis,radius,texture,perimeter,area,smoothness,compactness,concavity,concave_points,symmetry,...,radius.2,texture.2,perimeter.2,area.2,smoothness.2,compactness.2,concavity.2,concave_points.2,symmetry.2,fractal_dimension.2
0,M,17,99,10,38,122,8,1001,0,1184,...,95,0,9053,8,589,153,4,0,6399,0
1,M,20,57,17,77,132,9,1326,0,8474,...,5435,0,7339,3,398,74,8,0,5225,0
2,M,19,69,21,25,130,1203,0,1096,0,...,0,7869,4,585,94,3,0,615,0,4006
3,M,11,42,20,38,77,58,386,1,0,...,0,4956,1,156,3,445,27,23,0,911
4,M,20,29,14,34,135,1,1297,0,1003,...,7572,0,7813,5,438,94,44,0,1149,0


##checking whether the data is balanced or not

In [172]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
disease_counts = disease["Diagnosis"].value_counts()
temp_df = pd.DataFrame({
    "disease": disease_counts.index,
    "counts": disease_counts.values
})
#plt.figure(figsize =(18,8))
#sns.barplot(x="disease", y="counts", data=temp_df)
#plt.xticks(rotation=90)
#plt.show()

##encoding label

In [173]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
disease['Diagnosis'] = label_encoder.fit_transform(disease['Diagnosis'])
#corr_matrix = disease.corr()
#corr_matrix['Diagnosis'].sort_values(ascending=False)

##train_test_splitting

In [174]:
from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(disease, test_size=0.2, random_state=42)
print(f" Rows in train set: {len(train_set)}\n Rows in test set: {len(test_set)}\n")
disease = train_set.copy()

 Rows in train set: 455
 Rows in test set: 114



In [200]:
#print(train_set.shape[1])  # Features in the training data
#print(test_set.shape[1])

In [199]:
disease = train_set.drop("Diagnosis", axis=1)
diagnosis = train_set["Diagnosis"].copy()
#print(disease.shape[1])  # Features in the training data


##creating a pipeline

In [176]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
my_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('scalar', StandardScaler())
])
disease_tr = my_pipe.fit_transform(disease)

##model generation

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score

#defining scoring metric for k-fold cross validation
def cv_scoring(estimator, disease_tr,diagnosis):
  return accuracy_score(diagnosis, estimator.predict(disease_tr))


#selecting model
models={
    "logistic_regression": LogisticRegression(max_iter=1000,class_weight='balanced'),
    "decision_tree": DecisionTreeClassifier(class_weight='balanced',),
    "random_forest": RandomForestClassifier(class_weight='balanced', random_state=18)    
}

for i in range(len(list(models))):
  model = list(models.values())[i]
  model.fit(disease_tr,diagnosis)

#making predictions
y_train_predict = model.predict(disease_tr)
print(list(models.keys())[i])
scores(diagnosis, y_train_predict)


logistic_regression
model performance for training set: 
accuracy: 0.9231
f1 score: 0.9232
precision: 0.8895
recall: 0.9053
roc_auc_score: 0.9194
decision_tree
model performance for training set: 
accuracy: 1.0000
f1 score: 1.0000
precision: 1.0000
recall: 1.0000
roc_auc_score: 1.0000
random_forest
model performance for training set: 
accuracy: 1.0000
f1 score: 1.0000
precision: 1.0000
recall: 1.0000
roc_auc_score: 1.0000


In [214]:
for i in range(len(list(models))):
  model = list(models.values())[i]
  cross_val_score(model, disease_tr, diagnosis, cv=10, n_jobs=-1, scoring=cv_scoring())
  print("=="*30)
  print(model[i])
  print(f"scores: {scores}")
  print(f"mean scores: {np.mean(scores)}")

NameError: name 'cv_scoring' is not defined

##evaluation score function

In [179]:
def scores(ac_value, pred_value):
    #evaluating training model
    model_train_accuracy = accuracy_score(ac_value, pred_value)
    model_train_f1 = f1_score(ac_value, pred_value, average='weighted')
    model_train_precision = precision_score(ac_value, pred_value, zero_division=0)
    model_train_recall = recall_score(ac_value, pred_value)
    model_train_rocauc_score = roc_auc_score(ac_value, pred_value)

    print("model performance for training set: ")
    print("accuracy: {:.4f}".format(model_train_accuracy))
    print("f1 score: {:.4f}".format(model_train_f1))
    print("precision: {:.4f}".format(model_train_precision))
    print("recall: {:.4f}".format(model_train_recall))
    print("roc_auc_score: {:.4f}".format(model_train_rocauc_score))

##testing data

In [205]:
x_test = test_set.drop("Diagnosis", axis=1)
y_test = test_set["Diagnosis"].copy()
x_test_prepared = my_pipe.transform(x_test) 
for i in range(len(list(models))):
 model = list(models.values())[i]
 final_predictions = model.predict(x_test_prepared)
 print(list(models.keys())[i])
 scores(y_test, final_predictions)



logistic_regression
model performance for training set: 
accuracy: 0.9474
f1 score: 0.9474
precision: 0.9302
recall: 0.9302
roc_auc_score: 0.9440
decision_tree
model performance for training set: 
accuracy: 1.0000
f1 score: 1.0000
precision: 1.0000
recall: 1.0000
roc_auc_score: 1.0000
random_forest
model performance for training set: 
accuracy: 1.0000
f1 score: 1.0000
precision: 1.0000
recall: 1.0000
roc_auc_score: 1.0000


In [204]:
print(final_predictions)

[0 1 1 0 0 1 1 1 0 0 0 1 0 1 0 1 0 0 0 1 1 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 1
 0 1 0 0 1 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 1 1 0 0 1 1 0 0 0 1 1 0 0 1 1 0 1
 0 0 0 1 0 0 1 0 1 1 1 1 1 1 0 0 0 0 0 0 0 0 1 1 0 1 1 0 1 1 0 0 0 1 0 0 1
 0 0 1]


In [211]:
print(list(y_test))

[0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1]
