# **Importing libraries**

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

# **Loading data**

In [None]:
# Read the raw BRCA table from the Kaggle dataset mount point.
data_raw = pd.read_csv('/kaggle/input/breastcancerdataset/BRCA.csv')

# Remove the columns that are not used as features *before* filtering missing
# values: the columns at positions 8-14 of the raw file plus the patient
# identifier. Filtering first would discard rows whose only gaps are in columns
# that are thrown away anyway.
unused_columns = list(data_raw.columns[8:15]) + ['Patient_ID']

# `data` keeps only complete rows over the remaining columns.
data = data_raw.drop(columns=unused_columns).dropna()

# **Data preprocessing**

In [None]:
# One LabelEncoder is re-fitted for each column below, so after this cell `le`
# only holds the classes of the last column it was fitted on (Tumour_Stage).
le = LabelEncoder()

# Transform categorical data into integer codes.
data['Patient_Status'] = le.fit_transform(data['Patient_Status'])
data['Gender'] = le.fit_transform(data['Gender'].astype(str))

# Extract features (every column but the last) and labels (the last column).
# The explicit copy makes X an independent frame, so the column assignment
# further down does not write into a slice of `data` (which can trigger
# pandas' SettingWithCopyWarning).
X = data.iloc[:, :-1].copy()
Y = data.iloc[:, -1]

# Feature variant with Tumour_Stage one-hot ("dummy") encoded. The column is
# named explicitly so get_dummies never touches, or fails on, any other
# non-numeric column that might be present.
X_dummies = pd.get_dummies(X, columns=['Tumour_Stage'], prefix=['Tumour_Stage'])

# Feature variant with Tumour_Stage label encoded into a single integer column.
X['Tumour_Stage'] = le.fit_transform(X['Tumour_Stage'].astype(str))

# Split into training and testing sets. Both variants use the same test size
# and seed, so they are partitioned along the same rows.
split_options = dict(test_size=0.151, random_state=42)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, **split_options)
X_dummies_train, X_dummies_test, Y_dummies_train, Y_dummies_test = train_test_split(
    X_dummies, Y, **split_options
)

# **Training the model**

In [None]:
# Fixed seed for the decision trees. scikit-learn permutes the candidate
# features at every split, so without a seed two runs of this cell can grow
# different trees and report different accuracies.
TREE_SEED = 42

## Train models
# max_iter is raised far above scikit-learn's default iteration limit.
log_reg = LogisticRegression(max_iter = 7777)
log_reg.fit(X_train, Y_train)
tree = DecisionTreeClassifier(random_state=TREE_SEED)
tree.fit(X_train, Y_train)

## Train models with dummies
# Same two estimators, fitted on the one-hot encoded feature variant.
log_reg_dummies = LogisticRegression(max_iter = 7777)
log_reg_dummies.fit(X_dummies_train, Y_dummies_train)
tree_dummies = DecisionTreeClassifier(random_state=TREE_SEED)
tree_dummies.fit(X_dummies_train, Y_dummies_train)

# **Evaluate the models**

In [None]:
## Evaluate models
# Score each fitted model on its own held-out split and scale to a percentage.
log_reg_acc = log_reg.score(X_test, Y_test) * 100
tree_acc = tree.score(X_test, Y_test) * 100
log_reg_acc_dummies = log_reg_dummies.score(X_dummies_test, Y_dummies_test) * 100
tree_acc_dummies = tree_dummies.score(X_dummies_test, Y_dummies_test) * 100

# Report line templates paired with the value each one displays, in print order.
accuracy_report = (
    ("Logistic Regression: {:.4f}%", log_reg_acc),
    ("Decision Tree Classifier: {:.4f}%", tree_acc),
    ("Logistic Regression with dummies: {:.4f}%", log_reg_acc_dummies),
    ("Decision Tree Classifier with dummies: {:.4f}%", tree_acc_dummies),
)
for line_template, accuracy in accuracy_report:
    print(line_template.format(accuracy))

# **Conclusion**
As far as my knowledge and experience in ML go, this dataset looks like a good candidate for classification models such as logistic regression. I used only two kinds of algorithms to predict the desired label from the data and, given my current experience in ML, data science and statistics, I achieved 83% accuracy with the Logistic Regression model, among the models I tried. Concerns, corrections, criticism, tips and tricks are all welcome - please leave them in the comments.