# Importing Libraries

In [None]:
# for data manipulation
import pandas as pd 
import numpy as np 

# for visualization 
import matplotlib.pyplot as plt 
import seaborn as sns 

# for preprocessing 
from sklearn.preprocessing import StandardScaler,MinMaxScaler,LabelEncoder,OneHotEncoder
from sklearn.impute import SimpleImputer

# to split data 
from sklearn.model_selection import train_test_split, KFold 

# for evaluation
from sklearn.metrics import accuracy_score , precision_score ,recall_score , f1_score
from sklearn.metrics import confusion_matrix , classification_report 


# classification models 
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression



# change setting of notebook 
# Show every column and every row when a DataFrame is displayed
# (None disables pandas' truncation limits).
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

# Global seaborn look used by the plots in this notebook.
sns.set_style("darkgrid")
sns.set_palette("coolwarm")

# Data loading

In [None]:
# Read the breast-cancer CSV from the Kaggle input directory and preview it.
DATA_PATH = '/kaggle/input/breast-cancer/Breast_Cancer.csv'
df = pd.read_csv(DATA_PATH)
df.head()

# EDA

### Understanding the data

In [None]:
# Print the column names, non-null counts and dtypes of the dataset.
df.info()

### Check NAN values 

In [None]:
# Number of missing (NaN) values in each column.
df.isna().sum()

### Check & Drop  duplicates

In [None]:
# Number of rows that are exact duplicates of an earlier row.
df.duplicated().sum()

In [None]:
# Remove duplicated rows in place and renumber the index 0..n-1 in one call.
df.drop_duplicates(inplace=True, ignore_index=True)


# Describe

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the dataset.
df.describe()

In [None]:
# Plot the distribution (histogram + KDE) of every integer column.
# NOTE: sns.displot is a figure-level function that always creates its own
# figure, so a preceding plt.figure(...) call only produced an extra empty
# canvas; it has been removed.
for col in df.select_dtypes('int'):
    sns.displot(df[col], kde=True)
    # The figure created by displot is the current one, so this titles it.
    plt.title(col)

plt.show()

In [None]:

# Draw one pie chart per categorical (object-dtype) column showing the share
# of each category. value_counts() already sorts in descending order.
for col in df.select_dtypes('object'):
    category_counts = df[col].value_counts()
    category_counts.plot(kind='pie', autopct='%1.1f%%')
    plt.title(col)
    plt.show()


# What is the marital status of the deceased patients?


In [None]:
# Count the marital status of patients whose Status is 'Dead',
# most frequent first, and show the counts as a labelled bar chart.
is_dead = df['Status'] == 'Dead'
deads = df.loc[is_dead, 'Marital Status'].value_counts(ascending=False)

ax = sns.barplot(x=deads.index, y=deads.values)
# Write the exact count on top of each bar.
ax.bar_label(ax.containers[0], fontsize=13)
plt.show()

# Encoding 

In [None]:
# Replace every categorical (object-dtype) column with integer codes.
# LabelEncoder expects a 1-D array; passing the Series df[col] (instead of the
# one-column DataFrame df[[col]]) avoids sklearn's DataConversionWarning.
encoder = LabelEncoder()
# Keep the original category names per column so the integer codes can be
# interpreted later (position in the array == encoded value).
label_classes = {}
for col in df.select_dtypes("object"):
    df[col] = encoder.fit_transform(df[col])
    label_classes[col] = encoder.classes_


# Get correlation 

In [None]:
# Pairwise correlation of the numeric columns, drawn as an annotated heatmap.
corr_matrix = df.corr(numeric_only=True)

plt.figure(figsize=(20, 10))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
plt.show()

# Scaling data 

In [None]:
# Rescale every feature column to the [0, 1] range.
# The target column "Status" is left untouched: it is a class label, not a
# feature, and must not be transformed. MinMaxScaler works per column, so a
# single fit on all features gives the same values as fitting column by column
# while leaving `scaler` usable for the whole feature set afterwards.
# NOTE(review): the scaler is fitted before the train/test split, so test-set
# statistics leak into the training features; consider fitting on the training
# split only.
scaler = MinMaxScaler()
feature_cols = df.columns.drop("Status")
df[feature_cols] = scaler.fit_transform(df[feature_cols])

# Split data

In [None]:
# Separate the target ("Status") from the feature matrix and preview the features.
target_name = "Status"
y = df[target_name]
x = df.drop(columns=target_name)
x.head()

# Training models 

In [None]:
# Hold out 20% of the rows for testing. The split is shuffled with a fixed
# seed and stratified on the target so both parts keep the class proportions.
split_options = dict(test_size=0.2, shuffle=True, random_state=43, stratify=y)
x_train, x_test, y_train, y_test = train_test_split(x, y, **split_options)


In [None]:
# Seed shared by all stochastic estimators so that repeated runs give the same
# scores (the train/test split above is seeded with the same value).
RANDOM_STATE = 43

# Candidate classifiers, keyed by the display name used in the result tables.
model_dic = {
    "Decision Tree": DecisionTreeClassifier(criterion='entropy', random_state=RANDOM_STATE),
    'logistic Regression': LogisticRegression(),
    'Random Forest': RandomForestClassifier(n_estimators=10, random_state=RANDOM_STATE),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, random_state=RANDOM_STATE),
    "KNN": KNeighborsClassifier(n_neighbors=3),
    "naive Bayes": GaussianNB(),
    "SVC": SVC(kernel='linear'),
}

# Test-set metrics per model: {model name: {metric name: value}}.
acc_dic = {}

for model_name, model in model_dic.items():
    # Fit on the training split, then score the predictions on the test split.
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)

    # NOTE: the 'accuracy_score ' key keeps its trailing space because the
    # plotting cell further down looks the metric up under exactly this name.
    acc_dic[model_name] = {
        'accuracy_score ': accuracy_score(y_test, y_pred),
        "precision_score": precision_score(y_test, y_pred),
        "recall_score": recall_score(y_test, y_pred),
        "f1_score": f1_score(y_test, y_pred),
    }
    

In [None]:
# Turn the nested metrics dict into a table (rows = metrics, columns = models)
# and show it as an annotated heatmap with black cell borders.
acc_dic = pd.DataFrame(data=acc_dic)

heatmap_options = dict(cmap="coolwarm", annot=True, linecolor="black", linewidths=1)
sns.heatmap(acc_dic, **heatmap_options)

In [None]:
# Grouped bar chart: one group per model, one bar per metric.
sns.set_palette("afmhot")

# Metric names exactly as stored in acc_dic (note the trailing space in the
# accuracy key, which matches the key written by the evaluation cell).
metrics = ['accuracy_score ', 'precision_score', 'recall_score', 'f1_score']

models = list(acc_dic.keys())

# For every metric, the list of its values in model order.
values = {metric: [acc_dic[model][metric] for model in models] for metric in metrics}

# Left edge of each model's group of bars.
# Deliberately NOT named `x`: that name holds the feature matrix, and
# overwriting it would break the train/test split cell if it is re-run.
bar_positions = np.arange(len(models))
width = 0.2  # the width of the bars

fig, ax = plt.subplots(figsize=(10, 9))

# Shift each metric's bars by one bar width so they sit side by side.
for metric_idx, metric in enumerate(metrics):
    ax.bar(bar_positions + metric_idx * width, values[metric], width, label=metric)

# Axis labels, title and model names centred under each group of bars.
ax.set_xlabel('Models')
ax.set_ylabel('Scores')
ax.set_title('Model Performance Comparison')
ax.set_xticks(bar_positions + width * (len(metrics) - 1) / 2)
ax.set_xticklabels(models)
ax.legend()

fig.tight_layout()

plt.show()


# Check train accuracy and test accuracy

In [None]:
# Compare each model's score on the training and test splits; a large gap
# between the two suggests over-fitting.
# `score` returns the mean accuracy for classifiers (higher is better), so the
# values are reported as accuracy rather than error.
# The estimators were already fitted in the cell that builds `model_dic`, so
# they are scored as-is; this keeps the numbers consistent with the metrics
# computed there.
for model_name, fitted_model in model_dic.items():
    print('-'*50)
    print(f'The train accuracy of {model_name}: {fitted_model.score(x_train, y_train)}')
    print(f'The test accuracy of {model_name}: {fitted_model.score(x_test, y_test)}')
   