# Breast Cancer Prediction - Comparing 3 types of input data (Basic, MinMaxScaled, StandardScaled)

```
Step 1. Data Load & EDA & Preprocessing
Step 2. Visualization
     2-a. Correlation Heatmap - raw data
     2-b. Correlation Heatmap - MinMaxScaled data
     2-c. Correlation Heatmap - StandardScaled data
Step 3. Modeling & Prediction
     3-a. Logistic Regression - raw data
     3-b. SGDClassifier       - raw data
     3-c. Logistic Regression - MinMaxScaled data
     3-d. SGDClassifier       - MinMaxScaled data
     3-e. Logistic Regression - StandardScaled data
     3-f. SGDClassifier       - StandardScaled data
Step 4. Conclusion
```

---
## Step 1. Data Load & EDA & Preprocessing

In [None]:
import pandas as pd
from sklearn import datasets

In [None]:
# Read the breast-cancer CSV into a DataFrame.
# NOTE(review): absolute Kaggle input path - presumably only resolves inside a
# Kaggle kernel with this dataset attached; confirm before running elsewhere.
df = pd.read_csv('/kaggle/input/breast-cancer-wisconsin-data/data.csv')

In [None]:
# Preview the frame exactly as loaded, before any columns are dropped.
df

In [None]:
# Remove the right-most column in place.
# The column is chosen by position, so running this cell a second time
# removes one more column.
df.drop(columns=df.columns[-1], inplace=True)

In [None]:
# Remove the left-most column in place.
# The column is chosen by position, so running this cell a second time
# removes one more column.
df.drop(columns=df.columns[0], inplace=True)

In [None]:
# Preview again to check the frame after the first and last columns were dropped.
df

In [None]:
# Summary statistics of the frame (count, mean, spread, quartiles).
df.describe()

In [None]:
# Column dtypes and non-null counts; the target has not been encoded yet at this point.
df.info()

In [None]:
# Distinct labels present in the target column.
df['diagnosis'].unique()

In [None]:
# Dry run of the label encoding: 'M' maps to 1, every other label maps to 0.
# Only the distinct encoded values are shown; df itself is not modified here.
{1 if label == 'M' else 0 for label in df['diagnosis']}

In [None]:
# Encode the target in place: 1 where the label is 'M', 0 for anything else.
# Running this cell again after encoding would turn every label into 0,
# because none of the encoded integers equals 'M'.
df['diagnosis'] = (df['diagnosis'] == 'M').astype('int64')

In [None]:
# Re-check column dtypes and non-null counts after encoding the target.
df.info()

---
## Step 2. Visualization

### 2-a. Correlation Heatmap - Raw Data

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
%matplotlib inline
# Suppress every warning from here on.
# NOTE(review): this also hides any warnings raised by the model fits later
# in the notebook - consider narrowing the filter.
warnings.filterwarnings(action='ignore')

In [None]:
# Mean of every remaining column, one row per diagnosis code.
df.groupby('diagnosis').mean()

In [None]:
# Heatmap of the per-class column means on the unscaled data.
raw_class_means = df.groupby('diagnosis').mean()
fig = plt.figure(figsize=(12, 4))
sns.heatmap(raw_class_means, cmap='Blues', ax=fig.gca())

### 2-b. Correlation Heatmap - MinMaxScaled Data

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Name of the first column, which the cells below split off as the target.
df.columns[0]

In [None]:
# Min-max scale every column except the first over the whole dataset; the
# result feeds the heatmap below, not the models.
scaler = MinMaxScaler()
temp_X = df.iloc[:, 1:]
temp_y = df.iloc[:, 0]
scaler.fit(temp_X)
scaled_temp_X = scaler.transform(temp_X)

# Wrap both pieces back into DataFrames so they can be joined column-wise.
scaled_X = pd.DataFrame(scaled_temp_X, columns=temp_X.columns)
scaled_y = temp_y.to_frame(name=df.columns[0])

In [None]:
# Put the scaled features and the untouched target side by side.
scaled_df = pd.concat((scaled_X, scaled_y), axis='columns')

In [None]:
# Preview the min-max scaled frame (features first, target last).
scaled_df

In [None]:
# Summary statistics of the min-max scaled frame.
scaled_df.describe()

In [None]:
# Heatmap of the per-class column means on the min-max scaled data.
minmax_class_means = scaled_df.groupby('diagnosis').mean()
fig = plt.figure(figsize=(12, 4))
sns.heatmap(minmax_class_means, cmap='Blues', ax=fig.gca());

### 2-c. Correlation Heatmap - StandardScaled Data

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Standardize every column except the first over the whole dataset; the
# result feeds the heatmap below, not the models.
StScaler = StandardScaler()
temp_y = df.iloc[:, 0]
temp_X = df.iloc[:, 1:]
StScaled_temp_X = StScaler.fit(temp_X).transform(temp_X)

# Rebuild DataFrames and append the untouched target as the last column.
StScaled_X = pd.DataFrame(StScaled_temp_X, columns=temp_X.columns)
temp_y = temp_y.to_frame(name=df.columns[0])
StScaled_df = pd.concat((StScaled_X, temp_y), axis='columns')

In [None]:
# Preview the standardized frame (features first, target last).
StScaled_df

In [None]:
# Mean of every standardized feature, one row per diagnosis code.
StScaled_df.groupby('diagnosis').mean()

In [None]:
# Heatmap of the per-class column means on the standardized data.
standard_class_means = StScaled_df.groupby('diagnosis').mean()
fig = plt.figure(figsize=(12, 4))
sns.heatmap(standard_class_means, cmap='Blues', ax=fig.gca());

---
## Step 3. Modeling & Prediction

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# First column is the target; every other column is a feature.
X, y = df.iloc[:, 1:], df.iloc[:, 0]

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=24,
)

### 3-a. Logistic Regression - raw data

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Baseline: logistic regression with default settings on the unscaled features.
lr = LogisticRegression().fit(X_train, y_train)

# Mean accuracy on each side of the split.
raw_lr_train_accuracy = lr.score(X_train, y_train)
raw_lr_test_accuracy = lr.score(X_test, y_test)
print('Train Score : {}'.format(raw_lr_train_accuracy))
print('Test Score : {}'.format(raw_lr_test_accuracy))

### 3-b. SGDClassifier - raw data

In [None]:
from sklearn.linear_model import SGDClassifier

# Sweep max_iter for SGDClassifier on the unscaled features, recording the
# train and test accuracy of every run so they can be plotted afterwards.
params = []
train_score_li = []
test_score_li = []
# Start below any attainable accuracy so the first run always becomes the
# incumbent. The previous 0.1 placeholder could have been reported as the
# "best score" even though no model had produced it.
best_test_score = float('-inf')
best_param = None

for i in range(100, 10000, 500):
    # Fixed seed (same value as the train/test split): SGDClassifier shuffles
    # the training data between epochs, so without a seed every execution of
    # this cell reports different scores and a different "best" max_iter.
    sgdc = SGDClassifier(max_iter=i, random_state=24)
    sgdc.fit(X_train, y_train)

    train_score = sgdc.score(X_train, y_train)
    test_score = sgdc.score(X_test, y_test)
    print('max iter = {}'.format(i))
    print('Train Score : {}'.format(train_score))
    print('Test Score : {}'.format(test_score))
    print('-----')
    params.append(i)
    train_score_li.append(train_score)
    test_score_li.append(test_score)

    # Strict comparison keeps the smallest max_iter among tied scores.
    # NOTE(review): the winner is selected on the test set, so the reported
    # best score is an optimistic estimate.
    if best_test_score < test_score:
        best_test_score = test_score
        best_param = i

In [None]:
# Accuracy versus max_iter for the raw-data SGD sweep:
# dashed red line is the train score, default-styled line is the test score.
raw_axes = plt.gca()
raw_axes.plot(params, train_score_li, '--r', label='Train Score')
raw_axes.plot(params, test_score_li, label='Test Score')
raw_axes.legend(loc='best')
plt.show()

In [None]:
# Report the winner of the raw-data SGD sweep.
for template, value in (
    ('Best Score(Test Set) : {}', best_test_score),
    ('Best parameter(max_iter) : {}', best_param),
):
    print(template.format(value))

### 3-c. Logistic Regression - MinMaxScaled data

In [None]:
# Learn the min/max of each feature from the training split only, then apply
# the same transformation to both splits so no test-set statistics leak in.
MMScaler = MinMaxScaler()
MMScaled_X_train = MMScaler.fit_transform(X_train)
MMScaled_X_test = MMScaler.transform(X_test)

# Build a fresh estimator instead of re-fitting the instance left over from
# the raw-data section, so this cell does not depend on that earlier state
# (this also matches how the StandardScaled section creates its model).
lr = LogisticRegression()
lr.fit(MMScaled_X_train, y_train)
print('Train Score : {}'.format(lr.score(MMScaled_X_train, y_train)))
print('Test Score : {}'.format(lr.score(MMScaled_X_test, y_test)))

### 3-d. SGDClassifier - MinMaxScaled data

In [None]:
# Sweep max_iter for SGDClassifier on the min-max scaled features, recording
# the train and test accuracy of every run so they can be plotted afterwards.
params_with_MMS = []
MMScaled_train_score_li = []
MMScaled_test_score_li = []
# Start below any attainable accuracy so the first run always becomes the
# incumbent. The previous 0.1 placeholder could have been reported as the
# "best score" even though no model had produced it.
best_test_score_with_MMS = float('-inf')
best_param_with_MMS = None

for i in range(100, 10000, 500):
    # Fixed seed (same value as the train/test split): SGDClassifier shuffles
    # the training data between epochs, so without a seed every execution of
    # this cell reports different scores and a different "best" max_iter.
    sgdc = SGDClassifier(max_iter=i, random_state=24)
    sgdc.fit(MMScaled_X_train, y_train)

    MMScaled_train_score = sgdc.score(MMScaled_X_train, y_train)
    MMScaled_test_score = sgdc.score(MMScaled_X_test, y_test)
    print('max iter = {}'.format(i))
    print('Train Score : {}'.format(MMScaled_train_score))
    print('Test Score : {}'.format(MMScaled_test_score))
    print('-----')
    params_with_MMS.append(i)
    MMScaled_train_score_li.append(MMScaled_train_score)
    MMScaled_test_score_li.append(MMScaled_test_score)

    # Strict comparison keeps the smallest max_iter among tied scores.
    # NOTE(review): the winner is selected on the test set, so the reported
    # best score is an optimistic estimate.
    if best_test_score_with_MMS < MMScaled_test_score:
        best_test_score_with_MMS = MMScaled_test_score
        best_param_with_MMS = i

In [None]:
# Accuracy versus max_iter for the min-max scaled SGD sweep:
# dashed red line is the train score, default-styled line is the test score.
minmax_axes = plt.gca()
minmax_axes.plot(params_with_MMS, MMScaled_train_score_li, '--r', label='Train Score')
minmax_axes.plot(params_with_MMS, MMScaled_test_score_li, label='Test Score')
minmax_axes.legend(loc='best')
plt.show()

In [None]:
# Report the winner of the min-max scaled SGD sweep.
for template, value in (
    ('Best Score(Test Set) : {}', best_test_score_with_MMS),
    ('Best parameter(max_iter) : {}', best_param_with_MMS),
):
    print(template.format(value))

### 3-e. Logistic Regression - StandardScaled data

In [None]:
# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits.
StScaler = StandardScaler().fit(X_train)
StScaled_X_train = StScaler.transform(X_train)
StScaled_X_test = StScaler.transform(X_test)

In [None]:
# Logistic regression with default settings on the standardized features.
lr = LogisticRegression().fit(StScaled_X_train, y_train)

# Mean accuracy on each side of the split.
standard_lr_train_accuracy = lr.score(StScaled_X_train, y_train)
standard_lr_test_accuracy = lr.score(StScaled_X_test, y_test)
print('Train Score : {}'.format(standard_lr_train_accuracy))
print('Test Score : {}'.format(standard_lr_test_accuracy))

### 3-f. SGDClassifier - StandardScaled data

In [None]:
# Sweep max_iter for SGDClassifier on the standardized features, recording
# the train and test accuracy of every run so they can be plotted afterwards.
params_with_StS = []
StScaled_train_score_li = []
StScaled_test_score_li = []
# Start below any attainable accuracy so the first run always becomes the
# incumbent. The previous 0.1 placeholder could have been reported as the
# "best score" even though no model had produced it.
best_test_score_with_StS = float('-inf')
best_param_with_StS = None

for i in range(100, 10000, 500):
    # Fixed seed (same value as the train/test split): SGDClassifier shuffles
    # the training data between epochs, so without a seed every execution of
    # this cell reports different scores and a different "best" max_iter.
    sgdc = SGDClassifier(max_iter=i, random_state=24)
    sgdc.fit(StScaled_X_train, y_train)

    StScaled_train_score = sgdc.score(StScaled_X_train, y_train)
    StScaled_test_score = sgdc.score(StScaled_X_test, y_test)
    print('max iter = {}'.format(i))
    print('Train Score : {}'.format(StScaled_train_score))
    print('Test Score : {}'.format(StScaled_test_score))
    print('-----')
    params_with_StS.append(i)
    StScaled_train_score_li.append(StScaled_train_score)
    StScaled_test_score_li.append(StScaled_test_score)

    # Strict comparison keeps the smallest max_iter among tied scores.
    # NOTE(review): the winner is selected on the test set, so the reported
    # best score is an optimistic estimate.
    if best_test_score_with_StS < StScaled_test_score:
        best_test_score_with_StS = StScaled_test_score
        best_param_with_StS = i

In [None]:
# Accuracy versus max_iter for the standardized SGD sweep:
# dashed red line is the train score, default-styled line is the test score.
standard_axes = plt.gca()
standard_axes.plot(params_with_StS, StScaled_train_score_li, '--r', label='Train Score')
standard_axes.plot(params_with_StS, StScaled_test_score_li, label='Test Score')
standard_axes.legend(loc='best')
plt.show()

In [None]:
# Report the winner of the standardized SGD sweep.
for template, value in (
    ('Best Score(Test Set) : {}', best_test_score_with_StS),
    ('Best parameter(max_iter) : {}', best_param_with_StS),
):
    print(template.format(value))

---
## Step 4. Conclusion

1. Logistic Regression
    - with Basic(Raw) Data
        Train Score : 0.94
        Test Score : 0.93
    - with MinMaxScaled Data
        Train Score : 0.97
        Test Score : 0.98
    - with StandardScaled Data
        Train Score : 0.98
        Test Score : 0.95



2. SGDClassifier
    - with Basic(Raw) Data
        Best Score(Test Set) : 0.93
        Best parameter(max_iter) : 7600
    - with MinMaxScaled Data
        Best Score(Test Set) : 0.97
        Best parameter(max_iter) : 600
    - with StandardScaled Data
        Best Score(Test Set) : 0.97
        Best parameter(max_iter) : 6600