In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.datasets import make_classification
from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel
from xgboost import XGBClassifier
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd 

import os
# List every file found under the Kaggle input directory.
for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))


In [None]:
# Read the CSV into a DataFrame and preview its first rows.
csv_path = '/kaggle/input/breast-cancer-wisconsin-data/data.csv'
data = pd.read_csv(csv_path)
data.head()

In [None]:
# Show the frame's dimensions as (rows, columns).
data.shape

In [None]:
# Print the per-column overview pandas provides (names, non-null counts, dtypes).
data.info()

In [None]:
# Summary statistics of the columns pandas treats as numeric.
data.describe()

## Exploratory Data Analysis

In [None]:
# Count the missing values in each column.
data.isna().sum()

In [None]:
# Remove the 'id' and 'Unnamed: 32' columns; neither is used as a feature.
# (presumably an identifier and an empty artefact column - confirm against the
# null counts shown above)
# Reassigning with errors='ignore' instead of inplace=True keeps this cell safe
# to re-run: a second execution no longer raises KeyError for the missing labels.
data = data.drop(columns=['id', 'Unnamed: 32'], errors='ignore')

In [None]:
# Number of samples per diagnosis label.
data.value_counts(subset=['diagnosis'])

In [None]:
# Bar chart of how many samples carry each diagnosis label.
diagnosis_counts = data[['diagnosis']].value_counts()
ax = diagnosis_counts.plot.bar(figsize=(8, 6), title="Diagnosis Counts")
ax.set_xlabel("Benign & Malignant")
ax.set_ylabel("Frequency")

In [None]:
# Encode the target as integers: 'B' -> 1 and 'M' -> 0.
# Values that are not keys of the mapping pass through unchanged, so re-running
# this cell on already-encoded data is a no-op. A plain dict .map() would turn
# every (already numeric) label into NaN on a second run.
diagnosis_codes = {'B': 1, 'M': 0}
data['diagnosis'] = data['diagnosis'].map(
    lambda label: diagnosis_codes.get(label, label)
)

In [None]:
# Annotated heatmap of the pairwise correlations between all columns.
corr_matrix = data.corr()
plt.figure(figsize=(16, 9))
sns.heatmap(corr_matrix, annot=True)
plt.title("Correlation between Features", fontsize=23)
plt.show()

In [None]:
# Correlation of every column with the target, largest value first.
diagnosis_corr = data.corr()['diagnosis']
diagnosis_corr.sort_values(ascending=False)

In [None]:
# One histogram per column.
# DataFrame.hist needs a whole grid of subplots; handing it a single Axes (as
# this cell used to) makes pandas emit a UserWarning and clear that figure.
# Letting hist() create its own figure gives the same plot without the warning.
axs = data.hist(figsize=(15, 8))
fig = plt.gcf()  # keep `fig` bound to the histogram figure, as before
plt.tight_layout()

## POPPING OUT 19 ROWS FROM THE DATAFRAME

In [None]:
def pop(df, values, axis=1):
    """Remove rows or a column from ``df`` in place and return what was removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify; it is mutated in place.
    values : label, or list/tuple of labels
        With ``axis=0``: the index label(s) of the rows to remove. A single
        label must be an integer (built-in ``int`` or a NumPy integer).
        With ``axis=1``: the column label, handed straight to ``DataFrame.pop``.
    axis : {0, 1}, default 1
        0 pops rows, 1 pops a column.

    Returns
    -------
    pandas.DataFrame or pandas.Series
        The removed rows as a DataFrame (``axis=0``), or whatever
        ``DataFrame.pop`` returns for the removed column (``axis=1``).

    Raises
    ------
    TypeError
        ``axis=0`` and ``values`` is not a list, tuple or integer.
    ValueError
        ``axis`` is neither 0 nor 1.
    """
    import numbers  # local import: only needed for the integer check below

    if axis == 1:
        return df.pop(values)
    if axis != 0:
        raise ValueError('axis needs to be 0 (rows) or 1 (column).')

    if isinstance(values, (list, tuple)):
        # Always index with a list: .loc would read a tuple as (row, column).
        row_labels = list(values)
    elif isinstance(values, numbers.Integral):
        # numbers.Integral also covers NumPy integers such as index entries.
        row_labels = [values]
    else:
        raise TypeError('values parameter needs to be a list, tuple or int.')

    # List-based selection returns a DataFrame that keeps the column dtypes,
    # even for a single row.
    popped_rows = df.loc[row_labels]
    df.drop(row_labels, axis=0, inplace=True)
    return popped_rows

In [None]:
# Move 19 rows (index labels 0-10 and 12-19) out of `data` into a hold-out frame.
# NOTE(review): label 11 is skipped - confirm this is intentional.
# pop() removes the rows from `data` in place, so running this cell a second
# time fails with KeyError because the labels are already gone.
holdout_labels = [*range(0, 11), *range(12, 20)]
poped_values = pop(data, holdout_labels, axis=0)
poped_values

## Feature Selection - Dimensionality Reduction

In [None]:
# Every column except the target is a candidate feature.
feature_cols = [col for col in data.columns if col != 'diagnosis']

In [None]:
# Features and target taken from the rows still present in `data`.
X = data.loc[:, feature_cols]
y = data.loc[:, 'diagnosis']

# The same feature/target pair for the rows popped out earlier.
test_sample = poped_values.loc[:, feature_cols]
test_result = poped_values.loc[:, 'diagnosis']

In [None]:
# Show every column when a wide frame is displayed.
pd.set_option('display.max_columns', None)

# L1-based feature selection: keep the features whose Lasso coefficient is
# not shrunk to (near) zero.
#
# The L1 penalty is scale-sensitive, so the features are standardised first.
# On the raw columns the solver had only been silenced with tol=30.29..., which
# is several orders of magnitude above scikit-learn's default of 1e-4 and makes
# coordinate descent stop almost immediately. With standardised inputs the
# default tolerance is used instead.
#
# NOTE(review): the selector is fitted on all rows of `data`, i.e. before the
# train/test split further down, so rows that end up in the test split
# influence which features are kept.
lasso_inputs = data[feature_cols]
feature_scale = lasso_inputs.std().replace(0, 1)  # leave constant columns unscaled
lasso_inputs = (lasso_inputs - lasso_inputs.mean()) / feature_scale

feature_sel_model = SelectFromModel(Lasso(alpha=0.005, random_state=23, max_iter=3000))
feature_sel_model.fit(lasso_inputs, data['diagnosis'])

In [None]:
# Boolean mask from the fitted selector: True marks a feature it kept.
feature_sel_model.get_support()

In [None]:
# Summarise the outcome of the feature selection.

# Column names the selector kept, picked out with its boolean support mask.
support_mask = feature_sel_model.get_support()
selected_feat = data[feature_cols].columns[support_mask]

# Report how many features went in and how many survived.
n_total = data[feature_cols].shape[1]
n_selected = len(selected_feat)
print('total features: {}'.format(n_total))
print('selected features: {}'.format(n_selected))
print(selected_feat)
# A count of zeroed coefficients used to be sketched here as commented-out code,
# but it referenced a `sel_` object that is not defined in the visible cells.

In [None]:
# Narrow both feature matrices to the selected columns; the targets are unchanged.
X = X.loc[:, selected_feat]
y = data.loc[:, 'diagnosis']

test_sample = poped_values.loc[:, selected_feat]
test_result = poped_values.loc[:, 'diagnosis']

### Modelling

In [None]:
# 70/30 train/test split.
# random_state pins the shuffle so every score below reproduces after
# "Restart & Run All"; without it each run produced different numbers.
# stratify=y keeps the class ratio the same in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

In [None]:
# Model 1: a small random forest (10 trees, entropy split criterion).
first_model = RandomForestClassifier(
    n_estimators=10,
    criterion='entropy',
    random_state=42,
)
first_model.fit(X_train, y_train)

# Predictions for both splits; only the test-set predictions are scored here.
preds = first_model.predict(X_train)
pred_y = first_model.predict(X_test)

test_accuracy = accuracy_score(y_test, pred_y)
print("Accuracy:", test_accuracy)

# Testing our Popped Data on Our Model

In [None]:
# Score the first model on the rows that were popped out of `data`.
new_preds = first_model.predict(test_sample)
holdout_accuracy = accuracy_score(test_result, new_preds)
print("Accuracy:", holdout_accuracy)

In [None]:
# Gradient boosting built from depth-1 trees.
gbrt = GradientBoostingClassifier(max_depth=1, random_state=0)
gbrt.fit(X_train, y_train)

# Accuracy on both splits, to see how far training and test scores diverge.
train_score = gbrt.score(X_train, y_train)
test_score = gbrt.score(X_test, y_test)
print("Accuracy on training set:", train_score)
print("Accuracy on test set:", test_score)

## Model 2

In [None]:
# Model 2: a decision tree limited to depth 3.
second_model = DecisionTreeClassifier(random_state=42, max_depth=3)
second_model.fit(X_train, y_train)

pred_n = second_model.predict(X_test)
tree_accuracy = accuracy_score(y_test, pred_n)
print(tree_accuracy)

In [None]:
# Test 2: score model 2 (the decision tree) on the popped hold-out rows.
# This cell used to call first_model.predict, so it only repeated the
# random-forest hold-out score and never evaluated second_model.
next_preds = second_model.predict(test_sample)
print("Accuracy:", accuracy_score(test_result, next_preds))

## Model 3

In [None]:
# Model 3: random forest trained on the train/test split that is already in scope.
# (A fresh split with random_state=0 was commented out here and is not used.)
# NOTE(review): the hyper-parameters and data are the same as for the first
# random forest, and this cell rebinds first_model / pred_y / preds. Confirm
# whether a different configuration was intended.
first_model = RandomForestClassifier(
    n_estimators=10,
    criterion='entropy',
    random_state=42,
)
first_model.fit(X_train, y_train)

# Predictions for both splits; only the test-set predictions are scored here.
preds = first_model.predict(X_train)
pred_y = first_model.predict(X_test)

test_accuracy = accuracy_score(y_test, pred_y)
print("Accuracy:", test_accuracy)

## Model 4

In [None]:
# Model 4: logistic regression on min-max normalised features.

# Scale every feature column to [0, 1].
# The column-wise X.min() / X.max() are used explicitly: on recent pandas (2.x)
# np.min / np.max of a DataFrame reduce over both axes to one scalar, and the
# previous `.values` call then fails on that scalar.
# NOTE(review): the scaling statistics come from all rows of X, so the test
# split contributes to them.
col_min = X.min()
col_range = (X.max() - col_min).replace(0, 1)  # constant column: avoid 0/0
Xx = (X - col_min) / col_range

# Seeded, stratified split so the reported accuracies are reproducible.
# This rebinds X_train / X_test / y_train / y_test, as the cell did before.
X_train, X_test, y_train, y_test = train_test_split(
    Xx, y, test_size=0.3, random_state=42, stratify=y
)

# Fit once and score that same fitted model on both splits (the model used to
# be refitted for each print).
logreg = linear_model.LogisticRegression(random_state=42, max_iter=200)
logreg.fit(X_train, y_train)
print("test accuracy: {}% ".format(logreg.score(X_test, y_test) * 100))
print("train accuracy: {}%".format(logreg.score(X_train, y_train) * 100))