In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Load the breast-cancer CSV from the relative input directory into a DataFrame.
df = pd.read_csv('../input/breast-cancer/Breast_cancer_data.csv')

In [None]:
# Preview the first rows to check that the file was parsed as expected.
df.head()

In [None]:
# Column names, dtypes and non-null counts.
df.info()

## Univariate Analysis

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) per numeric column.
df.describe()

### Histogram
In the univariate analysis, we use histograms for analyzing and visualizing frequency distribution.

In [None]:
# One histogram per numeric column. The enlarged figure and tight layout keep
# the subplot titles and tick labels from overlapping.
df.hist(figsize=(12, 10))
plt.tight_layout()
# Render the figure (a bare plt.plot() draws nothing).
plt.show()

## Bivariate Analysis
It will show the relationship between two variables.

In [None]:
import seaborn as sns

In [None]:
# Pairwise scatter plots of all features, coloured by diagnosis class.
# seaborn only applies `palette` when a `hue` variable is given, so `hue` is
# required for the pastel colours to have any effect.
sns.pairplot(data=df, hue='diagnosis', palette='pastel')

## Missing Values

In [None]:
# Number of missing values in each column.
df.isnull().sum()

Our dataset has no missing values, which is good for our model.

## Outliers

In [None]:
# Horizontal box plots of every column on one large figure to spot outliers.
sns.set_theme(style="whitegrid")
plt.figure(figsize=(15,12))
sns.boxplot(data =df, orient="h")

Looking at the box plot, the variable *mean_area* appears to contain outliers. These outlier values need to be treated.

In [None]:
def remove_outliers_using_quantiles(qu_dataset, qu_field, qu_fence):
    """Print IQR-based outlier diagnostics for one column and filter the frame.

    Parameters
    ----------
    qu_dataset : pandas.DataFrame
        Frame to inspect; it is not modified.
    qu_field : str
        Name of the column whose quartiles define the fences.
    qu_fence : str
        "inner" keeps rows within 1.5 * IQR of the quartiles, "outer" keeps
        rows within 3 * IQR; any other value returns ``qu_dataset`` unchanged.

    Returns
    -------
    pandas.DataFrame
        The rows of ``qu_dataset`` inside the selected fences (bounds
        inclusive), or ``qu_dataset`` itself when no fence is selected.
    """
    column = qu_dataset[qu_field]
    summary = column.describe()
    q1, q3, n_valid = summary["25%"], summary["75%"], summary["count"]

    iqr = q3 - q1
    print("interquartile range:", iqr)

    inner_low, inner_high = q1 - 1.5 * iqr, q3 + 1.5 * iqr
    print("upper_inner_fence:", inner_high)
    print("lower_inner_fence:", inner_low)

    outer_low, outer_high = q1 - 3 * iqr, q3 + 3 * iqr
    print("upper_outer_fence:", outer_high)
    print("lower_outer_fence:", outer_low)

    def share_outside(low, high):
        # Percentage of non-null values lying strictly outside [low, high].
        n_outside = (column > high).sum() + (column < low).sum()
        return 100 * n_outside / n_valid

    print("percentage of records out of inner fences: %.2f"% (share_outside(inner_low, inner_high)))
    print("percentage of records out of outer fences: %.2f"% (share_outside(outer_low, outer_high)))

    # Pick the bounds to filter on; None means "leave the frame untouched".
    if qu_fence == "inner":
        limits = (inner_low, inner_high)
    elif qu_fence == "outer":
        limits = (outer_low, outer_high)
    else:
        limits = None

    if limits is None:
        output_dataset = qu_dataset
    else:
        low, high = limits
        output_dataset = qu_dataset[(column <= high) & (column >= low)]

    print("length of input dataframe:", len(qu_dataset))
    print("length of new dataframe after outlier removal:", len(output_dataset))

    return output_dataset
# Drop rows containing missing values (in place) before computing the fences.
df.dropna(inplace=True)
# Keep only the rows whose mean_area lies within the inner (1.5 * IQR) fences.
new_df = remove_outliers_using_quantiles(df, "mean_area", "inner")

In [None]:
# Inspect the mean_area column of the filtered frame.
new_df['mean_area']

In [None]:
# Box plot of mean_area after filtering, to check the effect of the outlier removal.
plt.figure(figsize=(15,12))
sns.boxplot(x='mean_area', data = new_df, orient="h")

In [None]:
# Preview the first rows of the filtered frame.
new_df.head()

In [None]:
# Row count and dtypes of the filtered frame, for comparison with the original.
new_df.info()

As we can see, removing the outliers has reduced the size of our dataset, which is not ideal because we already have little data. We will train a model on both datasets and compare the results.

## Correlation

In [None]:
# Pairwise correlation matrix of the columns of the unfiltered frame.
corrmat = df.corr()
f, ax = plt.subplots(figsize =(12, 12))
# Annotated heatmap: each cell shows the correlation coefficient of a column pair.
sns.heatmap(corrmat, ax = ax, annot=True, cmap ="YlGnBu", linewidths = 0.1)

From the heatmap we can clearly see that many features are strongly correlated with each other, such as *mean_radius* and *mean_perimeter*.

## Model Training

Now that we have visualized the data, it's time to train the model.

In [None]:
# Predictor columns and target used for modelling on the unfiltered data.
features = [
    'mean_radius',
    'mean_texture',
    'mean_perimeter',
    'mean_area',
    'mean_smoothness',
]
X = df[features]
y = df['diagnosis']

### Train Test Split
First, we will split our dataset into training and validation data. Since we have two datasets (with and without outliers), we will apply this technique to both of them.


In [None]:
from sklearn.model_selection import train_test_split
# Hold out a validation set using the default split proportion; random_state
# fixes the shuffle so the split is the same on every run.
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=10)

In [None]:
# (rows, columns) of the training features.
X_train.shape

##  Algorithm For Training Model

In [None]:
# Import the random-forest classifier from scikit-learn.
from sklearn.ensemble import RandomForestClassifier
# NOTE(review): `rf_random` is rebound to a RandomizedSearchCV object in a
# later cell, so this instance is never fitted.
rf_random = RandomForestClassifier()

In [None]:
## Hyperparameters
# Candidate forest sizes: 100, 200, ..., 1200 trees.
n_estimators = list(range(100, 1201, 100))
print(n_estimators)

In [None]:
# Candidate values for each random-forest hyper-parameter.
# Number of trees: 100, 200, ..., 1200.
n_estimators = [int(x) for x in np.linspace(start=100, stop=1200, num=12)]
# Features considered at each split. 'auto' was an alias of 'sqrt' for
# classifiers and is rejected by scikit-learn >= 1.3, so it is replaced by
# 'log2' to keep two valid options.
max_features = ['sqrt', 'log2']
# Maximum tree depth: 8 values spread evenly between 5 and 30.
max_depth = [int(x) for x in np.linspace(5, 30, num=8)]
# Minimum samples required to split an internal node.
min_samples_split = [2, 5, 10, 15, 100]
# Minimum samples required at a leaf node.
min_samples_leaf = [1, 2, 5, 10]

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Search space for the randomized search: one list of candidate values per
# hyper-parameter name.
random_grid = dict(
    n_estimators=n_estimators,
    max_features=max_features,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
)
print(random_grid)

In [None]:
# Base estimator for the hyper-parameter search. Seeding the forest makes the
# cross-validation scores and the final accuracy reproducible across runs.
rf = RandomForestClassifier(random_state=42)

In [None]:
# Randomized hyper-parameter search: 10 sampled configurations, each scored
# with 5-fold cross-validation. Accuracy is the ranking metric because this is
# a classifier; a regression score such as 'neg_mean_squared_error' only works
# when the class labels happen to be numeric and yields a hard-to-read
# best_score_.
rf_random = RandomizedSearchCV(estimator=rf,
                               param_distributions=random_grid,
                               scoring='accuracy',
                               n_iter=10, cv=5, verbose=2,
                               random_state=42, n_jobs=1)

In [None]:
# Run the search on the training split.
rf_random.fit(X_train, y_train)

In [None]:
# Predict the validation labels with the fitted search object.
prediction = rf_random.predict(X_val)

In [None]:
# Display the predicted labels.
prediction

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score

In [None]:
# Evaluate on the validation split: confusion matrix first, then overall accuracy.
cm = confusion_matrix(y_val, prediction)
acc = accuracy_score(y_val, prediction)
print(cm, acc, sep="\n")

Using the Random Forest Classifier, our model reaches 90% accuracy, which is pretty good. Now let's try this on the dataset that does not have outliers.

### Model without Outliers

In [None]:
# Target and predictor columns taken from the outlier-filtered frame.
new_y = new_df['diagnosis']
new_X = new_df[features]

In [None]:
from sklearn.model_selection import train_test_split
# Same split settings (default proportion, random_state=10) applied to the filtered data.
new_X_train, new_X_val, new_y_train, new_y_val = train_test_split(new_X, new_y, random_state=10)

In [None]:
# (rows, columns) of the filtered training features.
new_X_train.shape

In [None]:
# Import the random-forest classifier from scikit-learn.
from sklearn.ensemble import RandomForestClassifier
# NOTE(review): `rf_random_new` is rebound to a RandomizedSearchCV object in a
# later cell, so this instance is never fitted.
rf_random_new = RandomForestClassifier()

In [None]:
# Candidate values for each random-forest hyper-parameter (filtered-data run).
# Number of trees: 100, 200, ..., 1200.
n_estimators = [int(x) for x in np.linspace(start=100, stop=1200, num=12)]
# Features considered at each split. 'auto' was an alias of 'sqrt' for
# classifiers and is rejected by scikit-learn >= 1.3, so it is replaced by
# 'log2' to keep two valid options.
max_features = ['sqrt', 'log2']
# Maximum tree depth: 8 values spread evenly between 5 and 30.
max_depth = [int(x) for x in np.linspace(5, 30, num=8)]
# Minimum samples required to split an internal node.
min_samples_split = [2, 5, 10, 15, 100]
# Minimum samples required at a leaf node.
min_samples_leaf = [1, 2, 5, 10]

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Search space for the randomized search on the filtered data: one list of
# candidate values per hyper-parameter name.
random_grid = dict(
    n_estimators=n_estimators,
    max_features=max_features,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
)
print(random_grid)

In [None]:
# Base estimator for the search on the filtered data. Seeding the forest makes
# the cross-validation scores and the final accuracy reproducible across runs.
rf_new = RandomForestClassifier(random_state=42)

In [None]:
# Randomized hyper-parameter search on the filtered data: 10 sampled
# configurations, each scored with 5-fold cross-validation. Accuracy is the
# ranking metric because this is a classifier; a regression score such as
# 'neg_mean_squared_error' only works when the class labels happen to be
# numeric and yields a hard-to-read best_score_.
rf_random_new = RandomizedSearchCV(estimator=rf_new,
                                   param_distributions=random_grid,
                                   scoring='accuracy',
                                   n_iter=10, cv=5, verbose=2,
                                   random_state=42, n_jobs=1)

In [None]:
# Run the search on the filtered training split.
rf_random_new.fit(new_X_train, new_y_train)

In [None]:
# Predict the filtered validation labels with the fitted search object.
new_predictions = rf_random_new.predict(new_X_val)

In [None]:
# Display the predicted labels.
new_predictions

In [None]:
# Evaluate the filtered-data model: confusion matrix first, then overall accuracy.
new_cm = confusion_matrix(new_y_val, new_predictions)
new_acc = accuracy_score(new_y_val, new_predictions)
print(new_cm, new_acc, sep="\n")

Using the dataset without outliers we get 91% accuracy, which is higher than the accuracy with outliers. So now I will save the model and deploy it.

### Saving Model

In [None]:
import pickle

In [None]:
# Persist the fitted search object to disk. The context manager guarantees
# that the file handle is flushed and closed even if pickling fails.
# NOTE(review): this saves `rf_random` (trained on the unfiltered data), not
# `rf_random_new` (trained after outlier removal) — confirm which model is
# meant to be deployed.
with open('random_forest_model.pkl', 'wb') as file:
    pickle.dump(rf_random, file)