# Linear Classifiers - Random Forest Algorithm and its application

<img src='Data/6.PNG'>

## Dataset

<img src='Data/7.PNG'>

## AIM

---

#### Imports

In [55]:
import pandas as pd
import numpy as np

import sklearn
from sklearn import cross_validation, metrics, model_selection
from sklearn.cross_validation import train_test_split, cross_val_predict

from sklearn.ensemble import RandomForestClassifier

#### Loading Data

- Dateset = Breast cancer wisconsin data from UCI

In [56]:
data = pd.read_csv('Data\data3_breast-cancer-wisconsin.data.csv')
data.head()

Unnamed: 0,CodeNumber,ClumpThickness,UniformityCellSize,UniformityCellShape,MarginalAdhesion,SingleEpithelialCellSize,BareNuclei,BlandChromatin,NormalNucleoli,Mitoses,CancerType
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


In [57]:
data.shape

(699, 11)

In [58]:
data.describe()

Unnamed: 0,CodeNumber,ClumpThickness,UniformityCellSize,UniformityCellShape,MarginalAdhesion,SingleEpithelialCellSize,BlandChromatin,NormalNucleoli,Mitoses,CancerType
count,699.0,699.0,699.0,699.0,699.0,699.0,699.0,699.0,699.0,699.0
mean,1071704.0,4.41774,3.134478,3.207439,2.806867,3.216023,3.437768,2.866953,1.589413,2.689557
std,617095.7,2.815741,3.051459,2.971913,2.855379,2.2143,2.438364,3.053634,1.715078,0.951273
min,61634.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0
25%,870688.5,2.0,1.0,1.0,1.0,2.0,2.0,1.0,1.0,2.0
50%,1171710.0,4.0,1.0,1.0,1.0,2.0,3.0,1.0,1.0,2.0
75%,1238298.0,6.0,5.0,5.0,4.0,4.0,5.0,4.0,1.0,4.0
max,13454350.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,4.0


   **Inferences:**
    
    - BareNuclie column is missing in this, that means it has some missing values. Identify them.
    - Identified: All missing values in BareNuclie column are replaced by '?'. Remove those rows from dataset
    - Mean of CancerType is 2.6 that means more 2's are there than 4's

In [59]:
data = data[data.BareNuclei != '?']
data.shape

(683, 11)

In [60]:
X = data.iloc[:, 1:10]
Y = data.CancerType

#### Train Test Splitting

In [61]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, train_size = 0.70)

#### Fitting the model

In [62]:
model = RandomForestClassifier()
model.fit(X_train, Y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

#### Predictions

In [63]:
Y_hat = model.predict(X_test)

#### Final Combined Dataset

In [64]:
final_data = X_test.copy()
final_data['Y'] = Y_test
final_data['Y_Hat'] = Y_hat
final_data.head()

Unnamed: 0,ClumpThickness,UniformityCellSize,UniformityCellShape,MarginalAdhesion,SingleEpithelialCellSize,BareNuclei,BlandChromatin,NormalNucleoli,Mitoses,Y,Y_Hat
460,5,1,1,3,2,1,1,1,1,2,2
176,2,1,1,1,2,1,3,1,1,2,2
274,3,1,1,1,2,1,3,2,1,2,2
43,5,6,5,6,10,1,3,1,1,4,4
0,5,1,1,1,2,1,3,1,1,2,2


#### Accuracy

In [65]:
metrics.accuracy_score(Y_test, Y_hat)*100

96.58536585365853

## Using Cross Validation instead of Train-Test Splitting

In [124]:
model = RandomForestClassifier()

K = model_selection.KFold(random_state=65, n_splits=16)
Y_hat_cv = model_selection.cross_val_predict(model, X, Y, cv=K)

In [125]:
metrics.accuracy_score(Y,Y_hat_cv)*100

96.778916544655942