# Assigment Task
## Perform PCA on Cancer DataSet and get the best models using the following algorithms
### 1. Decision Tree Classifier
### 2. Logistic Regression
### 3. XGBoost Classifier
### 4. SVC

In [82]:
#Importing necessary libraries
import pandas as pd
import numpy as np

In [83]:
#Loading DataSet
df = pd.read_csv('cancer.csv')


In [84]:
df.head()

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,points_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,points_worst,symmetry_worst,dimension_worst
0,87139402,B,12.32,12.39,78.85,464.1,0.1028,0.06981,0.03987,0.037,...,13.5,15.64,86.97,549.1,0.1385,0.1266,0.1242,0.09391,0.2827,0.06771
1,8910251,B,10.6,18.95,69.28,346.4,0.09688,0.1147,0.06387,0.02642,...,11.88,22.94,78.28,424.8,0.1213,0.2515,0.1916,0.07926,0.294,0.07587
2,905520,B,11.04,16.83,70.92,373.2,0.1077,0.07804,0.03046,0.0248,...,12.41,26.44,79.93,471.4,0.1369,0.1482,0.1067,0.07431,0.2998,0.07881
3,868871,B,11.28,13.39,73.0,384.8,0.1164,0.1136,0.04635,0.04796,...,11.92,15.77,76.53,434.0,0.1367,0.1822,0.08669,0.08611,0.2102,0.06784
4,9012568,B,15.19,13.21,97.65,711.8,0.07963,0.06934,0.03393,0.02657,...,16.2,15.73,104.5,819.1,0.1126,0.1737,0.1362,0.08178,0.2487,0.06766


In [85]:
df.describe()

Unnamed: 0,id,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,points_worst,symmetry_worst,dimension_worst
count,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,...,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0
mean,30371830.0,14.127292,19.289649,91.969033,654.889104,0.09636,0.104341,0.088799,0.048919,0.181162,...,16.26919,25.677223,107.261213,880.583128,0.132369,0.254265,0.272188,0.114606,0.290076,0.083946
std,125020600.0,3.524049,4.301036,24.298981,351.914129,0.014064,0.052813,0.07972,0.038803,0.027414,...,4.833242,6.146258,33.602542,569.356993,0.022832,0.157336,0.208624,0.065732,0.061867,0.018061
min,8670.0,6.981,9.71,43.79,143.5,0.05263,0.01938,0.0,0.0,0.106,...,7.93,12.02,50.41,185.2,0.07117,0.02729,0.0,0.0,0.1565,0.05504
25%,869218.0,11.7,16.17,75.17,420.3,0.08637,0.06492,0.02956,0.02031,0.1619,...,13.01,21.08,84.11,515.3,0.1166,0.1472,0.1145,0.06493,0.2504,0.07146
50%,906024.0,13.37,18.84,86.24,551.1,0.09587,0.09263,0.06154,0.0335,0.1792,...,14.97,25.41,97.66,686.5,0.1313,0.2119,0.2267,0.09993,0.2822,0.08004
75%,8813129.0,15.78,21.8,104.1,782.7,0.1053,0.1304,0.1307,0.074,0.1957,...,18.79,29.72,125.4,1084.0,0.146,0.3391,0.3829,0.1614,0.3179,0.09208
max,911320500.0,28.11,39.28,188.5,2501.0,0.1634,0.3454,0.4268,0.2012,0.304,...,36.04,49.54,251.2,4254.0,0.2226,1.058,1.252,0.291,0.6638,0.2075


In [86]:
df.diagnosis.replace(['B','M'],['0','1'] , inplace=True)
df.drop(['id'], axis=1,inplace=True)
len(df.columns)

31

In [87]:
#Check whether teh data is a balanced dataset or not
df.diagnosis.value_counts()


0    357
1    212
Name: diagnosis, dtype: int64

In [88]:
#Seperate data as features and label
features = df.iloc[:,1:31].values
label = df.diagnosis.values


In [89]:
import warnings
warnings.filterwarnings('ignore')

### Let us start with PCA

In [90]:

# Rules/Guidelines:
# 1. Always perform Standardization if you are planning to do PCA (MANDATORY)
# 2. For PCA, n_components should be less than no of features
# 3. n_components can be judged using PN's Technique
#.   PN's Technique says that calc the principal component for n_components = n_features
#.   Count the number of components greater than equal to 75%. The same count will be your
#.   n_components

In [91]:
#Step1: Perform Standardization
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
features = sc.fit_transform(features)
ideal_sc = sc

In [92]:
#Step2: Identify the ideal number of components to work with
#PNs Technique
from sklearn.decomposition import PCA
principalComponents = PCA(n_components=30)  #Here n_components = len(features)
principalComponents.fit(features,label)

PCA(copy=True, iterated_power='auto', n_components=30, random_state=None,
    svd_solver='auto', tol=0.0, whiten=False)

In [93]:
#Lets get the ideal components (0.75 or later; else go with PC=1)
principalComponents.explained_variance_ratio_

array([4.42720256e-01, 1.89711820e-01, 9.39316326e-02, 6.60213492e-02,
       5.49576849e-02, 4.02452204e-02, 2.25073371e-02, 1.58872380e-02,
       1.38964937e-02, 1.16897819e-02, 9.79718988e-03, 8.70537901e-03,
       8.04524987e-03, 5.23365745e-03, 3.13783217e-03, 2.66209337e-03,
       1.97996793e-03, 1.75395945e-03, 1.64925306e-03, 1.03864675e-03,
       9.99096464e-04, 9.14646751e-04, 8.11361259e-04, 6.01833567e-04,
       5.16042379e-04, 2.72587995e-04, 2.30015463e-04, 5.29779290e-05,
       2.49601032e-05, 4.43482743e-06])

In [94]:

#Based on above variance of each components, we conclude to go for n_components = 1

In [95]:
#Step3: Apply PCA with correct number of components
from sklearn.decomposition import PCA
principalComponentsFinal = PCA(n_components=1)  
principalComponentsFinal.fit(features,label)

PCA(copy=True, iterated_power='auto', n_components=1, random_state=None,
    svd_solver='auto', tol=0.0, whiten=False)

In [96]:
principalComponentsFinal.explained_variance_ratio_

array([0.44272026])

In [97]:
#Step4: Transform the feature set
finalFeatures = principalComponentsFinal.transform(features)


## 1. Decision Tree Classifier

In [98]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [99]:
def determine_RS(features, label,model):
    max_val = -10
    max_train = -10
    ideal_model = None
    random_state = 0
    xtrain=xtest=ytrain=ytest = 0
    hit = 0 # This flag is fired when the test score is greater than train score
    #Since there are 50 records lets try iteration over 200
    for i in range(1,51):
        X_train,X_test,y_train,y_test = train_test_split(features,
                                                        label,
                                                        test_size=0.2,
                                                        random_state = i)


        model.fit(X_train,y_train)

        train_score = model.score(X_train,y_train)
        test_score = model.score(X_test,y_test)

        if test_score > train_score:
            hit = 1
            #print ("test %r " % test_score)
            #print ("max val %r " % max_val)
            if test_score > max_val:
                max_val = test_score
                ideal_model = model
                random_state = i
                max_train = train_score
                xtrain,xtest,ytrain,ytest = X_train,X_test,y_train,y_test
            print("Test: {} Train: {} RS: {}".format(test_score,train_score,i))
    print ("Ideal model inside %r " % (ideal_model))
    return [max_val,max_train, random_state, hit, ideal_model,xtrain,xtest,ytrain,ytest]
model = DecisionTreeClassifier(max_depth=4)
max_test_score_DT ,max_train_score_DT, random_state_DT, hit_DT ,ideal_model_DT,X_train_DT,X_test_DT,y_train_DT,y_test_DT= determine_RS(finalFeatures, label, model)

print ("Ideal %r"  %ideal_model_DT)

print ("The random state for the max test score  of %r is %r  " % (max_test_score_DT, random_state_DT))
if hit_DT:
    print ("Since test score is greater than train score this model is good")

Test: 0.9385964912280702 Train: 0.9340659340659341 RS: 9
Test: 0.9473684210526315 Train: 0.9296703296703297 RS: 10
Test: 0.9385964912280702 Train: 0.9318681318681319 RS: 14
Test: 0.9298245614035088 Train: 0.9252747252747253 RS: 21
Test: 0.9298245614035088 Train: 0.9274725274725275 RS: 31
Test: 0.9385964912280702 Train: 0.9274725274725275 RS: 36
Test: 0.956140350877193 Train: 0.9296703296703297 RS: 40
Test: 0.9385964912280702 Train: 0.9274725274725275 RS: 41
Test: 0.9473684210526315 Train: 0.9318681318681319 RS: 43
Test: 0.9298245614035088 Train: 0.9296703296703297 RS: 50
Ideal model inside DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=4,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best') 


In [100]:
print(ideal_model_DT.score(X_train_DT,y_train_DT))
print(ideal_model_DT.score(X_test_DT,y_test_DT))

0.9230769230769231
0.956140350877193


In [101]:

#Creating a dataframe with test data and predicted output value
import math
PredictedProfit_DT = ideal_model_DT.predict(X_test_DT)
predict_dataset_DT = pd.DataFrame({'X_test_DT': list(X_test_DT), 'PredictedProfit_DT': list(PredictedProfit_DT)}, columns=['X_test_DT', 'PredictedProfit_DT'])
predict_dataset_DT

Unnamed: 0,X_test_DT,PredictedProfit_DT
0,[8.630755859096036],1
1,[-0.7091148317236281],0
2,[2.69722375439956],1
3,[-1.0774513298933344],0
4,[-0.09866733268842949],0
...,...,...
109,[0.31443822596145504],0
110,[7.676149471669649],1
111,[-2.57888325196851],0
112,[2.1432985003775493],1


In [102]:
# Print root mean squared error (RMSE) from Linear Regression.
#Creating a dataframe with test data and predicted output value
from sklearn.metrics import mean_squared_error
MSE_DT = mean_squared_error(y_test_DT, PredictedProfit_DT)
print ("Print root mean squared error (RMSE) from DT Model Tree based on all parameters is  %r " % math.sqrt(MSE_DT))
RMSE_DT =  math.sqrt(MSE_DT)

Print root mean squared error (RMSE) from DT Model Tree based on all parameters is  0.20942695414584775 


## 2. Logistic Regression

In [103]:
model = LogisticRegression()
max_test_score_LR ,max_train_score_LR, random_state_LR, hit_LR ,ideal_model_LR,X_train_LR,X_test_LR,y_train_LR,y_test_LR= determine_RS(finalFeatures, label, model)

print ("Ideal %r"  %ideal_model_LR)

print ("The random state for the max test score  of %r is %r  " % (max_test_score_LR, random_state_LR))
if hit_LR:
    print ("Since test score is greater than train score this model is good")

Test: 0.9473684210526315 Train: 0.9076923076923077 RS: 1
Test: 0.9385964912280702 Train: 0.9098901098901099 RS: 6
Test: 0.9298245614035088 Train: 0.9142857142857143 RS: 9
Test: 0.9473684210526315 Train: 0.9032967032967033 RS: 10
Test: 0.9122807017543859 Train: 0.9120879120879121 RS: 11
Test: 0.9122807017543859 Train: 0.9120879120879121 RS: 13
Test: 0.9298245614035088 Train: 0.9120879120879121 RS: 14
Test: 0.9298245614035088 Train: 0.9098901098901099 RS: 18
Test: 0.9473684210526315 Train: 0.9076923076923077 RS: 21
Test: 0.956140350877193 Train: 0.9076923076923077 RS: 22
Test: 0.9210526315789473 Train: 0.9098901098901099 RS: 29
Test: 0.9298245614035088 Train: 0.9120879120879121 RS: 31
Test: 0.9210526315789473 Train: 0.9142857142857143 RS: 33
Test: 0.9385964912280702 Train: 0.9098901098901099 RS: 35
Test: 0.9298245614035088 Train: 0.9076923076923077 RS: 36
Test: 0.9385964912280702 Train: 0.9120879120879121 RS: 40
Test: 0.9385964912280702 Train: 0.9098901098901099 RS: 41
Test: 0.9385964912

In [104]:

#Creating a dataframe with test data and predicted output value
import math
PredictedProfit_LR = ideal_model_LR.predict(X_test_LR)
predict_dataset_LR = pd.DataFrame({'X_test_LR': list(X_test_LR), 'PredictedProfit_LR': list(PredictedProfit_LR)}, columns=['X_test_LR', 'PredictedProfit_LR'])
predict_dataset_LR

Unnamed: 0,X_test_LR,PredictedProfit_LR
0,[5.30714002200119],1
1,[-4.0275423881080705],0
2,[-1.4087712143990578],0
3,[1.4320694267974379],1
4,[-0.6198387404461265],0
...,...,...
109,[-1.9920347221454104],0
110,[-3.2998420114682863],0
111,[-2.1364452772841025],0
112,[-3.6546465282535463],0


In [105]:
# Print root mean squared error (RMSE) from Logistic Regression.
from sklearn.metrics import mean_squared_error
MSE_LR = mean_squared_error(y_test_LR, PredictedProfit_LR)
print ("Print root mean squared error (RMSE) from LR Model Tree based on all parameters is  %r " % math.sqrt(MSE_LR))
RMSE_LR =  math.sqrt(MSE_LR)

Print root mean squared error (RMSE) from LR Model Tree based on all parameters is  0.20942695414584775 


## 3.  XG Boost Classifier

In [106]:
from xgboost import XGBClassifier
modelXG = XGBClassifier(learning_rate=0.01)
max_test_score_XG ,max_train_score_XG, random_state_XG, hit_XG ,ideal_model_XG,X_train_XG,X_test_XG,y_train_XG,y_test_XG= determine_RS(finalFeatures, label, model)

print ("Ideal %r"  %ideal_model_XG)

print ("The random state for the max test score  of %r is %r  " % (max_test_score_XG, random_state_XG))
if hit_XG:
    print ("Since test score is greater than train score this model is good")

ModuleNotFoundError: No module named 'xgboost'

In [107]:

#Creating a dataframe with test data and predicted output value
import math
PredictedProfit_XG = ideal_model_XG.predict(X_test_XG)
predict_dataset_XG = pd.DataFrame({'X_test_XG': list(X_test_XG), 'PredictedProfit_XG': list(PredictedProfit_XG)}, columns=['X_test_XG', 'PredictedProfit_XG'])
predict_dataset_XG


NameError: name 'ideal_model_XG' is not defined

## 4. SVC

In [108]:
from sklearn.svm import SVC
model = SVC()
max_test_score_SVC ,max_train_score_SVC, random_state_SVC, hit_SVC ,ideal_model_SVC,X_train_SVC,X_test_SVC,y_train_SVC,y_test_SVC= determine_RS(finalFeatures, label, model)

print ("Ideal %r"  %ideal_model_SVC)

print ("The random state for the max test score  of %r is %r  " % (max_test_score_SVC, random_state_SVC))
if hit_SVC:
    print ("Since test score is greater than train score this model is good")

Test: 0.9298245614035088 Train: 0.9098901098901099 RS: 6
Test: 0.9210526315789473 Train: 0.9120879120879121 RS: 9
Test: 0.9385964912280702 Train: 0.9120879120879121 RS: 10
Test: 0.9210526315789473 Train: 0.9164835164835164 RS: 11
Test: 0.9298245614035088 Train: 0.9186813186813186 RS: 13
Test: 0.9298245614035088 Train: 0.9142857142857143 RS: 14
Test: 0.9210526315789473 Train: 0.9164835164835164 RS: 16
Test: 0.9210526315789473 Train: 0.9164835164835164 RS: 18
Test: 0.9385964912280702 Train: 0.9120879120879121 RS: 21
Test: 0.9298245614035088 Train: 0.9120879120879121 RS: 22
Test: 0.9210526315789473 Train: 0.9164835164835164 RS: 24
Test: 0.9210526315789473 Train: 0.9164835164835164 RS: 25
Test: 0.9210526315789473 Train: 0.9164835164835164 RS: 29
Test: 0.9298245614035088 Train: 0.9098901098901099 RS: 33
Test: 0.9385964912280702 Train: 0.9164835164835164 RS: 36
Test: 0.9473684210526315 Train: 0.9098901098901099 RS: 40
Test: 0.9210526315789473 Train: 0.9120879120879121 RS: 41
Test: 0.94736842

In [109]:
#Creating a dataframe with test data and predicted output value
import math
PredictedProfit_SVC = ideal_model_SVC.predict(X_test_SVC)
predict_dataset_SVC = pd.DataFrame({'X_test_SVC': list(X_test_SVC), 'PredictedProfit_SVC': list(PredictedProfit_SVC)}, columns=['X_test_SVC', 'PredictedProfit_SVC'])
predict_dataset_SVC

Unnamed: 0,X_test_SVC,PredictedProfit_SVC
0,[8.630755859096036],1
1,[-0.7091148317236281],0
2,[2.69722375439956],1
3,[-1.0774513298933344],0
4,[-0.09866733268842949],0
...,...,...
109,[0.31443822596145504],0
110,[7.676149471669649],1
111,[-2.57888325196851],0
112,[2.1432985003775493],1


In [110]:

# Print root mean squared error (RMSE) from Logistic Regression.
from sklearn.metrics import mean_squared_error
MSE_SVC = mean_squared_error(y_test_SVC, PredictedProfit_SVC)
print ("Print root mean squared error (RMSE) from SVC Model Tree based on all parameters is  %r " % math.sqrt(MSE_SVC))
RMSE_SVC =  math.sqrt(MSE_SVC)

Print root mean squared error (RMSE) from SVC Model Tree based on all parameters is  0.22941573387056177 


### Conclusion: PCA for all three models

In [111]:
data = {'Model':['RS','Test Score ' ,'Train Score' ,' Model is good', 'RMSE'],
        'Decision Tree Classifier':[random_state_DT, max_test_score_DT,max_train_score_DT, 'Yes',RMSE_DT ],
        'Logistic Regression' :[random_state_LR, max_test_score_LR,max_train_score_LR, 'Yes',RMSE_LR],
        'XGBoost':['NA','NA',  'NA','NA', 'NA'],
        'SVC':[random_state_SVC, max_test_score_SVC,max_train_score_SVC, 'Yes',RMSE_SVC]}
 
# Create DataFrame
df = pd.DataFrame(data)

In [112]:
df

Unnamed: 0,Model,Decision Tree Classifier,Logistic Regression,XGBoost,SVC
0,RS,40,22,,40
1,Test Score,0.95614,0.95614,,0.947368
2,Train Score,0.92967,0.907692,,0.90989
3,Model is good,Yes,Yes,,Yes
4,RMSE,0.209427,0.209427,,0.229416
