In [1]:
#Import dependencies

#Data manipulation
import pandas as pd

#Calculating goodman Kruskl Gamma
from scipy.stats import kendalltau

#Logistic Regression
import statsmodels.api as sm
import statsmodels.formula.api as smf

#Machine learning
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

In [2]:
#Reading the data
df=pd.read_csv("C:\\Users\\HP\\Downloads\\data.csv")

In [3]:
#Checking the first 5 rows
df.head()

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,Unnamed: 32
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,


In [4]:
df.drop(['id','Unnamed: 32'], axis=1, inplace=True)

In [5]:
#Replace M's with 1 and B's with 0
df.diagnosis.replace({"M":1,"B":0},inplace=True)
df.diagnosis.unique()

array([1, 0], dtype=int64)

In [6]:
X=df.drop(columns='diagnosis')
Y=df['diagnosis']

In [7]:
#Calculating Goodman Kruskal Gamma
gammas=[]

In [8]:
for col in X.columns:
    tau, pvalue = kendalltau(X[col], Y)
    gamma = 2 * tau / (1 + tau)
    gammas.append(gamma)

In [9]:
print(gammas)

[0.7492820468507292, 0.5482461588914271, 0.7591322867166297, 0.7499939113137851, 0.4662963388686181, 0.6648606098351186, 0.749569514832059, 0.7774113439577374, 0.42757919323590776, -0.043262238448845344, 0.6703870189562042, 0.031248080809407247, 0.6800793686256391, 0.737151757669114, -0.08911277847895073, 0.47458053227422475, 0.5554202811957164, 0.5709648834318791, -0.16321608813809974, 0.2827811766282231, 0.7835991405068972, 0.5607929954391039, 0.7885239373904993, 0.7828008883115565, 0.5162679822265422, 0.6630636302318272, 0.7316942827224027, 0.7798110407145356, 0.48985960164351844, 0.40582894111366247]


In [10]:
threshold = 0.4
X_filtered = X.loc[:, [gamma >= threshold for gamma in gammas]]

In [11]:
print(X_filtered.columns)

Index(['radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean',
       'smoothness_mean', 'compactness_mean', 'concavity_mean',
       'concave points_mean', 'symmetry_mean', 'radius_se', 'perimeter_se',
       'area_se', 'compactness_se', 'concavity_se', 'concave points_se',
       'radius_worst', 'texture_worst', 'perimeter_worst', 'area_worst',
       'smoothness_worst', 'compactness_worst', 'concavity_worst',
       'concave points_worst', 'symmetry_worst', 'fractal_dimension_worst'],
      dtype='object')


In [12]:
X_filtered.head()

Unnamed: 0,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,radius_se,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,1.095,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.5435,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.7456,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.4956,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.7572,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [13]:
#dropping the irrelevant columns such as perimeter and area will depend on radius , the worst factors are a subsets of mean factors so we might drop them as well

In [14]:
col1=['perimeter_mean', 'area_mean', 'perimeter_se',
       'area_se','concave points_mean', 'radius_worst', 'texture_worst', 'perimeter_worst', 'area_worst',
    'smoothness_worst', 'compactness_worst', 'concavity_worst',
       'concave points_worst', 'symmetry_worst', 'fractal_dimension_worst', 'concave points_se']

In [15]:
X1=X_filtered.drop(col1, axis=1)

In [16]:
X1

Unnamed: 0,radius_mean,texture_mean,smoothness_mean,compactness_mean,concavity_mean,symmetry_mean,radius_se,compactness_se,concavity_se
0,17.99,10.38,0.11840,0.27760,0.30010,0.2419,1.0950,0.04904,0.05373
1,20.57,17.77,0.08474,0.07864,0.08690,0.1812,0.5435,0.01308,0.01860
2,19.69,21.25,0.10960,0.15990,0.19740,0.2069,0.7456,0.04006,0.03832
3,11.42,20.38,0.14250,0.28390,0.24140,0.2597,0.4956,0.07458,0.05661
4,20.29,14.34,0.10030,0.13280,0.19800,0.1809,0.7572,0.02461,0.05688
...,...,...,...,...,...,...,...,...,...
564,21.56,22.39,0.11100,0.11590,0.24390,0.1726,1.1760,0.02891,0.05198
565,20.13,28.25,0.09780,0.10340,0.14400,0.1752,0.7655,0.02423,0.03950
566,16.60,28.08,0.08455,0.10230,0.09251,0.1590,0.4564,0.03731,0.04730
567,20.60,29.33,0.11780,0.27700,0.35140,0.2397,0.7260,0.06158,0.07117


In [17]:
X1['diagnosis']=Y

In [18]:
X1.head()

Unnamed: 0,radius_mean,texture_mean,smoothness_mean,compactness_mean,concavity_mean,symmetry_mean,radius_se,compactness_se,concavity_se,diagnosis
0,17.99,10.38,0.1184,0.2776,0.3001,0.2419,1.095,0.04904,0.05373,1
1,20.57,17.77,0.08474,0.07864,0.0869,0.1812,0.5435,0.01308,0.0186,1
2,19.69,21.25,0.1096,0.1599,0.1974,0.2069,0.7456,0.04006,0.03832,1
3,11.42,20.38,0.1425,0.2839,0.2414,0.2597,0.4956,0.07458,0.05661,1
4,20.29,14.34,0.1003,0.1328,0.198,0.1809,0.7572,0.02461,0.05688,1


In [19]:
y=X1['diagnosis']

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X1, y, test_size=0.3)

In [21]:
cols=X1.columns.drop('diagnosis')
formula = 'diagnosis ~ ' + ' + '.join(cols)
print(formula, '\n')

diagnosis ~ radius_mean + texture_mean + smoothness_mean + compactness_mean + concavity_mean + symmetry_mean + radius_se + compactness_se + concavity_se 



In [22]:
model = smf.glm(formula=formula, data=X_train, family=sm.families.Binomial())
logistic_fit=model.fit()

In [23]:
print(logistic_fit.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:              diagnosis   No. Observations:                  398
Model:                            GLM   Df Residuals:                      388
Model Family:                Binomial   Df Model:                            9
Link Function:                  Logit   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -44.688
Date:                Thu, 20 Apr 2023   Deviance:                       89.376
Time:                        13:16:28   Pearson chi2:                     165.
No. Iterations:                     9   Pseudo R-squ. (CS):             0.6619
Covariance Type:            nonrobust                                         
                       coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------
Intercept          -33.4296      7.438  

In [24]:
predictions = logistic_fit.predict(X_test)

In [25]:
print(predictions)

181    1.000000
196    0.997193
472    0.011588
302    1.000000
414    0.968989
         ...   
35     0.998371
197    0.971325
161    0.995585
284    0.000282
255    0.557931
Length: 171, dtype: float64


In [26]:
predictions_binary = [ 0 if x < 0.5 else 1 for x in predictions]

In [27]:
print(predictions_binary)

[1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1]


In [28]:
cm= confusion_matrix(y_test,predictions_binary)

true_negative = cm[0][0]
false_positive = cm[0][1]
false_negative = cm[1][0]
true_positive = cm[1][1]

print('Confusion Matrix: \n', cm, '\n')

print('True Negative:', true_negative)
print('False Positive:', false_positive)
print('False Negative:', false_negative)
print('True Positive:', true_positive)
print('Correct Predictions', 
      round((true_negative + true_positive) / len(predictions_binary) * 100, 1), '%')

Confusion Matrix: 
 [[98  5]
 [ 1 67]] 

True Negative: 98
False Positive: 5
False Negative: 1
True Positive: 67
Correct Predictions 96.5 %
