In [2]:
import numpy as np
import pandas as pd

# Pipeline for this cell: load the CSV, clean it, encode the target, run a few
# sanity checks, split, standardize, fit a logistic regression and predict on
# the held-out rows. Later cells rely on the names X_train, X_test, y_test,
# y_pred, sc and lg defined here.
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load the dataset
df = pd.read_csv('data.csv')

# Drop the unnecessary column; errors='ignore' keeps the cell runnable when
# the CSV was exported without it.
df = df.drop(columns='Unnamed: 32', errors='ignore')

# Drop rows where 'diagnosis' is missing
df = df.dropna(subset=['diagnosis'])

# Encode the target variable: M = 1 (Malignant), B = 0 (Benign).
# Series.map turns any label that is not in the mapping into NaN, which would
# only surface later as an error inside fit(); fail early with a clear message.
label_map = {'M': 1, 'B': 0}
unexpected_labels = set(df['diagnosis'].unique()) - set(label_map)
if unexpected_labels:
    raise ValueError(f"Unexpected diagnosis labels: {sorted(unexpected_labels)}")
df['diagnosis'] = df['diagnosis'].map(label_map)

# Basic EDA
print(df['diagnosis'].value_counts())
print(df.isnull().sum())
print(df.duplicated().sum())
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() would emit an extra "None" line.
df.info()

# Prepare features and target.
# NOTE(review): 'id' is still part of X (it shows up in the null-count output
# and each scaled row has 31 values), so the model is fitted on an identifier
# column. Removing it changes the feature count to 30 and would invalidate the
# hard-coded 31-value sample used for the manual prediction further down, so
# it is left in place here; consider removing both together.
X = df.drop(columns='diagnosis')
y = df['diagnosis']

# Split data into training and testing sets (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Feature scaling: fit on the training rows only, then apply the same
# transformation to the test rows so no test statistics leak into training.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Train the logistic regression model
lg = LogisticRegression()
lg.fit(X_train, y_train)

# Make predictions
y_pred = lg.predict(X_test)

# Show predictions
print(y_pred)


diagnosis
0    357
1    212
Name: count, dtype: int64
id                         0
diagnosis                  0
radius_mean                0
texture_mean               0
perimeter_mean             0
area_mean                  0
smoothness_mean            0
compactness_mean           0
concavity_mean             0
concave points_mean        0
symmetry_mean              0
fractal_dimension_mean     0
radius_se                  0
texture_se                 0
perimeter_se               0
area_se                    0
smoothness_se              0
compactness_se             0
concavity_se               0
concave points_se          0
symmetry_se                0
fractal_dimension_se       0
radius_worst               0
texture_worst              0
perimeter_worst            0
area_worst                 0
smoothness_worst           0
compactness_worst          0
concavity_worst            0
concave points_worst       0
symmetry_worst             0
fractal_dimension_worst    0
dtype: int64
0
<cl

In [3]:
from sklearn.metrics import accuracy_score

# Fraction of held-out rows whose predicted label equals the true label.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.9736842105263158

In [23]:
# Display one standardized training row (row index 50, every column).
X_train[50, :]

array([-0.23677791,  0.28663239,  2.49335462,  0.19871174,  0.18367474,
       -0.90194811, -1.0983827 , -0.52957767, -0.54946804,  0.14937196,
       -1.36310222,  0.23363719,  0.7840929 ,  0.08929566,  0.11250971,
       -0.05176882, -0.61149611, -0.24707604, -0.44645412,  1.33976882,
       -0.74367977,  0.21325459,  1.87743187,  0.08989387,  0.09595819,
       -0.7256816 , -0.99507862, -0.57319785, -0.74220784,  0.51992499,
       -1.24621394])

In [17]:
# Hand-entered sample with one value per model input (31 values).
# NOTE(review): these look like already-standardized values (same scale as the
# rows of X_train) and are passed to the model without going through `sc`;
# raw measurements would presumably need sc.transform first - confirm.
input_text = [
     6.63966343,  1.9769275 ,  0.31327591,  1.90233452,  2.05786602,
     0.0788037 ,  0.26483378,  0.86043609,  1.76809679, -0.93595811,
    -0.85350634,  0.92729216,  0.29254864,  0.81382428,  0.88628269,
    -0.49847317, -0.26285216, -0.03032279,  0.4399518 , -1.25977636,
    -0.5802499 ,  1.96523962,  1.07854631,  1.83265153,  2.01924985,
     0.22847029,  0.37042607,  0.79030033,  1.74421993, -1.00983488,
    -0.53947146,
]

np_df = np.asarray(input_text)
# predict() expects a 2-D (n_samples, n_features) array, so add a leading axis
# to turn the single sample into a one-row matrix.
prediction = lg.predict(np_df[np.newaxis, :])

# Class 1 is the encoding used for 'M' (malignant) when the target was mapped.
print('Cancer' if prediction[0] == 1 else 'Not Cancer')

Cancer


In [25]:
import pickle

# Persist the fitted classifier and the fitted scaler side by side; the scaler
# is needed to standardize new inputs the same way before calling the model.
# The `with` blocks guarantee each file is flushed and closed even if
# pickling fails, instead of leaving the handles open.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(lg, model_file)

with open('scaler.pkl', 'wb') as scaler_file:
    pickle.dump(sc, scaler_file)