<a href="https://colab.research.google.com/github/reyjm111/Cancer-Detection/blob/main/Lung_Cancer_Detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Lung Cancer Detection


### Import Data

In [50]:
import pandas as pd
data = pd.read_csv("https://raw.githubusercontent.com/reyjm111/Cancer-Detection/main/survey%20lung%20cancer.csv")

### Turn labels into binary numbers

In [51]:
data['LUNG_CANCER'] = data['LUNG_CANCER'].replace('YES', 2)
data['LUNG_CANCER'] = data['LUNG_CANCER'].replace('NO', 1)

### Drop unnecessary data columns and split data

In [52]:
from sklearn.model_selection import train_test_split

X = data.drop(['PEER_PRESSURE', 'ALCOHOL CONSUMING', 'GENDER', 'AGE', 'LUNG_CANCER'], axis = 1)
y = data['LUNG_CANCER']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=16)

### Train Model

In [53]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(random_state = 16)
model.fit(X_train, y_train)
y_prediction = model.predict(X_test)

### Evaluation

In [54]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

accuracy = accuracy_score(y_test, y_prediction)
precision = precision_score(y_test, y_prediction)
recall = recall_score(y_test, y_prediction)
f1 = f1_score(y_test, y_prediction)
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1: {f1}")


Accuracy: 0.9230769230769231
Precision: 0.75
Recall: 0.6
F1: 0.6666666666666665


In [55]:
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_test, y_prediction)
print(f"Confusion Matrix: {cm}")

Confusion Matrix: [[ 6  4]
 [ 2 66]]


In [56]:
from sklearn.model_selection import cross_val_score

cross_val_accuracy = cross_val_score(model, X, y, cv = 5, scoring = 'accuracy')
print(f"Cross Validation Accuracy: {cross_val_accuracy.mean()}")

Cross Validation Accuracy: 0.9062400846113169


In [58]:
from sklearn.model_selection import GridSearchCV

param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100]}
grid_search = GridSearchCV(LogisticRegression(random_state=16), param_grid, cv=5, scoring='accuracy')
grid_search.fit(X, y)
print("Best Parameters: ", grid_search.best_params_)

Best Parameters:  {'C': 1}


### Feature Selection (Optimizing the Model)

In [None]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(random_state=16)
rfe = RFE(model, n_features_to_select=5)
rfe.fit(X, y)
print(rfe.support_)

### New Prediction with Fewer Features

In [None]:
X_new = X.drop(['SMOKING', 'YELLOW_FINGERS', 'ANXIETY', 'WHEEZING', 'SHORTNESS OF BREATH', 'CHEST PAIN'], axis = 1)
#X_new

In [None]:
from sklearn.model_selection import train_test_split

y = data['LUNG_CANCER']
X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size=0.25, random_state=16)

In [None]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(random_state = 16)
model.fit(X_train, y_train)
y_prediction = model.predict(X_test)

In [None]:
results = (y_test - y_prediction)**2
results.sum()