# <p style="background-color:#32936f;font-family:inter;color:#FFF9ED;font-size:150%;text-align:center;border-radius:10px 10px;">Lung Cancer Predictions</p>

# <span style="color:#32936f;"> Importing Libraries. </span>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objs as go
import plotly.express as px
import warnings
warnings.simplefilter("ignore")

In [None]:
data = pd.read_csv("survey lung cancer.csv")
data.head()

Unnamed: 0,GENDER,AGE,SMOKING,YELLOW_FINGERS,ANXIETY,PEER_PRESSURE,CHRONIC DISEASE,FATIGUE,ALLERGY,WHEEZING,ALCOHOL CONSUMING,COUGHING,SHORTNESS OF BREATH,SWALLOWING DIFFICULTY,CHEST PAIN,LUNG_CANCER
0,M,69,1,2,2,1,1,2,1,2,2,2,2,2,2,YES
1,M,74,2,1,1,1,2,2,2,1,1,1,2,2,2,YES
2,F,59,1,1,1,2,1,2,1,2,1,2,2,1,2,NO
3,M,63,2,2,2,1,1,1,1,1,2,1,1,2,2,NO
4,F,63,1,2,1,1,1,1,1,2,1,2,2,1,1,NO


In [None]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 309 entries, 0 to 308
Data columns (total 16 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   GENDER                 309 non-null    object
 1   AGE                    309 non-null    int64 
 2   SMOKING                309 non-null    int64 
 3   YELLOW_FINGERS         309 non-null    int64 
 4   ANXIETY                309 non-null    int64 
 5   PEER_PRESSURE          309 non-null    int64 
 6   CHRONIC DISEASE        309 non-null    int64 
 7   FATIGUE                309 non-null    int64 
 8   ALLERGY                309 non-null    int64 
 9   WHEEZING               309 non-null    int64 
 10  ALCOHOL CONSUMING      309 non-null    int64 
 11  COUGHING               309 non-null    int64 
 12  SHORTNESS OF BREATH    309 non-null    int64 
 13  SWALLOWING DIFFICULTY  309 non-null    int64 
 14  CHEST PAIN             309 non-null    int64 
 15  LUNG_CANCER            

In [None]:
data.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
AGE,309.0,62.673139,8.210301,21.0,57.0,62.0,69.0,87.0
SMOKING,309.0,1.563107,0.496806,1.0,1.0,2.0,2.0,2.0
YELLOW_FINGERS,309.0,1.569579,0.495938,1.0,1.0,2.0,2.0,2.0
ANXIETY,309.0,1.498382,0.500808,1.0,1.0,1.0,2.0,2.0
PEER_PRESSURE,309.0,1.501618,0.500808,1.0,1.0,2.0,2.0,2.0
CHRONIC DISEASE,309.0,1.504854,0.500787,1.0,1.0,2.0,2.0,2.0
FATIGUE,309.0,1.673139,0.469827,1.0,1.0,2.0,2.0,2.0
ALLERGY,309.0,1.556634,0.497588,1.0,1.0,2.0,2.0,2.0
WHEEZING,309.0,1.556634,0.497588,1.0,1.0,2.0,2.0,2.0
ALCOHOL CONSUMING,309.0,1.556634,0.497588,1.0,1.0,2.0,2.0,2.0


In [None]:
data.isna().sum()

GENDER                   0
AGE                      0
SMOKING                  0
YELLOW_FINGERS           0
ANXIETY                  0
PEER_PRESSURE            0
CHRONIC DISEASE          0
FATIGUE                  0
ALLERGY                  0
WHEEZING                 0
ALCOHOL CONSUMING        0
COUGHING                 0
SHORTNESS OF BREATH      0
SWALLOWING DIFFICULTY    0
CHEST PAIN               0
LUNG_CANCER              0
dtype: int64

In [None]:
data["LUNG_CANCER"].unique()

array(['YES', 'NO'], dtype=object)

In [None]:
data["GENDER"].unique()

array(['M', 'F'], dtype=object)

In [None]:
# Mapping numeric values to non-numeric values

data['GENDER'] = data['GENDER'].map({'F': 0, 'M': 1})

data['LUNG_CANCER'] = data['LUNG_CANCER'].map({'NO': 0, 'YES': 1})

In [None]:
data.dtypes

GENDER                   int64
AGE                      int64
SMOKING                  int64
YELLOW_FINGERS           int64
ANXIETY                  int64
PEER_PRESSURE            int64
CHRONIC DISEASE          int64
FATIGUE                  int64
ALLERGY                  int64
WHEEZING                 int64
ALCOHOL CONSUMING        int64
COUGHING                 int64
SHORTNESS OF BREATH      int64
SWALLOWING DIFFICULTY    int64
CHEST PAIN               int64
LUNG_CANCER              int64
dtype: object

In [None]:
def custom_palette(custom_colors):
    customPalette = sns.set_palette(sns.color_palette(custom_colors))
    sns.palplot(sns.color_palette(custom_colors),size=0.8)
    plt.tick_params(axis='both', labelsize=0, length = 0)

In [None]:
print ('Total Healthy Patients : {} '.format(data.LUNG_CANCER.value_counts()[0]))
print ('Total Suspected Patients : {} '.format(data.LUNG_CANCER.value_counts()[1]))

Total Healthy Patients : 39 
Total Suspected Patients : 270 


In [None]:
x = data['LUNG_CANCER'].value_counts().index.tolist()
y = data['LUNG_CANCER'].value_counts().tolist()

fig = px.bar(x=x, y=y, color=["firebrick", "green"], color_discrete_map="identity",
             labels={
                'x': 'LUNG_CANCER',
                'y': 'count'
                },)
fig.show()

# <span style="color:#32936f;"> Splitting the data into training and test datasets </span>

In [None]:
# X data
X = data.drop("LUNG_CANCER", axis=1)
X.head()

Unnamed: 0,GENDER,AGE,SMOKING,YELLOW_FINGERS,ANXIETY,PEER_PRESSURE,CHRONIC DISEASE,FATIGUE,ALLERGY,WHEEZING,ALCOHOL CONSUMING,COUGHING,SHORTNESS OF BREATH,SWALLOWING DIFFICULTY,CHEST PAIN
0,1,69,1,2,2,1,1,2,1,2,2,2,2,2,2
1,1,74,2,1,1,1,2,2,2,1,1,1,2,2,2
2,0,59,1,1,1,2,1,2,1,2,1,2,2,1,2
3,1,63,2,2,2,1,1,1,1,1,2,1,1,2,2
4,0,63,1,2,1,1,1,1,1,2,1,2,2,1,1


In [None]:
# y data
y = data["LUNG_CANCER"]
y.head()

0    1
1    1
2    0
3    0
4    0
Name: LUNG_CANCER, dtype: int64

In [None]:
# Adding randomized samples to the data as the data is imbalanced

from imblearn.over_sampling import RandomOverSampler

over_samp =  RandomOverSampler(random_state=0)
X_train_res, y_train_res = over_samp.fit_resample(X, y)
X_train_res.shape, y_train_res.shape

((540, 15), (540,))

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_train_res, y_train_res, test_size = 0.2, random_state = 42)

In [None]:
len(X_train), len(X_test)

(432, 108)

In [None]:
# Scaling the data

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
print("X_Train values is: \n",X_train)
print("\n")
print("X_Test values is: \n",X_test)

X_Train values is: 
 [[ 1.07703296  1.05462717 -1.11280815 ...  0.80942721 -0.688769
   1.1446224 ]
 [-0.92847669  0.20190189  0.8986275  ...  0.80942721 -0.688769
   1.1446224 ]
 [ 1.07703296  0.81099138  0.8986275  ... -1.23544154 -0.688769
   1.1446224 ]
 ...
 [-0.92847669  0.93280927  0.8986275  ...  0.80942721 -0.688769
  -0.87365056]
 [-0.92847669  0.56735558 -1.11280815 ...  0.80942721 -0.688769
  -0.87365056]
 [ 1.07703296  0.20190189  0.8986275  ...  0.80942721 -0.688769
   1.1446224 ]]


X_Test values is: 
 [[ 1.07703296 -0.6508234  -1.11280815 ...  0.80942721  1.45186558
   1.1446224 ]
 [-0.92847669 -0.28536971  0.8986275  ...  0.80942721 -0.688769
  -0.87365056]
 [ 1.07703296 -0.8944592   0.8986275  ...  0.80942721 -0.688769
   1.1446224 ]
 ...
 [ 1.07703296 -0.04173391  0.8986275  ...  0.80942721  1.45186558
   1.1446224 ]
 [-0.92847669 -0.4071876  -1.11280815 ...  0.80942721  1.45186558
  -0.87365056]
 [-0.92847669 -0.6508234   0.8986275  ...  0.80942721 -0.688769
  -0.87

# <span style="color:#32936f;"> Linear Regression </span>

In [None]:
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(X_train, y_train)

In [None]:
LinearRegressionScore = lr.score(X_test,y_test)
print("Accuracy obtained by Linear Regression model:",LinearRegressionScore*100)

Accuracy obtained by Linear Regression model: 64.04214644616874


# <span style="color:#32936f;"> Random Forest Classifier </span>

In [None]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators = 100)
rfc.fit(X_train,y_train)

In [None]:
RandomForestClassifierScore = rfc.score(X_test, y_test)
print("Accuracy obtained by Random Forest Classifier model:",RandomForestClassifierScore*100)

Accuracy obtained by Random Forest Classifier model: 99.07407407407408


# <span style="color:#32936f;"> K Neighbors Classifier </span>

In [None]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()
knn.fit(X_train,y_train)

In [None]:
KNeighborsClassifierScore = knn.score(X_test, y_test)
print("Accuracy obtained by K Neighbors Classifier model:",KNeighborsClassifierScore*100)

Accuracy obtained by K Neighbors Classifier model: 93.51851851851852


In [None]:
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier(n_estimators = 100)
rfc.fit(X_train,y_train)

# Generate predictions using the trained model
y_pred_rfc = rfc.predict(X_test)

# Now you can use y_pred_rfc in the classification report
print(metrics.classification_report(y_test, y_pred_rfc))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99        51
           1       1.00      0.98      0.99        57

    accuracy                           0.99       108
   macro avg       0.99      0.99      0.99       108
weighted avg       0.99      0.99      0.99       108



In [None]:
from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier

# Assuming 'knn' is already fitted as in your previous code
y_pred_knn = knn.predict(X_test)  # Generate predictions

print(metrics.classification_report(y_test, y_pred_knn))

              precision    recall  f1-score   support

           0       0.88      1.00      0.94        51
           1       1.00      0.88      0.93        57

    accuracy                           0.94       108
   macro avg       0.94      0.94      0.94       108
weighted avg       0.94      0.94      0.94       108



# <span style="color:#32936f;"> Decision Tree Classifier </span>

In [None]:
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier()
dtc.fit(X_train, y_train)

In [None]:
DecisionTreeClassifierScore = dtc.score(X_test,y_test)
print("Accuracy obtained by Decision Tree Classifier model:",DecisionTreeClassifierScore*100)

Accuracy obtained by Decision Tree Classifier model: 96.29629629629629


In [None]:
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier

# Assuming 'dtc' is already fitted as in your previous code
y_pred_dtc = dtc.predict(X_test)
print(metrics.classification_report(y_test, y_pred_dtc))

              precision    recall  f1-score   support

           0       0.93      1.00      0.96        51
           1       1.00      0.93      0.96        57

    accuracy                           0.96       108
   macro avg       0.96      0.96      0.96       108
weighted avg       0.97      0.96      0.96       108



# <span style="color:#32936f;"> Naive bayes </span>

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

nb = GaussianNB()
nb.fit(X_train, y_train)
y_pred = nb.predict(X_test)

# Calculate the accuracy
nb_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy obtained by Naive Bayes model:", nb_accuracy * 100)


Accuracy obtained by Naive Bayes model: 95.6989247311828


In [None]:
print("Accuracy Obtained By trained Models: \n")
print("Linear Regression Score: ",LinearRegressionScore*100)
print("Random Forest Classifier Score: ",RandomForestClassifierScore*100)
print("K Neighbors Classifier Score: ",KNeighborsClassifierScore*100)
print("Decision Tree Classifier Score: ",DecisionTreeClassifierScore)
print("Naive Bayes Score: ",nb_accuracy*100)

Accuracy Obtained By trained Models: 

Linear Regression Score:  64.04214644616874
Random Forest Classifier Score:  99.07407407407408
K Neighbors Classifier Score:  93.51851851851852
Decision Tree Classifier Score:  0.9629629629629629
Naive Bayes Score:  95.6989247311828


In [None]:
results = f""" Accuracy Obtained By trained Models:

Linear Regression Score:  {64.04214644616876}
Random Forest Classifier Score:  {99.07407407407408}
K Neighbors Classifier Score:  {93.51851851851852}
Decision Tree Classifier Score:  {0.9629629629629629}
Naive Bayes Score:  {95.69892473118287} """

with open('results.txt', 'w') as f:
    f.write(results)

In [None]:
from sklearn.metrics import classification_report, precision_recall_fscore_support
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.metrics import r2_score

# Assuming y_pred_* variables contain the predictions for each model
y_pred_lr = lr.predict(X_test)
y_pred_rfc = rfc.predict(X_test)
y_pred_knn = knn.predict(X_test)
y_pred_dtc = dtc.predict(X_test)

# Calculate evaluation metrics for each model
mse = mean_squared_error(y_test, y_pred_lr)
rmse = mean_squared_error(y_test, y_pred_lr, squared=False)
mae = mean_absolute_error(y_test, y_pred_lr)
r2 = r2_score(y_test, y_pred_lr)

# Print the evaluation metrics for each model
rfc_report = classification_report(y_test, y_pred_rfc)
knn_report = classification_report(y_test, y_pred_knn)
dtc_report = classification_report(y_test, y_pred_dtc)
nb_report = classification_report(y_test, y_pred)


results = f"""
Accuracy Obtained By trained Models:

Linear Regression Score: \nMSE: {mse}, \nRMSE: {rmse}, \nMAE: {mae}, \nR2: {r2}


Random Forest Classifier Score: {RandomForestClassifierScore*100}
\n{rfc_report}

K Neighbors Classifier Score: {KNeighborsClassifierScore*100}
\n{knn_report}

Decision Tree Classifier Score: {DecisionTreeClassifierScore}
\n{dtc_report}

Naive Bayes Score: {nb_accuracy*100}
\n{nb_report}
"""

with open('results.txt', 'w') as f:
    f.write(results)