In [None]:
!pip install imbalanced-learn

In [None]:
!pip install -U keras-tuner

In [None]:
!pip install keras

In [None]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
import catboost as cb
import keras
from keras.models import Sequential
from keras.layers import Dense, LeakyReLU, PReLU, ELU, Dropout, Activation
from tensorflow import keras
from tensorflow.keras import layers
from keras_tuner import RandomSearch
import keras_tuner
from keras.wrappers.scikit_learn import KerasClassifier

# Reading input data into dataframe

In [None]:
df= pd.read_csv('../input/smoking-related-lung-cancers/lung_cancer.csv')

In [None]:
df

# Understanding the data

# 1- we see that race feature has some missing values
# 2- we see that features i.e. days_to_cancer and stage_of_cancer are imbalanced

In [None]:
df.info()

# Creating a custom dataframe to see the relationship of target features with each other

In [None]:
# Keep only the rows where both raw target columns are populated.
has_days = df['days_to_cancer'].notna()
has_stage = df['stage_of_cancer'].notna()
df_sub_no_na = df[has_days & has_stage]

# We can see that both target features contain non-null values in the corresponding rows

In [None]:
df_sub_no_na

# Preprocessing Categorical features

In [None]:
df['gender'].unique()

In [None]:
df['race'].unique()

In [None]:
df['race'].value_counts()

In [None]:
df['smoker'].unique()

# As all the categorical features contain nominal data, One Hot Encoding has been applied

In [None]:
# One indicator frame per nominal column; drop_first=True omits the first
# level of each column so the remaining indicators are not redundant.
gender, race, smoker = (
    pd.get_dummies(df[column], drop_first=True)
    for column in ('gender', 'race', 'smoker')
)

In [None]:
df = pd.concat([df, gender, race, smoker], axis=1)

In [None]:
# The original string columns are replaced by their indicator columns.
df = df.drop(columns=['gender', 'race', 'smoker'])

In [None]:
df

# Resetting the index of the resultant dataframe after applying One Hot Encoding

In [None]:
# Renumber the rows 0..n-1 and discard the previous index.
df = df.reset_index(drop=True)

In [None]:
df

# Dropping the pid feature as it is not needed

In [None]:
# 'pid' is dropped so that it is not passed to the models as a feature.
df = df.drop(columns='pid')

In [None]:
df

# Combining the two features days_to_cancer and stage_of_cancer to make a new target feature 'cancer' containing only binary values. 1 represents a person who has cancer and 0 who does not

In [None]:
# Accumulators for the derived target (cancer_list) and the NaN-free
# days-to-cancer values (days_cancer).
cancer_list, days_cancer = [], []

In [None]:
# Vectorised construction of the two derived columns.
#   cancer_list : 1 where stage_of_cancer is present, 0 where it is missing
#   days_cancer : days_to_cancer with missing values replaced by 0
# Plain assignment (instead of appending inside a row loop) keeps this cell
# idempotent: re-running it no longer doubles the length of the lists.
cancer_list = df['stage_of_cancer'].notna().astype(int).tolist()
days_cancer = df['days_to_cancer'].fillna(0).tolist()

In [None]:
df['cancer'] = cancer_list
df['days_cancer'] = days_cancer

In [None]:
df.info()

In [None]:
df

In [None]:
df.describe(include='all')

# Dropping the days_to_cancer and stage_of_cancer feature as they are no longer needed

In [None]:
# Both raw columns are now represented by 'cancer' and 'days_cancer'.
df = df.drop(columns=['days_to_cancer', 'stage_of_cancer'])

In [None]:
df

In [None]:
df.info()

# Preprocessing Numerical Features i.e. age

# Plotting a histogram to check the distribution of data

In [None]:
df.hist(['age', 'days_cancer'])

# We can conclude that the age and days_cancer features do not follow a normal distribution

# We can now normalize the column age and days_cancer

# Feature Scaling

# Applying Normalization to age and days_cancer

In [None]:
# Rescale the two numeric columns with a min-max scaler.
# The scaler is fitted on every row of df, i.e. before any train/test split.
numeric_columns = ['age', 'days_cancer']
min_max_scaler = MinMaxScaler()
min_max_scaled_data = min_max_scaler.fit_transform(df[numeric_columns])

# Replacing the old values in the dataframe

In [None]:
# Wrap the scaled array in a frame with flat column labels (a nested list
# would create MultiIndex columns) and with df's own index, so that writing
# the values back into df aligns row by row.
df_normalized = pd.DataFrame(
    min_max_scaled_data,
    columns=['age', 'days_cancer'],
    index=df.index,
)

In [None]:
# Overwrite the raw 'age' and 'days_cancer' columns with their scaled values.
df[['age', 'days_cancer']] = df_normalized[['age', 'days_cancer']]

In [None]:
df

In [None]:
df['days_cancer'].unique()

In [None]:
df.isnull().sum()

# Splitting the data into training and testing data

In [None]:
# Hold out 30% of the rows. A fixed random_state makes the split reproducible
# and stratify keeps the class ratio of 'cancer' the same in both parts,
# matching the other splits in this notebook.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop('cancer', axis=1),
    df['cancer'],
    test_size=0.3,
    random_state=15,
    stratify=df['cancer'],
)

In [None]:
X_train.shape, X_test.shape

In [None]:
y_train.shape

# Under Sampling

In [None]:
from collections import Counter
from imblearn.under_sampling import NearMiss

# sampling_strategy must be passed by keyword: current imbalanced-learn
# releases declare the sampler arguments keyword-only.
# 0.8 = desired minority/majority ratio after under-sampling.
near_miss = NearMiss(sampling_strategy=0.8)

# Only the training part is resampled; X_test / y_test stay untouched.
X_train_nm, y_train_nm = near_miss.fit_resample(X_train, y_train)

print('Before Sampling {}'.format(Counter(y_train)))
print('After Sampling {}'.format(Counter(y_train_nm)))

In [None]:
# Random forest trained on the NearMiss-resampled training data and
# evaluated on the untouched test split.
rand_for_classifier = RandomForestClassifier()
rand_for_classifier.fit(X_train_nm, y_train_nm)

y_pred_rf_clf = rand_for_classifier.predict(X_test)

# scikit-learn metrics take (y_true, y_pred); the reverse order transposes
# the confusion matrix and swaps precision with recall in the report.
print(confusion_matrix(y_test, y_pred_rf_clf))
print(classification_report(y_test, y_pred_rf_clf))

# We can see that the data is unbalanced for the target feature

In [None]:
df['cancer'].value_counts()

In [None]:
# Look the counts up by class label. value_counts() is sorted by frequency,
# so tuple-unpacking it only works while class 0 happens to be the larger one.
class_counts = df['cancer'].value_counts()
count_class_0 = int(class_counts.loc[0])
count_class_1 = int(class_counts.loc[1])
count_class_0, count_class_1

# Separating the data for 0 and 1 classes

In [None]:
# One sub-frame per value of the binary target.
df_class_0, df_class_1 = (df[df['cancer'] == label] for label in (0, 1))

In [None]:
df_class_0.shape, df_class_1.shape

# Creating a dataframe with equal number of target column values

In [None]:
# Randomly keep as many class-0 rows as there are class-1 rows.
# The sample size comes from df_class_1 itself and the draw is seeded,
# so the balanced frame is reproducible.
df_class_0_under = df_class_0.sample(n=len(df_class_1), random_state=15)

df_test_under = pd.concat([df_class_0_under, df_class_1], axis=0)

df_test_under.shape

In [None]:
df_test_under['cancer'].value_counts()

In [None]:
# Target vector and feature matrix of the under-sampled frame.
y = df_test_under['cancer']
X = df_test_under.drop(columns='cancer')

# Separating the train and test features

In [None]:
# stratify=y keeps the class proportions of y identical in the train and test parts
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=15,
    stratify=y,
)

In [None]:
# both the classes are balanced because we set stratify = y
y_train.value_counts()

# Model Training

# Random Forest Classifier

In [None]:
rand_forest = RandomForestClassifier()

model_randforest = rand_forest.fit(X_train, y_train)

y_pred = model_randforest.predict(X_test)

# F1 Score on Random Forest Classifier

In [None]:
f1_score(y_test, y_pred)

# Logistic Regression

In [None]:
logisticRegr = LogisticRegression(max_iter=2000)

model = logisticRegr.fit(X_train, y_train)

y_pred_lr = model.predict(X_test)

# F1 Score on Logistic Regression

In [None]:
f1_score(y_test, y_pred_lr)

# Decision Tree Classifier

In [None]:
d_tree= DecisionTreeClassifier(max_depth =3, random_state = 42)

model_tree = d_tree.fit(X_train, y_train)

In [None]:
y_pred_dtree = model_tree.predict(X_test)

# F1 Score on Decision Tree Classifier

In [None]:
f1_score(y_test, y_pred_dtree)

# Cat Boost Classifier

In [None]:
cat_boost = cb.CatBoostClassifier(loss_function='CrossEntropy', iterations=5000, od_wait=100, od_type='Iter', depth=3)

model_cb = cat_boost.fit(X_train, y_train)

In [None]:
y_pred_cb = model_cb.predict(X_test)

# F1 Score on Cat Boost Classifier

In [None]:
# (y_true, y_pred) order, as in the scikit-learn signature and the cells above
f1_score(y_test, y_pred_cb)

# Over Sampling

In [None]:
# Fresh 70/30 split of the full data for the over-sampling experiment;
# seeded and stratified so the result is reproducible and both parts keep
# the original class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop('cancer', axis=1),
    df['cancer'],
    test_size=0.3,
    random_state=15,
    stratify=df['cancer'],
)

In [None]:
from imblearn.over_sampling import RandomOverSampler

# sampling_strategy must be passed by keyword in current imbalanced-learn.
# 0.7 = desired minority/majority ratio after over-sampling.
os = RandomOverSampler(sampling_strategy=0.7)

# Only the training part is resampled; the test split keeps its real distribution.
X_train_os, y_train_os = os.fit_resample(X_train, y_train)

print('Before Sampling {}'.format(Counter(y_train)))
print('After Sampling {}'.format(Counter(y_train_os)))


In [None]:
# Random forest trained on the randomly over-sampled training data and
# evaluated on the untouched test split.
rand_for_classifier = RandomForestClassifier()
rand_for_classifier.fit(X_train_os, y_train_os)

y_pred_rf_clf = rand_for_classifier.predict(X_test)

# scikit-learn metrics take (y_true, y_pred); the reverse order transposes
# the confusion matrix and swaps precision with recall in the report.
print(confusion_matrix(y_test, y_pred_rf_clf))
print(classification_report(y_test, y_pred_rf_clf))

In [None]:
count_class_0, count_class_1

# Creating a dataframe with equal number of target column values

In [None]:
# Draw class-1 rows with replacement until there are as many as class-0 rows.
# The sample size comes from df_class_0 itself and the draw is seeded,
# so the balanced frame is reproducible.
df_class_1_over = df_class_1.sample(n=len(df_class_0), replace=True, random_state=15)

df_test_over = pd.concat([df_class_0, df_class_1_over], axis=0)

df_test_over['cancer'].value_counts()

In [None]:
df_test_over.shape

In [None]:
# Target vector and feature matrix of the over-sampled frame.
y = df_test_over['cancer']
X = df_test_over.drop(columns='cancer')

In [None]:
X.shape, y.shape

# Separating the train and test features

In [None]:
# Split the original rows FIRST, then over-sample only the training part.
# Splitting a frame that already contains rows duplicated with replacement
# would put copies of the same patient in both train and test and inflate
# every score computed on the test set.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop('cancer', axis=1),
    df['cancer'],
    test_size=0.2,
    random_state=15,
    stratify=df['cancer'],  # same class ratio in train and test
)

# Balance the training fold: replicate class-1 rows (with replacement) until
# they match the number of class-0 rows. The test fold is left as it is.
train_rows = X_train.assign(cancer=y_train)
majority_rows = train_rows[train_rows['cancer'] == 0]
minority_rows = train_rows[train_rows['cancer'] == 1]
minority_over = minority_rows.sample(n=len(majority_rows), replace=True, random_state=15)
train_balanced = pd.concat([majority_rows, minority_over], axis=0)

X_train = train_balanced.drop('cancer', axis=1)
y_train = train_balanced['cancer']

# Cat Boost Classifier

In [None]:
cat_boost = cb.CatBoostClassifier(loss_function='CrossEntropy', iterations=5000, od_wait=100, od_type='Iter', depth=3)

model_cb = cat_boost.fit(X_train, y_train)

In [None]:
y_pred_cb = model_cb.predict(X_test)

# F1 Score on Cat Boost Classifier

In [None]:
# (y_true, y_pred) order, as in the scikit-learn signature
f1_score(y_test, y_pred_cb)

# Decision Tree Classifier

In [None]:
d_tree= DecisionTreeClassifier(max_depth =3, random_state = 42)

model_tree = d_tree.fit(X_train, y_train)

y_pred_dtree = model_tree.predict(X_test)

# F1 Score on Decision Tree Classifier

In [None]:
# (y_true, y_pred) order, as in the scikit-learn signature
f1_score(y_test, y_pred_dtree)

# Logistic Regression

In [None]:
logisticRegr = LogisticRegression(max_iter=2000)

model = logisticRegr.fit(X_train, y_train)

y_pred_lr = model.predict(X_test)

# F1 Score on Logistic Regression

In [None]:
# (y_true, y_pred) order, as in the scikit-learn signature
f1_score(y_test, y_pred_lr)

# Random Forest Classifier

In [None]:
rand_forest = RandomForestClassifier()

model_randforest = rand_forest.fit(X_train, y_train)

y_pred = model_randforest.predict(X_test)

# F1 Score on Random Forest Classifier

In [None]:
# (y_true, y_pred) order, as in the scikit-learn signature
f1_score(y_test, y_pred)

# SMOTE

In [None]:
X_train, X_test, y_train, y_test = train_test_split(df.drop('cancer', axis=1), df['cancer'], test_size=0.3, random_state=0)

In [None]:
from imblearn.combine import SMOTETomek

# sampling_strategy must be passed by keyword in current imbalanced-learn.
# 0.75 = desired minority/majority ratio for the SMOTE step.
st = SMOTETomek(sampling_strategy=0.75)

# Only the training part is resampled; the test split keeps its real distribution.
X_train_st, y_train_st = st.fit_resample(X_train, y_train)

print('Before Sampling {}'.format(Counter(y_train)))
print('After Sampling {}'.format(Counter(y_train_st)))

In [None]:
# Random forest trained on the SMOTETomek-resampled training data and
# evaluated on the untouched test split.
rand_for_classifier = RandomForestClassifier()
rand_for_classifier.fit(X_train_st, y_train_st)

y_pred_rf_clf = rand_for_classifier.predict(X_test)

# scikit-learn metrics take (y_true, y_pred); the reverse order transposes
# the confusion matrix and swaps precision with recall in the report.
print(confusion_matrix(y_test, y_pred_rf_clf))
print(classification_report(y_test, y_pred_rf_clf))


In [None]:
# Target vector and feature matrix of the complete (unsampled) frame.
y = df['cancer']
X = df.drop(columns='cancer')

# Creating SMOTE Features for both independent and dependent feature

In [None]:
from imblearn.over_sampling import SMOTE

# sampling_strategy='minority' asks SMOTE to resample only the minority class.
smote = SMOTE(sampling_strategy='minority')

# X and y are the full feature matrix / target taken from df; no train/test
# split has been applied to them, so synthetic rows are generated from every row.
# NOTE(review): no random_state is set, so the synthetic samples differ between runs.
X_smote, y_smote = smote.fit_resample(X,y)

In [None]:
X_smote.shape, y_smote.shape

In [None]:
y_smote.value_counts()

# Splitting the data into training and testing data

In [None]:
# Split the original rows FIRST, then apply SMOTE to the training part only.
# Splitting data that already contains synthetic samples lets points
# interpolated from test rows end up in the training set (and vice versa),
# which inflates every score computed on the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=15,
    stratify=y,  # same class ratio in train and test
)

# Balance the training fold with the SMOTE sampler created above; the test
# fold keeps the real class distribution.
X_train, y_train = smote.fit_resample(X_train, y_train)

In [None]:
'''
def build_model(hp):
    model = keras.Sequential()
    
    for i in range(hp.Int('num_layers', 2, 20)):
        model.add(keras.layers.Dense(units=hp.Int('units_' + str(i),
                                           min_value=32,
                                           max_value=512,
                                           step=32),
                              activation='relu'))
    model.add(keras.layers.Dense(1, activation='linear'))
    model.compile(
        optimizer=keras.optimizers.Adam(
            hp.Choice('learning_rate', [1e-2, 1e-3, 1e-4])),
        loss='binary_crossentropy',
        metrics=['binary_crossentropy'])
    return model'''

In [None]:
'''tuner = keras_tuner.RandomSearch(
    build_model,
    objective='val_loss',
    max_trials=5)'''

In [None]:
#tuner.search(X_train, y_train, epochs=5, validation_data=(X_test, y_test))

In [None]:
#tuner.results_summary()

In [None]:
'''
def create_model(layers, activation):
    
    model = Sequential()
    
    for i, nodes in enumerate(layers):
        if i == 0:
            model.add(Dense(nodes, input_dim=X_train.shape[1]))
            model.add(Activation(activation))
            model.add(Dropout(0.3))
        else:
            model.add(Dense(nodes))
            model.add(Activation(activation))
            model.add(Dropout(0.3))
            
    model.add(Dense(units=1, kernel_initializer='glorot_uniform', activation='sigmoid'))
    
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    
    return model'''

In [None]:
'''model = KerasClassifier(build_fn=create_model, verbose=0)

layers = [[20], [40, 20], [45, 30, 35]]
activation = ['sigmoid', 'relu']
param_grid = dict(layers=layers, activation=activation, batch_size=[128,256], epochs=[30])

grid = GridSearchCV(estimator=model, param_grid=param_grid, cv=5)'''

In [None]:
#grid_result = grid.fit(X_train, y_train)

In [None]:
#y_pred = grid.predict(X_test)
#print(confusion_matrix(y_pred, y_test))

In [None]:
#print(grid_result.best_score_, grid_result.best_params_)

In [None]:
#f1_score(y_pred, y_test)

In [None]:
def ann(X_train, X_test=None, y_train=None, y_test=None):
    """Build and compile a feed-forward binary classifier.

    The network has an input-sized Dense layer, three hidden Dense layers
    (45, 30 and 35 units, ReLU) and a single sigmoid output unit. It is
    compiled with the Adam optimizer, binary cross-entropy loss and the
    accuracy metric. The model is returned untrained.

    Parameters
    ----------
    X_train : array-like with a 2-D ``shape``
        Only ``X_train.shape[1]`` (the number of features) is read; it sets
        both the input dimension and the width of the first layer.
    X_test, y_train, y_test : optional
        Not used by this function. They are kept, now with ``None`` defaults,
        so that existing four-argument calls keep working.

    Returns
    -------
    keras.Sequential
        The compiled model.
    """
    n_features = X_train.shape[1]

    model = keras.Sequential(
            [
                keras.layers.Dense(n_features, input_dim=n_features, activation='relu'),
                keras.layers.Dense(45, activation='relu'),
                keras.layers.Dense(30, activation='relu'),
                keras.layers.Dense(35, activation='relu'),
                # One sigmoid unit: the output is the predicted probability of class 1
                keras.layers.Dense(1, kernel_initializer='glorot_uniform', activation='sigmoid'),
            ])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    return model

In [None]:
model = ann(X_train, X_test, y_train, y_test)

In [None]:
model.fit(X_train, y_train, epochs=30)

In [None]:
y_pred = model.predict(X_test)

In [None]:
# Turn the sigmoid outputs into hard 0/1 labels with a 0.5 threshold and
# flatten the (n, 1) prediction array to 1-D so it has the same shape and
# label type as y_test.
y_pred = (y_pred > 0.5).astype(int).ravel()
# (y_true, y_pred) order, as in the scikit-learn signature
f1_score(y_test, y_pred)

# Random Forest Classifier

In [None]:
rand_forest = RandomForestClassifier(random_state=68, max_samples=0.5)

model_randforest = rand_forest.fit(X_train, y_train)

y_pred = model_randforest.predict(X_test)

In [None]:
y_pred_train = model_randforest.predict(X_train)

# F1 Score on Random Forest Classifier

In [None]:
# Training-set F1; (y_true, y_pred) order, as in the scikit-learn signature
f1_score(y_train, y_pred_train)

In [None]:
# Test-set F1; (y_true, y_pred) order, as in the scikit-learn signature
f1_score(y_test, y_pred)

# Decision Tree Classifier

In [None]:
d_tree= DecisionTreeClassifier()

model_tree = d_tree.fit(X_train, y_train)

y_pred_dtree = model_tree.predict(X_test)

# F1 Score on Decision Tree Classifier

In [None]:
# (y_true, y_pred) order, as in the scikit-learn signature
f1_score(y_test, y_pred_dtree)

# Cat Boost Classifier

In [None]:
cat_boost = cb.CatBoostClassifier(loss_function='CrossEntropy', iterations=5000, od_wait=100, od_type='Iter', depth=3)

model_cb = cat_boost.fit(X_train, y_train)

In [None]:
y_pred_cb = model_cb.predict(X_test)

# F1 Score on Cat Boost Classifier

In [None]:
# (y_true, y_pred) order, as in the scikit-learn signature
f1_score(y_test, y_pred_cb)