In [1]:
import pandas as pd
import numpy as np
# 1: Read Dataset from UCI machine learning repository

data = pd.read_csv("E:\PROJECTS\Glioma Grading Clinical and Mutation Features Dataset\glioma+grading+clinical+and+mutation+features+dataset\TCGA_InfoWithGrade.csv")

data.head()

Unnamed: 0,Grade,Gender,Age_at_diagnosis,Race,IDH1,TP53,ATRX,PTEN,EGFR,CIC,...,FUBP1,RB1,NOTCH1,BCOR,CSMD3,SMARCA4,GRIN2A,IDH2,FAT4,PDGFRA
0,0,0,51.3,0,1,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1,0,0,38.72,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,0,0,35.17,0,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,1,32.78,0,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,0,0,31.51,0,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [2]:

# 2:Apply Data Preprocessing
# Handling missing data
data = data.dropna()


In [3]:
from sklearn.preprocessing import StandardScaler
# Feature scaling
scaler = StandardScaler()
data[['Age_at_diagnosis']] = scaler.fit_transform(data[['Age_at_diagnosis']])
data.head()

Unnamed: 0,Grade,Gender,Age_at_diagnosis,Race,IDH1,TP53,ATRX,PTEN,EGFR,CIC,...,FUBP1,RB1,NOTCH1,BCOR,CSMD3,SMARCA4,GRIN2A,IDH2,FAT4,PDGFRA
0,0,0,0.023233,0,1,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1,0,0,-0.7784,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,0,0,-1.004616,0,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,1,-1.156913,0,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,0,0,-1.237841,0,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
from sklearn.model_selection import train_test_split
# Splitting data into training and testing sets
X = data.drop(['Grade'], axis=1)
y = data['Grade']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
data.head()


Unnamed: 0,Grade,Gender,Age_at_diagnosis,Race,IDH1,TP53,ATRX,PTEN,EGFR,CIC,...,FUBP1,RB1,NOTCH1,BCOR,CSMD3,SMARCA4,GRIN2A,IDH2,FAT4,PDGFRA
0,0,0,0.023233,0,1,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1,0,0,-0.7784,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,0,0,-1.004616,0,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,1,-1.156913,0,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,0,0,-1.237841,0,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
# Scaling the features using standard scaler
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [6]:
# 3: Apply minimum 3 classification models


from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from sklearn.model_selection import GridSearchCV

# Logistic Regression
lr = LogisticRegression(random_state=42)
lr.fit(X_train, y_train)

# Make predictions on the test data
lr_pred = lr.predict(X_test)

# Calculate the accuracy score
accuracy = accuracy_score(y_test, lr_pred)
print('Logistic Regression Accuracy:', accuracy)
lr_f1_score = f1_score(y_test, lr_pred, average='weighted')

lr_precision = precision_score(y_test, lr_pred, average='weighted')
print('Logistic Regression Precision:', lr_precision)
lr_recall = recall_score(y_test, lr_pred, average='weighted')
print('Logistic Regression Recall:', lr_recall)

print('Logistic Regression F1 Score:', lr_f1_score)



Logistic Regression Accuracy: 0.8452380952380952
Logistic Regression Precision: 0.8492519613209268
Logistic Regression Recall: 0.8452380952380952
Logistic Regression F1 Score: 0.8453258411798453


In [7]:
#Hyperparameter Tuning: Tuning Model 1: Logistic Regression
from sklearn.model_selection import GridSearchCV

# Define the parameter grid to search over
param_grid = {'C': [0.1, 1, 10], 'penalty': ['l2']}

# Create the GridSearchCV object
grid_search = GridSearchCV(lr, param_grid=param_grid, cv=5, error_score='raise')

# Fit the GridSearchCV object to the data
grid_search.fit(X_train, y_train)

# Print the best parameters and best score
print('Best Parameters:', grid_search.best_params_)
print('Logistic Regression(tuned):', grid_search.best_score_)


Best Parameters: {'C': 0.1, 'penalty': 'l2'}
Logistic Regression(tuned): 0.8717965726920951


In [8]:
from sklearn.tree import DecisionTreeClassifier

# Decision Tree Classifier
dtc = DecisionTreeClassifier(max_depth=10, random_state=42)
dtc.fit(X_train, y_train)

dt_pred = dtc.predict(X_test)
dt_accuracy = accuracy_score(y_test, dt_pred)
dt_precision = precision_score(y_test, dt_pred, average='weighted')
dt_recall = recall_score(y_test, dt_pred, average='weighted')
dt_f1_score = f1_score(y_test, dt_pred, average='weighted')



print('Decision Tree Accuracy:', dt_accuracy)
print('Decision Tree Precision:', dt_precision)
print('Decision Tree Recall:', dt_recall)
print('Decision Tree F1 Score:', dt_f1_score)


Decision Tree Accuracy: 0.8095238095238095
Decision Tree Precision: 0.8152806279673014
Decision Tree Recall: 0.8095238095238095
Decision Tree F1 Score: 0.8095238095238095


In [9]:
#Tuning Model 2: Decision Tree Classifier

param_grid = {'max_depth': [1, 2, 3, 4, 5],
              'min_samples_split': [2, 3, 4, 5],
              'min_samples_leaf': [1, 2, 3, 4, 5]}

# Create the GridSearchCV object
dtc_cv = GridSearchCV(dtc, param_grid=param_grid, cv=5)

# Fit the GridSearchCV object to the data
dtc_cv.fit(X_train, y_train)

# Print the best hyperparameters found
print("Best Hyperparameters:", dtc_cv.best_params_)

# Get the score of the best estimator on the test set
dtc_score = dtc_cv.best_estimator_.score(X_test, y_test)
print("Decision Tree Classifier Score(tuned):", dtc_score)




Best Hyperparameters: {'max_depth': 2, 'min_samples_leaf': 2, 'min_samples_split': 2}
Decision Tree Classifier Score(tuned): 0.8571428571428571


In [10]:
from sklearn.ensemble import RandomForestClassifier

# Random Forest Classifier
rfc = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)
rfc.fit(X_train, y_train)

rf_pred = rfc.predict(X_test)

rf_accuracy = accuracy_score(y_test, rf_pred)
rf_precision = precision_score(y_test, rf_pred, average='weighted')
rf_recall = recall_score(y_test, rf_pred, average='weighted')
rf_f1_score = f1_score(y_test, rf_pred, average='weighted')

print('Random Forest Accuracy:', rf_accuracy)
print('Random Forest Precision:', rf_precision)
print('Random Forest Recall:', rf_recall)
print('Random Forest F1 Score:', rf_f1_score)


Random Forest Accuracy: 0.8392857142857143
Random Forest Precision: 0.8424408475812333
Random Forest Recall: 0.8392857142857143
Random Forest F1 Score: 0.8394053315105946


In [12]:
#Tuning Model 3: Random Forest Classifier

from sklearn.model_selection import GridSearchCV


# Grid Search for Random Forest Classifier
param_grid = {
    'n_estimators': [10, 50, 100],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10]
}
grid_search = GridSearchCV(rfc, param_grid=param_grid)
grid_search.fit(X_train, y_train)
best_params = grid_search.best_params_
best_score_rfc = grid_search.best_score_

print(f'Best Parameters: {best_params}')
print(f'Random Forest Classifier(tuned): {best_score_rfc}')


Best Parameters: {'max_depth': 10, 'min_samples_split': 10, 'n_estimators': 50}
Random Forest Classifier(tuned): 0.8688225538971807


In [13]:

# 5: Compare the results
results = pd.DataFrame({
    'Model': ['Logistic Regression(Tuned)', 'Decision Tree Classifier(Tuned)', 'Random Forest Classifier(Tuned)'],
    'Score': [grid_search.best_score_, dtc_score,best_score_rfc]
})
results.sort_values(by='Score', ascending=False)

Unnamed: 0,Model,Score
0,Logistic Regression(Tuned),0.868823
2,Random Forest Classifier(Tuned),0.868823
1,Decision Tree Classifier(Tuned),0.857143
